### Title: Email Spam Detection With Machine Learning
 Email Spam Detection using
**Machine Learning.**
<hr>
<strong>OIBSIP Task No : 04</strong><br>
<hr>

### Loading the Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
from sklearn.pipeline import Pipeline

In [25]:
from sklearn.metrics import r2_score

### Loading the Dataset

In [3]:
spam_data = pd.read_csv("spam.csv",encoding='latin')

In [4]:
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# spam_data2 = pd.read_csv("spam.csv", encoding_errors="replace")
# spam_data2.head()

In [6]:
spam_data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
spam_data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [8]:
unused_cols = spam_data.columns[2:].tolist()

In [9]:
spam_data.drop(unused_cols, axis=1, inplace=True)

In [10]:
spam_data['Spam'] = spam_data['v1'].apply(lambda x:1 if x=='spam' else 0)
spam_data.head()

Unnamed: 0,v1,v2,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Renaming the columns from<br>
<br>v1 ==> Category <br>
<br>v2 ==> Message <br>

In [11]:
new_cols = {"v1":"Category", "v2": "Message"}

In [12]:
spam_data = spam_data.rename(columns=new_cols)

In [13]:
spam_data

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


### Creating Train and Test Datasets

In [14]:
X = spam_data.Message
y = spam_data.Spam

In [15]:
X.head(1)

0    Go until jurong point, crazy.. Available only ...
Name: Message, dtype: object

In [16]:
y.head(1)

0    0
Name: Spam, dtype: int64

### Train and Test

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, 
                                                   train_size=0.8, 
                                                   random_state=42)

In [18]:
print(f"""
    X_train : {X_train.shape}
    X_test  : {X_test.shape}
    y_train : {y_train.shape}
    y_test  : {y_test.shape}
    """)


    X_train : (4457,)
    X_test  : (1115,)
    y_train : (4457,)
    y_test  : (1115,)
    


### Creating 'MNB' Model for Prediction

In [19]:
clf = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB())
        ])

In [20]:
model = clf.fit(X_train, y_train)

In [21]:
predictions = model.predict(X_test)

In [22]:
test_sample1 = [' 100% cashback on the first item redeem']
test_sample2 = ["It's holiday today."]

In [23]:
def test(test_sample):
    if model.predict(test_sample)[0] == 1:
        print("The Message is 'Spam'")
    else:
        print("The Message is 'Not Spam'")

In [24]:
test(test_sample1)
test(test_sample2)

The Message is 'Spam'
The Message is 'Not Spam'


### Accuracy

In [26]:
r2 = r2_score(y_true=y_test,
             y_pred=predictions)

In [27]:
print(f"Accuracy : {r2:.2%}")

Accuracy : 86.13%
