# Email Classification

### Check the CSV encoding

In [4]:
with open('spam.csv') as f:
    print(f)

<_io.TextIOWrapper name='spam.csv' mode='r' encoding='cp1252'>


### Import dataset

In [42]:
import pandas as pd
df=pd.read_csv('spam.csv',encoding="cp1252") 
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Check null value

In [11]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

#### Note:- Totla number of samples = 5572 and missing value is very large. SO simply drop it.

In [14]:
df=df.iloc[:,:2]
df=df.set_axis(labels=['label','sms'],axis=1)
sms=df.sms

In [15]:
sms

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

### Preprocessing

### 1. Tokenizaton,stopwords,stemming, Bag of words (tokenization+counting+normalization)

In [16]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [17]:
ps=PorterStemmer()
corpus=[]
for i in sms:
    data=re.sub('[^a-zA-Z]',' ',i)
    data=data.lower()
    data=data.split()
    data=[ps.stem(x) for x in data if not x in stopwords.words('english')]
    data=' '.join(data)
    corpus.append(data)

In [41]:
corpus[-5:]

['nd time tri contact u u pound prize claim easi call p per minut bt nation rate',
 'b go esplanad fr home',
 'piti mood suggest',
 'guy bitch act like interest buy someth els next week gave us free',
 'rofl true name']

### 2. Vectorization

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000)
X=cv.fit_transform(corpus).toarray()
X.shape

(5572, 5000)

In [22]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### 3. Ordinal encoding

In [23]:
y=pd.get_dummies(df.label).iloc[:,1]
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: uint8

### 4. Modeling (Classification Problem)

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [25]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train,y_train)
model.score(X_train,y_train)

0.9910253533767108

### 5. Accuracy test

In [26]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [27]:
pred=model.predict(X_test)
confusion_matrix(pred,y_test)

array([[938,   6],
       [ 11, 160]], dtype=int64)

In [28]:
accuracy_score(pred,y_test)

0.9847533632286996

In [30]:
# also chck this method
model.score(X_test,y_test)

0.9847533632286996

### So finally model train accuracy= 99.10% and model test accuracy= 98.47% .

### Check some other classifier

In [34]:
from sklearn.tree import DecisionTreeClassifier

In [35]:
dc=DecisionTreeClassifier()
dc.fit(X_train,y_train)
pred=dc.predict(X_test)

In [36]:
accuracy_score(pred,y_test)

0.9802690582959641

In [37]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)
pred=lr.predict(X_test)

In [38]:
accuracy_score(pred,y_test)

0.9775784753363229