In [11]:
import numpy as np
import pandas as pd

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
# nltk.download('stopwords')
stop = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import re
from nltk.stem.porter import PorterStemmer

In [2]:
df = pd.read_csv(r'E:\Machine Learning datasets\Email Spam Classification\data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['length'] = df['Message'].apply(lambda x : len(x))

In [4]:
df.head()

Unnamed: 0,Category,Message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [5]:
df.isnull().sum()

Category    0
Message     0
length      0
dtype: int64

In [6]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [12]:
def pre_process(post):
    post = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', post)
    post = re.sub("[^a-zA-Z]", " ", post)
    posts = re.sub('\s+', ' ', post)
    posts = posts.lower()
    posts = posts.split()
    posts = [word for word in posts if not word in set(stopwords.words('english'))]
    ps = PorterStemmer()
    posts = [ps.stem(word) for word in posts]
    posts = ' '.join(posts)
    return posts

In [13]:
df['Message'] = df['Message'].apply(pre_process)

In [14]:
df.head()

Unnamed: 0,Category,Message,length
0,ham,go jurong point crazi avail bugi n great world...,111
1,ham,ok lar joke wif u oni,29
2,spam,free entri wkli comp win fa cup final tkt st m...,155
3,ham,u dun say earli hor u c alreadi say,49
4,ham,nah think goe usf live around though,61


In [15]:
map1 = {"ham": 0, "spam": 1}
df['label'] = df['Category'].map(map1)

In [16]:
df.head()

Unnamed: 0,Category,Message,length,label
0,ham,go jurong point crazi avail bugi n great world...,111,0
1,ham,ok lar joke wif u oni,29,0
2,spam,free entri wkli comp win fa cup final tkt st m...,155,1
3,ham,u dun say earli hor u c alreadi say,49,0
4,ham,nah think goe usf live around though,61,0


In [17]:
df["Message"]=df["Message"].astype('str')

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
features = cv.fit_transform(df['Message'])

In [19]:
features

<5572x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 39295 stored elements in Compressed Sparse Row format>

In [22]:
new_features = features.toarray()

In [23]:
new_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
print(features[2])

  (0, 81)	1
  (0, 314)	1
  (0, 361)	1
  (0, 481)	2
  (0, 516)	2
  (0, 547)	1
  (0, 579)	1
  (0, 1015)	1
  (0, 1349)	1
  (0, 1367)	1
  (0, 1386)	1
  (0, 1629)	1
  (0, 1640)	1
  (0, 1732)	1
  (0, 1766)	1
  (0, 1813)	1
  (0, 1926)	1
  (0, 1938)	1


In [30]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
tfidf=TfidfTransformer().fit(features)
tfidf_trans=tfidf.transform(features)

In [31]:
print(tfidf_trans[0])

  (0, 1947)	0.2955150564860361
  (0, 1890)	0.24351152977620932
  (0, 1277)	0.30100408229021347
  (0, 871)	0.35669425551518014
  (0, 659)	0.24351152977620932
  (0, 650)	0.20644595883879124
  (0, 635)	0.17601131742076365
  (0, 350)	0.3413879962785804
  (0, 287)	0.37241006370826374
  (0, 218)	0.37241006370826374
  (0, 114)	0.32972212355664315


In [32]:
tfidf_trans

<5572x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 39295 stored elements in Compressed Sparse Row format>

In [34]:
X_train,X_test,y_train,y_test=train_test_split(tfidf_trans,df.label,test_size=0.2,random_state=42)

In [35]:
svc=SVC(gamma=0.5,tol=0.51,decision_function_shape='ovo')

In [36]:
svc.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma=0.5, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.51,
    verbose=False)

In [37]:
print("train accuracy: {0} \ntest accuracy: {1}".format(accuracy_score(y_train,svc.predict(X_train)),
                                                       accuracy_score(y_test,svc.predict(X_test))))

train accuracy: 0.991249719542293 
test accuracy: 0.9838565022421525


In [38]:
svc=SVC(gamma=0.5,tol=0.51,decision_function_shape='ovr',random_state=42)
svc.fit(X_train,y_train)
print("train accuracy: {0} \ntest accuracy: {1}".format(accuracy_score(y_train,svc.predict(X_train)),
                                                       accuracy_score(y_test,svc.predict(X_test))))
print("******************************************************")
print(classification_report(y_test,svc.predict(X_test)))
print("******************************************************")
print(confusion_matrix(y_test,svc.predict(X_test)))

train accuracy: 0.991249719542293 
test accuracy: 0.9838565022421525
******************************************************
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.88      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

******************************************************
[[966   0]
 [ 18 131]]


## XGB Classifier

In [39]:
X_train1,X_test1,y_train1,y_test1=train_test_split(features,df.label,test_size=0.2,random_state=42)

In [40]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train1, y_train1)
clf_train=xgb_clf.score(X_train1, y_train1)

In [41]:
clf_train

0.9692618353152345

In [42]:
accuracy_score(y_test1, xgb_clf.predict(X_test1))

0.9695067264573991

In [43]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=5)

In [44]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
print("Train acccuarcy is {0} \nTest accuracy is {1}".format(accuracy_score(y_train,rf_clf.predict(X_train)),
     accuracy_score(y_test,rf_clf.predict(X_test))))

Train acccuarcy is 0.9957370428539376 
Test accuracy is 0.9730941704035875
