### SMS spam filtering

### Load Data

In [7]:
import pandas as pd

In [8]:
# Load the tab-separated message file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
data = pd.read_csv(
    'sms.tsv',
    sep='\t',
    names=['label', 'Messages'],
)

In [9]:
# Preview the first five rows of the raw data.
data.head()

Unnamed: 0,label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# (rows, columns) of the loaded frame.
data.shape

(5572, 2)

In [11]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column a second time would silently turn every label into NaN.
LABEL_CODES = {'ham': 0, 'spam': 1}

if not pd.api.types.is_numeric_dtype(data['label']):
    encoded_label = data['label'].map(LABEL_CODES)
    # Series.map() yields NaN for any value that is not a key of the mapping;
    # fail loudly instead of passing NaN targets on to the models.
    if encoded_label.isna().any():
        unknown = data.loc[encoded_label.isna(), 'label'].unique().tolist()
        raise ValueError('Cannot encode label values: {}'.format(unknown))
    data['label'] = encoded_label

In [12]:
# Confirm that the label column now holds the integer codes.
data.head()

Unnamed: 0,label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Class balance: number of ham (0) and spam (1) messages.
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

### Clean and prepare data

In [14]:
import re

def clean(x):
    """Normalize a raw message to lowercase, space-separated alphabetic words.

    Every run of characters other than ASCII letters (digits, punctuation,
    whitespace, non-ASCII symbols) is collapsed into a single space; the
    result is then trimmed and lowercased.

    Note: markup is not parsed. For input such as ``<b>hi</b>`` only the
    angle brackets and the slash are dropped; the tag names remain in the
    text as ordinary words.

    Parameters
    ----------
    x : str
        Raw message text. ``None`` and float NaN (the pandas missing-value
        marker) are treated as an empty message.

    Returns
    -------
    str
        The cleaned message, or ``''`` when the input contains no letters.
    """
    # Missing values would otherwise make re.sub() raise TypeError.
    if x is None or (isinstance(x, float) and x != x):
        return ''

    # A single raw-string pattern replaces the former two passes
    # ('[^A-Za-z]' followed by '\s+'): after the first pass the only
    # whitespace left was spaces, so collapsing runs of non-letters directly
    # gives the same text. It also avoids the invalid '\s' escape in a
    # non-raw string literal, which newer Python versions warn about.
    s = re.sub(r'[^A-Za-z]+', ' ', x)

    return s.strip().lower()

In [15]:
# Normalize every message with clean(); the raw text in the column is overwritten.
data['Messages'] = data['Messages'].apply(clean)

In [16]:
# Cleaned message texts (features) and integer labels (targets) taken out of the frame.
X, y = data['Messages'].values, data['label'].values

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 25% of the messages for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although spam is the minority class
# (747 of 5572 messages) - consider stratify=y. Left unchanged here so the
# recorded outputs below stay valid.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [19]:
# The NLTK corpus reader is imported under an alias so that the `stopwords`
# list created below does not shadow it. Previously the same name referred to
# the reader before the assignment and to a plain list after it.
from nltk.corpus import stopwords as nltk_stopwords

# English stop-word list (requires the NLTK 'stopwords' corpus to be installed).
stopwords = nltk_stopwords.words('english')

In [20]:
### Remove 'not' from stopword list

In [21]:
# Drop 'not' from the stop-word list so that the vectorizer keeps it as a token.
try:
    stopwords.remove('not')
except ValueError:
    # 'not' is already absent from the list; nothing to do.
    pass

In [22]:
# Sanity check: report whether 'not' is still present in the stop-word list.
print('found' if 'not' in stopwords else 'not found')

not found


### Transform text data into numeric features

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Bag-of-words vectorizer that filters out the customized stop-word list
# (the NLTK English list with 'not' removed).
cv1 = CountVectorizer(stop_words=stopwords)  

In [25]:
# Learn the vocabulary from the training messages only, then reuse it for the
# test messages so that nothing from the test split influences the features.
#
# The count matrices are kept sparse instead of being densified with
# .toarray(): a dense int64 copy of the 4179 x 6489 training matrix needs
# roughly 200 MB although almost every entry is zero, and the estimators
# trained below accept sparse input directly.
cv_train = cv1.fit_transform(xtrain)

cv_test = cv1.transform(xtest)

In [26]:
#cv1.get_feature_names_out()  # inspect the learned vocabulary (get_feature_names() was removed in scikit-learn 1.2)

In [27]:
# (number of training messages, vocabulary size)
cv_train.shape

(4179, 6489)

In [28]:
# (number of test messages, vocabulary size) - same column count as the training matrix
cv_test.shape

(1393, 6489)

In [29]:
#cv1.get_feature_names_out()  # inspect the learned vocabulary (get_feature_names() was removed in scikit-learn 1.2)

### Train model

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix,recall_score,precision_score

import numpy as np

### Naive Bayes

In [31]:
# Fit a multinomial Naive Bayes classifier on the training counts and
# report its accuracy on the held-out test split.
nb = MultinomialNB().fit(cv_train, ytrain)
test_score = nb.score(cv_test, ytest)
test_score

0.9741564967695621

In [32]:
# Naive Bayes predictions (0 = ham, 1 = spam) for the test split.
pred = nb.predict(cv_test)

In [33]:
# Number of ham (index 0) and spam (index 1) messages in the test split.
np.bincount(ytest)

array([1197,  196], dtype=int64)

In [34]:
# Rows are the true class, columns the predicted class (order: ham, spam).
confusion_matrix(ytest,pred)

array([[1182,   15],
       [  21,  175]], dtype=int64)

In [35]:
# Recall for spam (label 1): share of the actual spam messages that were caught.
recall_score(ytest,pred)

0.8928571428571429

In [36]:
# Precision for spam (label 1): share of the messages flagged as spam that really are spam.
precision_score(ytest,pred)

0.9210526315789473

### Logistic Regression

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
# Logistic regression with strong regularization (small C). The spam class
# (label 1) is given a weight of 3; ham keeps the default weight of 1.
log = LogisticRegression(class_weight={1: 3}, C=0.01).fit(cv_train, ytrain)

# Accuracy on the held-out test split.
test_score = log.score(cv_test, ytest)
test_score

0.9540559942569993

In [39]:
# Logistic regression predictions (0 = ham, 1 = spam) for the test split.
log_pred = log.predict(cv_test)

In [40]:
# Rows are the true class, columns the predicted class (order: ham, spam).
confusion_matrix(ytest,log_pred)

array([[1182,   15],
       [  49,  147]], dtype=int64)

In [41]:
# Recall for spam (label 1): share of the actual spam messages that were caught.
recall_score(ytest,log_pred)

0.75

In [42]:
# Precision for spam (label 1): share of the messages flagged as spam that really are spam.
precision_score(ytest,log_pred)

0.9074074074074074

### Try the models on new sample messages

In [43]:
# Two hand-written messages for a quick spot check of the trained models.
test = [
    'Get free tickets..!Win cash',
    'hi john I will call you later',
]

In [44]:
# Apply the same cleaning used for the training data to each sample message.
cleaned_data = [clean(message) for message in test]

In [45]:
# Inspect the cleaned sample messages.
cleaned_data

['get free tickets win cash', 'hi john i will call you later']

In [46]:
# Vectorize the sample messages with the vocabulary already fitted on the training data.
t1 = cv1.transform(cleaned_data)

In [47]:
# (number of sample messages, vocabulary size) - same column count as the training matrix
t1.shape

(2, 6489)

In [48]:
# Naive Bayes predictions for the sample messages (0 = ham, 1 = spam).
nb.predict(t1)

array([1, 0], dtype=int64)

In [49]:
# Logistic regression predictions for the sample messages (0 = ham, 1 = spam).
log.predict(t1)

array([0, 0], dtype=int64)