In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sn
import time
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import coo_matrix, hstack
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, GaussianNB

Load data from text file and keep in data frame

In [None]:
# Load the tab-separated SMS corpus: each line is "<label>\t<message>".
# The context manager guarantees the file handle is closed after reading.
with open("SMSSpamCollection.txt", 'r') as sms_file:
    lines = tuple(sms_file)

# Parse every non-blank line into [label, msg]:
#   * the trailing line terminator is stripped so it does not end up in `msg`
#     (and does not inflate the message-length feature derived later);
#   * the split is limited to the first tab so a tab inside the message body
#     cannot produce a third field.
records = [line.rstrip("\r\n").split("\t", 1) for line in lines if line.strip()]

# Build the frame in one shot instead of growing it row by row with
# df.loc[count], which re-allocates on every insert.
df = pd.DataFrame(records, columns=['label','msg'])
#df.head(5)
count = len(df)

In [None]:
# Report how many records were loaded, then preview the first five rows
# (five is the default for DataFrame.head).
print('Record Count', count)
df.head()

In [None]:
# Numeric encoding of the text label: ham -> 1, spam -> 0.
# Labels outside this mapping become NaN under Series.map.
label_codes = {'ham': 1, 'spam': 0}
df['category'] = df['label'].map(label_codes)

In [None]:
# Class balance: category 0 is spam, category 1 is ham.
is_spam = df['category'] == 0
is_ham = df['category'] == 1
print('Spam Count', is_spam.sum())
print('Ham Count', is_ham.sum())

Feature Engineering on message: 1) Lowercasing 2) Removing punctuation

In [None]:
# Normalise case so that tokens differing only by capitalisation are merged.
lowered_msg = df['msg'].str.lower()
df['msg'] = lowered_msg

In [None]:
# Remove every ASCII punctuation character from the messages.
# A translation table deletes each character in string.punctuation literally.
# The previous '[...]' pattern depended on str.replace interpreting it as a
# regular expression (not the default in recent pandas) and, because the
# character class was not escaped, it never matched a backslash.
punctuation_table = str.maketrans('', '', string.punctuation)
df['msg'] = df['msg'].str.translate(punctuation_table)

In [None]:
# TF-IDF encoding of the cleaned message text.
stop = stopwords.words('english')
# Configure the vectorizer with keywords only. The bare 'english' that used to
# be passed positionally was bound to the `input` parameter (whose valid values
# are 'filename', 'file' and 'content'), not to any language setting; recent
# scikit-learn releases reject it. Stop-word filtering is driven by `stop_words`.
# NOTE(review): punctuation was stripped from the messages earlier, so
# stop words containing apostrophes (e.g. "don't") will not match "dont".
tfidfVectorizer = TfidfVectorizer(stop_words=stop)
# NOTE(review): the vectorizer is fitted on the full dataset, i.e. before the
# train/test split and cross-validation below, so vocabulary and IDF weights
# see the evaluation rows.
X_tfidf = tfidfVectorizer.fit_transform(df['msg'])
# NOTE(review): this densifies the whole sparse matrix into Python lists and
# is memory-hungry; kept only so the 'vect_msg' column remains available.
df['vect_msg'] = X_tfidf.toarray().tolist()

print('TFID matrix shape: ', X_tfidf.shape)

Feature Engineering on message: encoding using BOW (count vectors)

In [None]:
# Bag-of-words (raw term count) encoding of the cleaned message text.
stop = stopwords.words('english')
# Configure the vectorizer with keywords only. The bare 'english' that used to
# be passed positionally was bound to the `input` parameter (whose valid values
# are 'filename', 'file' and 'content'), not to any language setting; recent
# scikit-learn releases reject it. Stop-word filtering is driven by `stop_words`.
bowVectorizer = CountVectorizer(stop_words=stop)
X_bow = bowVectorizer.fit_transform(df['msg'])
# NOTE(review): this densifies the whole sparse matrix into Python lists and
# is memory-hungry; kept only so the 'bow_msg' column remains available.
df['bow_msg'] = X_bow.toarray().tolist()

print('BOW matrix shape: ', X_bow.shape)

Feature Engineering: Creating new feature with text message length

In [None]:
# Length feature: number of characters in each cleaned message
# (len() of the string, not a count of words).
df['len'] = df['msg'].map(len)

Word Length (message character count) distribution visualization for spam and non-spam messages

In [None]:
# Histogram of the message-length feature for spam messages.
# `density` replaces the `normed` keyword, which Matplotlib no longer accepts;
# False keeps raw counts on the y-axis.
spam_lengths = df.loc[df['label'] == 'spam', 'len']
plt.hist(spam_lengths, density=False, bins=50)
plt.ylabel('[spams] word counts');

In [None]:
# Histogram of the message-length feature for ham (non-spam) messages.
# `density` replaces the `normed` keyword, which Matplotlib no longer accepts;
# False keeps raw counts on the y-axis.
ham_lengths = df.loc[df['label'] == 'ham', 'len']
plt.hist(ham_lengths, density=False, bins=50)
plt.ylabel('[Non spams] word counts');

Feature Engineering: Scaling the created text message length feature

In [None]:
# Additional feature: message length rescaled with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook.
scaler = MinMaxScaler()
# Keep the scaled values as an (n_rows, 1) column array so they can be
# stacked next to the text matrices, and mirror them into the frame.
lenCol = np.array(scaler.fit_transform(df[['len']]))
df[['len']] = lenCol

print('Word Length Column shape: ', lenCol.shape)

Combining TF-IDF, BOW and word length into a single feature set. Please provide the input based on the possibilities below: 1 - Only TF-IDF 2 - Only BOW 3 - Only Word Length 4 - TF-IDF + BOW 5 - TF-IDF + Word Length 6 - BOW + Word Length 7 - TF-IDF + BOW + Word Length


In [None]:
# Feature-set selector; the feature-combining cell compares it against the
# integers 1-7 (7 = TF-IDF + BOW + Word Length).
# NOTE(review): this name shadows the built-in input() for the rest of the
# session; it is kept because the following cell reads it by this name.
input = 7

In [None]:
#Feature combining
# Each selector value maps to the ordered (name, matrix) components to use.
# A single lookup table replaces the previous mix of separate `if` statements
# and an `elif` chain.
feature_sets = {
    1: [('TF-IDF', X_tfidf)],
    2: [('BOW', X_bow)],
    3: [('Word Length', lenCol)],
    4: [('TF-IDF', X_tfidf), ('BOW', X_bow)],
    5: [('TF-IDF', X_tfidf), ('Word Length', lenCol)],
    6: [('BOW', X_bow), ('Word Length', lenCol)],
    7: [('TF-IDF', X_tfidf), ('BOW', X_bow), ('Word Length', lenCol)],
}

# Fail loudly on an unknown selector; previously nothing was assigned, so
# `combinedFeature` was either undefined or silently left over from an
# earlier run of this cell.
if input not in feature_sets:
    raise ValueError(
        'Unknown feature selection {!r}; expected an integer from 1 to 7'.format(input)
    )

selected_components = feature_sets[input]
component_names = [name for name, _ in selected_components]
component_blocks = tuple(block for _, block in selected_components)

# A single component is used as-is; several are stacked column-wise.
if len(component_blocks) == 1:
    combinedFeature = component_blocks[0]
else:
    combinedFeature = hstack(component_blocks)
print('[{}] shape: '.format(' + '.join(component_names)), combinedFeature.shape)

In [None]:
# Target vector: the numeric 'category' column (ham = 1, spam = 0).
y = df.loc[:, 'category']
print('Label(Spam/Ham) shape: ', y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    combinedFeature,
    y,
    test_size=0.3,
    random_state=12,
)

In [None]:
# Sanity check: dimensions of the training and test feature matrices.
train_shape, test_shape = X_train.shape, X_test.shape
print("X train/test shape", train_shape, test_shape)

In [None]:
# Registry of candidate classifiers, keyed by display name; the cells below
# fill it and the evaluation loops iterate over it.
modelDict = dict()

In [None]:
# L2-penalised logistic regression fitted with the liblinear solver.
modelDict['Logistic_Regression'] = LogisticRegression(
    penalty='l2',
    C=1200,
    solver='liblinear',
)

In [None]:
# k-nearest-neighbours classifier voting over the 20 closest training rows.
knn_classifier = KNeighborsClassifier(n_neighbors=20)
modelDict['Knn'] = knn_classifier

In [None]:
# Single decision tree with default hyper-parameters.
# A fixed random_state makes the fitted tree, and therefore the reported
# scores, reproducible from run to run.
modelDict['Decision_Tree'] = DecisionTreeClassifier(random_state=12)

In [None]:
# Random forest of 20 trees.
# Bootstrap sampling and feature sub-sampling are random, so a fixed
# random_state is required for the reported scores to be reproducible.
modelDict['Random_Forest'] = RandomForestClassifier(n_estimators=20, random_state=12)

In [None]:
# Support vector classifier with a sigmoid kernel and an explicit gamma.
modelDict['Support_Vector_Machine'] = SVC(
    kernel='sigmoid',
    gamma=1.19,
)

In [None]:
# Gradient-boosted trees.
# subsample=0.9 and max_features='sqrt' both draw random subsets while
# fitting, so a fixed random_state is required for reproducible scores.
modelDict['Gradient_Boosting'] = GradientBoostingClassifier(
    learning_rate=0.08,
    min_samples_split=500,
    max_depth=8,
    max_features='sqrt',
    subsample=0.9,
    random_state=12,
)

In [None]:
# Bernoulli naive Bayes with default settings.
bernoulli_nb = BernoulliNB()
modelDict['Naive_Bayes_Bernoulli'] = bernoulli_nb
# Gaussian variant is left disabled:
#modelDict['Naive_Bayes_Gaussian'] = GaussianNB()

In [None]:
# 15-fold cross-validation of every registered model, scored by ROC AUC.
# shuffle=True is required for random_state to have any effect: without it,
# older scikit-learn silently ignored the seed and current releases raise a
# ValueError for this argument combination.
# NOTE(review): the features in combinedFeature were fitted on the full
# dataset, so these scores are likely somewhat optimistic.
kfoldValidation = model_selection.KFold(n_splits = 15, shuffle = True, random_state = 7)
for model_name, model in modelDict.items():
    cv_results = model_selection.cross_val_score(model, combinedFeature, y, cv = kfoldValidation, scoring='roc_auc')
    # Mean and standard deviation of the per-fold AUC.
    print(model_name,' : ', cv_results.mean(), cv_results.std())

#output (presumably recorded with the earlier, unshuffled folds; expect small differences):
#Logistic_Regression  :  0.9895978014627492 0.008559605220390407
#Naive_Bayes_Bernoulli  :  0.9900221045410834 0.010792419523689572

Training & testing

In [None]:
# Fit every registered model on the training split and score it on the
# held-out split: accuracy first, then the confusion matrix.
for model_name, model in modelDict.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    print(model_name," : " ,accuracy)
    print(confusion_matrix(y_test, predictions))

#results:
#Logistic_Regression  :  0.9814704124327556
#[[ 200   29]
# [   2 1442]]

#Naive_Bayes_Bernoulli  :  0.972504482964734
#[[ 193   36]
#[  10 1434]]