<h1 align=center>SMS spam detection with various classifiers</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
%matplotlib inline
import pandas_profiling

In [None]:
# Load the raw SMS dataset; the file is read with an explicit latin-1 encoding.
sms = pd.read_csv('input_Data/spam.csv', encoding='latin-1')
sms.head()

In [None]:
# Discard the three 'Unnamed' columns and rename v1/v2 to readable names.
sms = (
    sms
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

In [None]:
sms.head()

In [None]:
sms.tail()

In [None]:
sms.shape

In [None]:
sms.info()

In [None]:
sms.describe()

In [None]:
pandas_profiling.ProfileReport(sms)

In [None]:
sms.columns

In [None]:
sms.index

In [None]:
sms.groupby('Label').describe()

In [None]:
# Character count of every message, stored as an extra column.
sms['length'] = sms['Message'].map(len)
sms.head()

In [None]:
mpl.rcParams['patch.force_edgecolor'] = True
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later dropped the old names; use whichever one this installation provides.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)
# Message-length histograms, one panel per label.
sms.hist(column='length', by='Label', bins=100, figsize=(16, 8));

In [None]:
text_feat = sms['Message'].copy()
text_feat.head()

In [None]:
def text_process(text, stop_words=None):
    """Strip punctuation and stop words from a single message.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop-word
        list is loaded (once per call).

    Returns
    -------
    str
        The remaining words joined by single spaces, original casing kept.
    """
    if stop_words is None:
        # Load the list a single time and keep it in a set: looking it up
        # afresh for every word made each membership test re-read the list.
        stop_words = set(stopwords.words('english'))

    text = text.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
text_feat = text_feat.apply(text_process)
text_feat.head()

In [None]:
# stop_words has to be given by keyword: the first positional parameter of
# TfidfVectorizer is `input`, so a bare "english" never configured stop words.
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer

In [None]:
features = vectorizer.fit_transform(text_feat)
features

### Classifiers and predictions
First of all, let's split our features into train and test sets.

In [None]:
# Fixed seed so this split (and the scores computed from it) is reproducible,
# matching the seed used by the later splits in this notebook.
features_train, features_test, labels_train, labels_test = train_test_split(features, sms['Label'], test_size=0.3, random_state=111)

In [None]:
features_test.shape

In [None]:
features_train.shape

In [None]:
labels_train.shape

In [None]:
labels_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

In [None]:
# One instance of each classifier to compare. Hyper-parameters are hard-coded
# here (how they were chosen is not shown in this notebook); the tree-based and
# ensemble models are seeded with random_state=111.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=49)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)
abc = AdaBoostClassifier(n_estimators=62, random_state=111)
bc = BaggingClassifier(n_estimators=9, random_state=111)
etc = ExtraTreesClassifier(n_estimators=9, random_state=111)

In [None]:
# Short display name -> classifier instance, in the order they are evaluated.
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
)

In [None]:
def train_classifier(clf, feature_train, labels_train):
    """Fit ``clf`` in place on the given training features and labels.

    Nothing is returned; the caller keeps using the same object afterwards.
    """
    clf.fit(feature_train, labels_train)

In [None]:
def predict_labels(clf, features):
    """Return whatever ``clf.predict`` yields for ``features``."""
    predictions = clf.predict(features)
    return predictions

In [None]:
# Fit every classifier on the training split and record its test accuracy.
pred_scores = []
for model_name, model in clfs.items():
    train_classifier(model, features_train, labels_train)
    test_accuracy = accuracy_score(labels_test, predict_labels(model, features_test))
    pred_scores.append((model_name, test_accuracy))

In [None]:
df = pd.DataFrame(pred_scores,columns=['Model_names','Score'])
df

In [None]:
# Bar chart of test accuracy per classifier.
df.plot(kind='bar', ylim=(0.9,1.0), figsize=(11,6), align='center', colormap="Accent")
# Label each bar with its classifier name (df.index is only 0..n-1) and take
# the number of ticks from the data instead of hard-coding 9.
plt.xticks(np.arange(len(df)), df['Model_names'])
plt.ylabel('Accuracy Score')
plt.title('Distribution by Classifier')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

Looks like the ensemble classifiers are not doing as well as expected.

#### Stemmer
It is said that stemming short messages does no good or even harms predictions. Let's try this out.

Define our stemmer function

In [None]:
def stemmer(text):
    """Stem every whitespace-separated word of ``text`` with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Message text; it is split on whitespace.

    Returns
    -------
    str
        The stemmed words joined by single spaces (no trailing space; an
        empty or whitespace-only input gives an empty string).
    """
    # Build the stemmer once per call rather than once per word, under a name
    # that does not shadow this function.
    snowball = SnowballStemmer("english")
    return " ".join(snowball.stem(word) for word in text.split())

In [None]:
text_feat = text_feat.apply(stemmer)
text_feat

In [None]:
features = vectorizer.fit_transform(text_feat)

In [None]:
features_train, features_test, labels_train, labels_test = train_test_split(features, sms['Label'], test_size=0.3, random_state=111)

In [None]:
# Re-fit every classifier on the stemmed features and collect test accuracies.
pred_scores = []
for model_name, model in clfs.items():
    train_classifier(model, features_train, labels_train)
    predicted = predict_labels(model, features_test)
    pred_scores.append((model_name, accuracy_score(labels_test, predicted)))

In [None]:
# Scores after stemming, placed column-wise next to the earlier results.
df2 = pd.DataFrame(pred_scores,columns=['Model_names-stemmer','Score_stemmer'])
# NOTE(review): df is overwritten with the concatenation, so re-running this cell appends the stemmer columns again.
df = pd.concat([df,df2],axis=1)
df

In [None]:
# Bar chart comparing accuracy with and without stemming, per classifier.
df.plot(kind='bar', ylim=[0.85,1.0], figsize=(16,10), align='center', colormap="Accent")
# Label each bar group with its classifier name (df.index is only 0..n-1) and
# take the number of ticks from the data instead of hard-coding 9.
plt.xticks(np.arange(len(df)), df['Model_names'])
plt.ylabel('Accuracy Score')
plt.title('Distribution by Classifier')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

### What have we forgotten? Message length!
Let's append our message length feature to the matrix we fit into our classifiers

In [None]:
# Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
lf = sms['length'].to_numpy()
# toarray() gives a plain ndarray. todense() returns np.matrix, which np.hstack
# would propagate and which recent scikit-learn releases refuse as input.
# Append the message length as one extra column after the TF-IDF columns.
newfeat = np.hstack((features.toarray(), lf[:, None]))
newfeat

In [None]:
# The label column was renamed to 'Label' (capital L) when the data was loaded; 'label' raises KeyError.
features_train, features_test, labels_train, labels_test = train_test_split(newfeat, sms['Label'], test_size=0.3, random_state=111)

In [None]:
# Evaluate every classifier on the features extended with message length.
# Each accuracy is wrapped in a one-element list, as the next cell expects.
pred_scores = []
for model_name, model in clfs.items():
    train_classifier(model, features_train, labels_train)
    predicted = predict_labels(model, features_test)
    pred_scores.append((model_name, [accuracy_score(labels_test, predicted)]))

In [None]:
# pred_scores holds (name, [accuracy]) pairs. DataFrame.from_items was removed
# in pandas 1.0, and indexing Score3 by model name would not line up with df's
# integer index on concat. Build it on the default integer index instead; the
# rows match because every run iterates clfs in the same order.
df3 = pd.DataFrame({'Score3': [scores[0] for _, scores in pred_scores]})
df = pd.concat([df, df3], axis=1)
df

In [None]:
# Bar chart comparing all collected accuracy columns, per classifier.
df.plot(kind='bar', ylim=(0.85,1.0), figsize=(11,6), align='center', colormap="Accent")
# Label each bar group with its classifier name rather than df.index, and
# take the number of ticks from the data instead of hard-coding 9.
plt.xticks(np.arange(len(df)), df['Model_names'])
plt.ylabel('Accuracy Score')
plt.title('Distribution by Classifier')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

This time everyone is doing a little bit worse, except for LogisticRegression and RandomForest. But the winner is still MultinomialNaiveBayes.

#### Voting classifier
We are using ensemble algorithms here, but what about an ensemble of ensembles? Will it beat NB?

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
eclf = VotingClassifier(estimators=[('BgC', bc), ('ETC', etc), ('RF', rfc), ('Ada', abc)], voting='soft')

In [None]:
eclf.fit(features_train,labels_train)

In [None]:
pred = eclf.predict(features_test)

In [None]:
print(accuracy_score(labels_test,pred))