In [None]:
# some necessary imports
import string
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
import seaborn as sns
color = sns.color_palette()
from matplotlib import pyplot as plt
# using plotly since it's very clear to interpret, though seems complicated to code 
from plotly import subplots
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

In [None]:
# Read the SMS corpus from the relative ../input directory into a DataFrame.
# Later cells rely on its 'Category' (label) and 'Message' (text) columns.
train_df = pd.read_csv('../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv')
# Preview the first rows.
train_df.head()

In [None]:
# Show the distinct label values present in the 'Category' column.
train_df['Category'].unique() 

In [None]:
# Binary encoding of the full label column: 1 for 'spam', 0 for anything else.
y = [int(category == 'spam') for category in train_df['Category']]

In [None]:
# Print column dtypes and non-null counts to check for missing values.
train_df.info() 

There are no missing values, so we don't need to handle them.

In [None]:
# Hold out 25% of the messages for validation.
# The classes are imbalanced, so stratify on the label to keep the same
# spam/ham ratio in both parts; random_state keeps the split reproducible.
train_texts, valid_texts, train_y, valid_y = train_test_split(
    train_df['Message'],
    train_df['Category'],
    train_size=.75,
    random_state=5,
    stratify=train_df['Category'],
)

In [None]:
# Bar chart of how many messages carry each label.
trgt_counts = train_df['Category'].value_counts()

trace = go.Bar(
    x=trgt_counts.index,
    y=trgt_counts.values,
    # Colour each bar by its own count.
    marker={
        'color': trgt_counts.values,
        'colorscale': 'Picnic',
        'reversescale': True,
    },
)
layout = go.Layout(title='Target Count', font={'size': 18}, width=400, height=500)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='TargetCount')

Well, from the graph we can see that the dataset is not balanced.

Now we'll analyse n-grams and co-occurrences.

In [None]:
from wordcloud import STOPWORDS # no need to use nltk here
from collections import defaultdict


def generate_ngrams(text, n_gram = 1):
    """Return the space-joined n-grams of ``text``.

    The text is lower-cased and split on single spaces; empty pieces and
    pieces found in ``STOPWORDS`` are dropped before the n-grams are built.

    Parameters:
        text: the message to tokenise.
        n_gram: number of consecutive tokens per n-gram.

    Returns:
        A list of strings, one per n-gram, in order of appearance.
    """
    words = []
    for piece in text.lower().split(' '):
        if piece != '' and piece not in STOPWORDS:
            words.append(piece)
    # Zipping the token list against its own shifted copies yields sliding windows.
    shifted = [words[offset:] for offset in range(n_gram)]
    return [' '.join(window) for window in zip(*shifted)]

In [None]:
# custom function for horizontal bar chart showing n-gram distribution
def horizontal_bar_chart(df, color):
    """Build a horizontal plotly bar trace from a word-frequency table.

    Parameters:
        df: DataFrame with 'word' and 'wordcount' columns.
        color: bar colour passed to the trace marker.

    Returns:
        A ``go.Bar`` trace whose rows are taken from ``df`` in reversed order.
    """
    reversed_words = df['word'].values[::-1]
    reversed_counts = df['wordcount'].values[::-1]
    return go.Bar(
        x=reversed_counts,
        y=reversed_words,
        orientation='h',
        showlegend=False,
        marker={'color': color},
    )


def get_bar_chart(df, ngram = 1, color = 'blue'):
    """Count n-grams over a collection of messages and chart the 50 most frequent.

    Parameters:
        df: iterable of message strings (despite the name, it is iterated
            element by element and each element is passed to generate_ngrams).
        ngram: n-gram size forwarded to generate_ngrams.
        color: bar colour forwarded to horizontal_bar_chart.

    Returns:
        The trace produced by horizontal_bar_chart for the top 50 n-grams.
    """
    freq_dict = defaultdict(int)
    for sent in df:
        for word in generate_ngrams(sent, ngram):
            freq_dict[word] += 1
    # Sort ascending by count and reverse, so the most frequent n-grams come first.
    ranked = sorted(freq_dict.items(), key=lambda item: item[1])[::-1]
    # Naming the columns in the constructor also works when no n-gram was found;
    # assigning `.columns` afterwards fails on a frame that has no columns.
    fd_sorted = pd.DataFrame(ranked, columns=['word', 'wordcount'])
    return horizontal_bar_chart(fd_sorted.head(50), color)


def create_two_subplots(trace0, 
                        trace1, 
                        subplot_titles = ['Freq words of ham mes','Freq words of spam mes'],
                        title = 'Count spam plots', 
                        filename = 'Word_count_plots'
                       ):
    """Render two traces side by side in a single one-row figure.

    Parameters:
        trace0: trace drawn in the left subplot.
        trace1: trace drawn in the right subplot.
        subplot_titles: titles for the left and right subplots.
        title: overall figure title.
        filename: name forwarded to ``py.iplot`` as its ``filename`` argument.

    Returns:
        None. The figure is displayed in the notebook.
    """
    fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.01,
                                 subplot_titles=subplot_titles)
    # add_trace is the supported replacement for the deprecated append_trace.
    fig.add_trace(trace0, row=1, col=1)
    fig.add_trace(trace1, row=1, col=2)
    fig.update_layout(height=1000, width=900, paper_bgcolor='rgb(233,233,233)', title=title)
    # Pass filename by keyword: the second positional parameter of iplot is
    # show_link, so a positional filename was silently treated as that flag.
    py.iplot(fig, filename=filename)

First we'll look at the n-gram distributions:

In [None]:
# Split the training messages by label (train_y still holds the string labels here).
train1_df = train_texts[train_y == 'spam']
train0_df = train_texts[train_y == 'ham']

# Unigram frequencies: ham on the left, spam on the right.
trace0 = get_bar_chart(train0_df, ngram=1, color='blue')
trace1 = get_bar_chart(train1_df, ngram=1, color='red')

# Draw both traces in one figure.
sub_tit = ['Freq words of ham mes', 'Freq words of spam mes']
tit = 'Word_count_plots'
create_two_subplots(trace0, trace1, subplot_titles=sub_tit, title=tit)

In [None]:
# Bigram frequencies: ham on the left, spam on the right.
trace0 = get_bar_chart(train0_df, ngram=2, color='green')
trace1 = get_bar_chart(train1_df, ngram=2, color='yellow')

# Draw both traces in one figure.
sub_tit = ['Bigram freq of ham mes', 'Bigram freq of spam mes']
tit = 'Bigram_count_plots'
create_two_subplots(trace0, trace1, subplot_titles=sub_tit, title=tit)

In [None]:
# Trigram frequencies: ham on the left, spam on the right.
trace0 = get_bar_chart(train0_df, ngram=3, color='brown')
trace1 = get_bar_chart(train1_df, ngram=3, color='orange')

# Draw both traces in one figure.
sub_tit = ['Trigram freq of ham mes', 'Trigram freq of spam mes']
tit = 'Trigram'
create_two_subplots(trace0, trace1, subplot_titles=sub_tit, title=tit)

So in the trigram distribution we can clearly see a specific template for spam messages: "prizes guaranteed", etc. For ham messages, by contrast, we see the friendly, conversational speech typical of messengers.

In [None]:
# Turn each text Series into a DataFrame with a 'message' column and attach
# the matching labels as a 'label' column (pandas aligns them on the index).
train_texts = train_texts.to_frame(name='message').assign(label=train_y)
valid_texts = valid_texts.to_frame(name='message').assign(label=valid_y)

Since we are working with texts, we can do feature engineering and build extra features to improve our model.

We'll build the following features: number of words, number of unique words, number of characters, number of stopwords, number of punctuation characters, number of upper-case words, number of title-case words, and mean word length.

In [None]:
# creating some extra features for better prediction accuracy

def add_text_features(frame):
    """Add hand-crafted numeric columns derived from ``frame['message']``, in place.

    Columns added, in this order: num_words, num_unique_words, num_chars,
    num_stopwords, num_punctuations, num_words_upper, num_words_title,
    mean_word_len.

    Parameters:
        frame: DataFrame with a 'message' column; it is modified in place.

    Returns:
        None.
    """
    punctuation = set(string.punctuation)
    messages = frame['message'].apply(str)
    words = messages.apply(lambda msg: msg.split())

    frame['num_words'] = words.apply(len)
    frame['num_unique_words'] = words.apply(lambda ws: len(set(ws)))
    frame['num_chars'] = messages.apply(len)
    frame['num_stopwords'] = messages.apply(
        lambda msg: sum(w in STOPWORDS for w in msg.lower().split()))
    frame['num_punctuations'] = messages.apply(
        lambda msg: sum(ch in punctuation for ch in msg))
    # Count WORDS, not characters: iterating the raw string visited single
    # characters, which made the "upper" and "title" features identical
    # counts of capital letters.
    frame['num_words_upper'] = words.apply(lambda ws: sum(w.isupper() for w in ws))
    frame['num_words_title'] = words.apply(lambda ws: sum(w.istitle() for w in ws))
    # A message without any word gets 0.0 instead of the NaN np.mean([]) produces.
    frame['mean_word_len'] = words.apply(
        lambda ws: np.mean([len(w) for w in ws]) if ws else 0.0)


# Apply the same feature definitions to both splits.
add_text_features(train_texts)
add_text_features(valid_texts)

Let's look at the boxplots to make sure we've built features that differ between the classes:

In [None]:
# Truncate some extreme values for better visuals.
# The clipping is done on a separate frame: train_texts is later used to build
# the training matrix, so clipping it in place would alter the model's
# training features (and only for the training split).
plot_df = train_texts.assign(
    num_words=train_texts['num_words'].clip(upper=60),
    num_punctuations=train_texts['num_punctuations'].clip(upper=10),
    num_chars=train_texts['num_chars'].clip(upper=350),
)

# One boxplot per feature, split by label.
panels = [
    ('num_words', 'Number of words in each class'),
    ('num_chars', 'Number of characters in each class'),
    ('num_punctuations', 'Number of punctuations in each class'),
]
f, axes = plt.subplots(3, 1, figsize=(10,20))
for ax, (column, panel_title) in zip(axes, panels):
    sns.boxplot(x='label', y=column, data=plot_df, ax=ax)
    ax.set_xlabel('Label', fontsize=12)
    ax.set_title(panel_title, fontsize=15)
f.savefig('distributions.pdf', bbox_inches='tight')

On the boxplots we can see that the distributions for 'ham' and 'spam' messages differ a lot. Thus, these features should help our model (they carry class-specific signal).

In [None]:
# get the tfidf vectors
# Fit the vocabulary and IDF weights on the training messages only; fitting on
# train + validation text leaks validation information into the features.
tfidf_vec = TfidfVectorizer(stop_words ='english', ngram_range=(1,3))
train_tfidf = tfidf_vec.fit_transform(train_texts['message'].tolist())
valid_tfidf = tfidf_vec.transform(valid_texts['message'].tolist())

In [None]:
# (number of training messages, number of TF-IDF features)
train_tfidf.shape

In [None]:
# Import the submodule explicitly instead of relying on it having been loaded
# as a side effect of another import; this still binds the name `scipy`.
import scipy.sparse

# Concatenate the hand-crafted numeric columns to the right of the TF-IDF
# features. The columns are passed as a float array, and CSR is requested
# because hstack returns COO by default.
train_matrix = scipy.sparse.hstack(
    [train_tfidf, train_texts.drop(['message', 'label'], axis=1).values.astype(float)],
    format='csr',
)
valid_matrix = scipy.sparse.hstack(
    [valid_tfidf, valid_texts.drop(['message', 'label'], axis=1).values.astype(float)],
    format='csr',
)

Creating lists for metrics:

In [None]:
# One list per metric; every model evaluated below appends its score to each.
f1s, accuracies, rocs = [], [], []

# Naive Bayesian estimator from ```sklearn```

In [None]:
# Encode the labels as 1 (spam) / 0 (anything else).
# They are read from the 'label' columns, which keep the original strings,
# rather than from train_y / valid_y themselves: re-encoding an already
# encoded list would silently turn every label into 0 if this cell is re-run.
train_y = (train_texts['label'] == 'spam').astype(int).tolist()
valid_y = (valid_texts['label'] == 'spam').astype(int).tolist()

In [None]:
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score as roc
from sklearn.metrics import accuracy_score as accuracy

# Train Gaussian naive Bayes on the extended matrix; both matrices are
# densified with toarray() before being handed to the estimator.
nb = naive_bayes.GaussianNB().fit(train_matrix.toarray(), train_y)
np_preds = nb.predict(valid_matrix.toarray())

In [None]:
# Record the naive Bayes scores on the validation split.
f1s.append(metrics.f1_score(valid_y, np_preds))
accuracies.append(accuracy(valid_y, np_preds))
# ROC-AUC ranks samples by score, so feed it the predicted spam probability
# (column 1, the positive class) rather than the thresholded 0/1 predictions.
rocs.append(roc(valid_y, nb.predict_proba(valid_matrix.toarray())[:, 1]))

In [None]:
# Report the most recently appended scores (naive Bayes).
print("F_1 score is {}".format(f1s[-1]) )
print("Accuracy is {}".format(accuracies[-1]) ) 
print("ROC-AUC score is {}".format(rocs[-1]) )

# kNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer

In [None]:
# 15-nearest-neighbours classifier trained on the extended feature matrix.
knn = KNeighborsClassifier(n_neighbors=15).fit(train_matrix, train_y)
knn_preds = knn.predict(valid_matrix)

In [None]:
# Record the kNN scores on the validation split.
f1s.append(metrics.f1_score(valid_y, knn_preds))
accuracies.append(accuracy(valid_y, knn_preds))
# ROC-AUC ranks samples by score, so feed it the predicted spam probability
# (column 1, the positive class) rather than the thresholded 0/1 predictions.
rocs.append(roc(valid_y, knn.predict_proba(valid_matrix)[:, 1]))

In [None]:
# Report the most recently appended scores (kNN).
print("F_1 score is {}".format(f1s[-1]) )
print("Accuracy is {}".format(accuracies[-1]) )
print("ROC-AUC score is {}".format(rocs[-1]) )

# Logistic Regression approach
We begin with only the tf-idf features; later we'll repeat this with the extended matrix to compare quality.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as accuracy
from seaborn import heatmap

In [None]:
# Grid-search logistic regression on the TF-IDF features only.
est = LogisticRegression()
# The grid is split per solver because 'newton-cg' and 'sag' only support the
# l2 penalty; pairing them with l1 only produces fits that fail.
param_grid = [
    {'C' : np.arange(1, 10, 1), 'solver' : ['liblinear'], 'penalty' : ['l1', 'l2']},
    {'C' : np.arange(1, 10, 1), 'solver' : ['newton-cg', 'sag'], 'penalty' : ['l2']},
]
grid_search = GridSearchCV(est, param_grid, n_jobs = -1, cv=5, verbose = 0)
grid_search.fit(train_tfidf, train_y)

In [None]:
# Parameter combination with the best cross-validated score in the search above.
grid_search.best_params_

In [None]:
# Refit logistic regression on the TF-IDF features with the parameters the
# grid search selected, instead of values copied by hand from an earlier run.
logit = LogisticRegression(**grid_search.best_params_)
logit.fit(train_tfidf, train_y)
logit_preds = logit.predict(valid_tfidf)

In [None]:
# Record the TF-IDF-only logistic regression scores on the validation split.
f1s.append(metrics.f1_score(valid_y, logit_preds))
accuracies.append(accuracy(valid_y, logit_preds))
# ROC-AUC ranks samples by score, so feed it the predicted spam probability
# (column 1, the positive class) rather than the thresholded 0/1 predictions.
rocs.append(roc(valid_y, logit.predict_proba(valid_tfidf)[:, 1]))

In [None]:
# Report the most recently appended scores (logistic regression, TF-IDF only).
print("F_1 score is {}".format(f1s[-1]) )
print("Accuracy is {}".format(accuracies[-1]) )
print("ROC-AUC score is {}".format(rocs[-1]) )

In [None]:
# Confusion matrix of the TF-IDF-only logistic regression, drawn as an
# annotated heatmap with integer cell labels.
heatmap(confusion_matrix(valid_y, logit_preds) , annot= True, fmt = 'd', cmap="YlGnBu")
# Presumably here so the cell does not echo heatmap's return value.
None

# Logreg with extended matrix:

In [None]:
# Grid-search logistic regression on the extended matrix
# (TF-IDF plus the hand-crafted numeric columns).
est = LogisticRegression()
# The grid is split per solver because 'newton-cg' and 'sag' only support the
# l2 penalty; pairing them with l1 only produces fits that fail.
param_grid = [
    {'C' : np.arange(1, 10, 1), 'solver' : ['liblinear'], 'penalty' : ['l1', 'l2']},
    {'C' : np.arange(1, 10, 1), 'solver' : ['newton-cg', 'sag'], 'penalty' : ['l2']},
]
grid_search = GridSearchCV(est, param_grid, n_jobs = -1, cv=5, verbose = 0)
grid_search.fit(train_matrix, train_y)

In [None]:
# Parameter combination with the best cross-validated score in the search above.
grid_search.best_params_

In [None]:
# Refit logistic regression on the extended matrix with the parameters the
# grid search selected, instead of values copied by hand from an earlier run.
logit_ext = LogisticRegression(**grid_search.best_params_)
logit_ext.fit(train_matrix, train_y)
logit_ext_preds = logit_ext.predict(valid_matrix)

In [None]:
# Record the extended-matrix logistic regression scores on the validation split.
f1s.append(metrics.f1_score(valid_y, logit_ext_preds))
accuracies.append(accuracy(valid_y, logit_ext_preds))
# ROC-AUC ranks samples by score, so feed it the predicted spam probability
# (column 1, the positive class) rather than the thresholded 0/1 predictions.
rocs.append(roc(valid_y, logit_ext.predict_proba(valid_matrix)[:, 1]))

In [None]:
# Report the most recently appended scores (logistic regression, extended matrix).
print("F_1 score is {}".format(f1s[-1]) )
print("Accuracy is {}".format(accuracies[-1]) ) 
print("ROC-AUC score is {}".format(rocs[-1]) )

In [None]:
# Confusion matrix of the extended-matrix logistic regression, drawn as an
# annotated heatmap with integer cell labels.
heatmap(confusion_matrix(valid_y, logit_ext_preds) , annot= True, fmt = 'd', cmap="YlGnBu")
# Presumably here so the cell does not echo heatmap's return value.
None

# SVM approach

In [None]:
from sklearn.svm import SVC

# Grid-search the regularisation strength of an SVC with default settings.
svm = SVC()
# Only C is searched: `degree` applies to the polynomial kernel and is ignored
# by SVC's default RBF kernel, so searching it multiplied the number of fits
# by four without changing any model.
params = {'C': np.arange(1, 12, 1)}
grid = GridSearchCV(svm, params, n_jobs = -1, cv = 5)
grid.fit(train_matrix, train_y)

In [None]:
# Parameter combination with the best cross-validated score in the SVC search.
grid.best_params_

In [None]:
# Refit the SVC on the extended matrix with the C value the grid search
# selected, instead of a value copied by hand from an earlier run.
svm = SVC(C = grid.best_params_['C'])
svm.fit(train_matrix, train_y)
svm_preds = svm.predict(valid_matrix)

In [None]:
# Record the SVC scores on the validation split.
f1s.append(metrics.f1_score(valid_y, svm_preds))
accuracies.append(accuracy(valid_y, svm_preds))
# ROC-AUC ranks samples by score, so feed it the signed distance to the
# decision boundary rather than the thresholded 0/1 predictions. This SVC is
# built without probability=True, so decision_function is used.
rocs.append(roc(valid_y, svm.decision_function(valid_matrix)))

In [None]:
# Report the most recently appended scores (SVC).
print("F_1 score is {}".format(f1s[-1]) ) 
print("Accuracy is {}".format(accuracies[-1]) ) 
print("ROC-AUC score is {}".format(rocs[-1]) )

In [None]:
# Label each collected score with its model name; the order matches the order
# in which the models were evaluated above.
methods = ['bayesian', 'knn', 'logit', 'logit-ext', 'svm']
f1s_s, accuracies_s, rocs_s = (
    pd.Series(scores, index=methods) for scores in (f1s, accuracies, rocs)
)

In [None]:
# One bar trace per metric (F_1, accuracy, ROC-AUC), with the models on the x axis.
trace0, trace1, trace2 = (
    go.Bar(x=scores.index, y=scores, name=label)
    for scores, label in (
        (f1s_s, "F_1"),
        (accuracies_s, "Accuracy"),
        (rocs_s, "ROC-AUC"),
    )
)

data = [trace0, trace1, trace2]
layout = {"title": "Comparison of methods on metrics distribution"}

# Assemble the figure and show it in the notebook.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, show_link=False)

### Clearly, the logit-ext method is the best one for this case

Now we'll use the ```eli5``` library to show the weights of our best model. This lets us interpret it and see the impact of the most heavily weighted n-grams on classifying a sample as "spam" or "ham".

In [None]:
import eli5
# Show the 50 largest weights of the TF-IDF-only logistic regression, naming
# the features through the fitted vectorizer and hiding the '<BIAS>' entry.
eli5.show_weights(logit, vec=tfidf_vec, top = 50, feature_filter=lambda x: x != '<BIAS>')

In [None]:
# logit_ext was trained on the TF-IDF columns followed by the hand-crafted
# numeric columns, so the vectorizer alone cannot name every coefficient.
# Build the complete list of names in column order and pass it explicitly.
tfidf_names = sorted(tfidf_vec.vocabulary_, key=tfidf_vec.vocabulary_.get)
extra_names = list(train_texts.drop(['message', 'label'], axis=1).columns)
eli5.show_weights(logit_ext, feature_names=tfidf_names + extra_names, top=50,
                  feature_filter=lambda x: x != '<BIAS>')

Using eli5 we can see that the biggest weights for the spam class belong to words like "claim", "awarded", "reply", "service" — it makes sense, since spam messages usually tell us that we were awarded something or offer a service. For ham messages, the heaviest words are those typical of SMS chatting with friends.