In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files_in_dir in os.walk('/kaggle/input'):
    for file_name in files_in_dir:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import regex as re
from sklearn.svm import SVC
from wordcloud import WordCloud, ImageColorGenerator

In [None]:
# Load the SMS train and test datasets

# Both CSV files are decoded as cp1252 because the dataset is not suitable
# to be read with utf8 encoding.
csv_encoding = 'cp1252'
train = pd.read_csv("../input/email-classification-nlp/SMS_train.csv", encoding=csv_encoding)
test = pd.read_csv("../input/email-classification-nlp/SMS_test.csv", encoding=csv_encoding)

# Preview the first rows of the training data
train.head()

In [None]:
# Print pandas' summary of the training DataFrame (DataFrame.info()).
train.info()

# Data preprocessing

In [None]:
def tokenize(x):
    """Return the list of word tokens (runs of word characters) found in the string ``x``."""
    return RegexpTokenizer(r'\w+').tokenize(x)
                                
def stemmer(x):
    """Porter-stem every token in the iterable ``x`` and join the stems with single spaces."""
    porter = PorterStemmer()
    stemmed_words = map(porter.stem, x)
    return ' '.join(stemmed_words)
 
def lemmatize(x):
    """Lemmatize every token in the iterable ``x`` with WordNet and join the lemmas with single spaces."""
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, x)
    return ' '.join(lemmas)

# English stop words from NLTK (used later when drawing the word clouds)
stop_words = stopwords.words('english')


def _clean_messages(messages):
    """Return a cleaned copy of a Series of raw message strings.

    The substitutions are applied in this order:
      1. remove the special characters matched by ``[#,@,&]`` (the commas are
         part of the character class, so commas are removed as well)
      2. remove space-delimited runs of digits
      3. remove ``www``
      4. remove urls (``http`` followed by non-whitespace)
      5. collapse runs of whitespace into a single space
      6. remove whitespace-delimited single letters

    ``regex=True`` is passed explicitly because pandas 2.0 changed the default
    of ``Series.str.replace`` to ``regex=False``, which would treat every
    pattern below as a literal string and silently clean nothing.
    """
    substitutions = [
        (r'[#,@,&]', ''),
        # Digits and single letters are replaced by ONE space, not by an empty
        # string: the patterns consume the surrounding whitespace, so an empty
        # replacement would glue the neighbouring words together
        # ("call 0800 now" -> "callnow").
        (r' \d+ ', ' '),
        (r'w{3}', ''),
        (r'http\S+', ''),
        (r'\s+', ' '),
        (r'\s+[a-zA-Z]\s+', ' '),
    ]
    for pattern, replacement in substitutions:
        messages = messages.str.replace(pattern, replacement, regex=True)
    return messages


def _add_text_features(frame):
    """Clean ``frame['Message_body']`` in place and add the derived text columns.

    Adds ``tokens`` (list of word tokens), ``lemma`` (space-joined lemmas) and
    ``stems`` (space-joined Porter stems). Returns the same DataFrame.
    """
    frame['Message_body'] = _clean_messages(frame['Message_body'])
    frame['tokens'] = frame['Message_body'].map(tokenize)
    frame['lemma'] = frame['tokens'].map(lemmatize)
    frame['stems'] = frame['tokens'].map(stemmer)
    return frame


# Apply exactly the same preprocessing to the train and the test dataset
train = _add_text_features(train)
test = _add_text_features(test)

In [None]:
# Preview the preprocessed training data (first rows only, including the new
# tokens / lemma / stems columns) instead of displaying the whole frame.
train.head()

In [None]:
# Word cloud of the messages labelled 'Spam' in the train set

# Lower-case every spam message and merge them into one comma-separated string
is_spam = train.Label == 'Spam'
message_body_spam = ",".join(text.lower() for text in train.Message_body[is_spam])

# Build the word cloud image from that text
cloud_options = dict(
    max_font_size=50,
    max_words=70,
    stopwords=stop_words,
    scale=5,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(message_body_spam)

# Render the image without axes
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in spam mails', fontsize=15)
plt.show()

In [None]:
# WordCloud for Non-spam marked emails in train set

# Get a single lower-cased, comma-separated string of all non-spam messages.
# It is stored under its own name: the original code reused
# `message_body_spam`, which overwrote the spam text with non-spam text
# under a misleading name.
message_body_non_spam = ",".join(
    mail.lower() for mail in train.Message_body[train.Label == 'Non-Spam']
)

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50,
                      max_words=70,
                      stopwords=stop_words,
                      scale=5,
                      background_color="white").generate(message_body_non_spam)

# Display the generated image:
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in not spam mails', fontsize=15)
plt.show()

In [None]:
# Features are the lemmatized messages, targets are their labels
data_tweets = train['lemma']
data_labels = train['Label']

# Hold out 25% of the training data; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.25, 'random_state': 42}
train_X, test_X, train_y, test_y = train_test_split(data_tweets, data_labels, **split_options)

# The separately loaded test file serves as the validation set
val_X = test['lemma']
val_y = test['Label']

# Naive Bayes Pipeline

In [None]:
# TF-IDF vectorizer followed by a multinomial Naive Bayes classifier
pipe_mnnb = Pipeline(steps=[
    ('tf', TfidfVectorizer()),
    ('mnnb', MultinomialNB()),
])

# Hyper-parameter grid: vectorizer settings plus the NB smoothing parameter
pgrid_mnnb = dict(
    tf__max_features=[1000, 2000, 3000],
    tf__stop_words=['english', None],
    tf__ngram_range=[(1, 1), (1, 2)],
    tf__use_idf=[True, False],
    mnnb__alpha=[0.1, 0.5, 1],
)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores
gs_mnnb = GridSearchCV(estimator=pipe_mnnb, param_grid=pgrid_mnnb, cv=5, n_jobs=-1, verbose=2)

# Run the search on the training split
gs_mnnb.fit(train_X, train_y)

In [None]:
# Check the best parameters for our model:
# the hyper-parameter combination selected by the grid search for the Naive Bayes pipeline
gs_mnnb.best_params_

In [None]:
# Score of the tuned Naive Bayes pipeline on the training split and on the held-out test split
mnnb_train_score = gs_mnnb.score(train_X, train_y)
mnnb_test_score = gs_mnnb.score(test_X, test_y)
print('Score of train set', mnnb_train_score)
print('Score of test set', mnnb_test_score)

In [None]:
## Naive Bayes: predictions on the val set, confusion matrix on the held-out test split

# Predict labels for the validation messages and store them next to the data.
# The same 'preds' column is also written by the other model cells.
preds_mnnb = gs_mnnb.predict(val_X)
test['preds'] = preds_mnnb

# Generate the confusion matrix on the held-out test split, normalized over
# the true labels (normalize='true').
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; switch to ConfusionMatrixDisplay.from_estimator when upgrading.
matrix_nb = plot_confusion_matrix(gs_mnnb, test_X, test_y,
                                  cmap=plt.cm.Blues,
                                  normalize='true')

plt.title('Confusion matrix for NB classifier')
# plt.show() does not take the display object as an argument (the original
# positional argument only worked by accident on the inline backend), and
# one call is enough to render the figure.
plt.show()

# Logistic Regression

In [None]:
# TF-IDF vectorizer followed by a logistic regression classifier.
# random_state makes the stochastic 'saga' solver used below reproducible.
pipe_lgrg = Pipeline(steps = [('tf', TfidfVectorizer()),
                              ('lgrg', LogisticRegression(random_state=42))])

# Create Parameter Grid
# Settings shared by every sub-grid.
shared_grid = {
 'tf__max_features' : [1000, 2000, 3000],
 'tf__ngram_range' : [(1,1),(1,2)],
 'tf__use_idf' : [True, False],
 'lgrg__class_weight' : ['balanced', None],
}

# The penalties are split into sub-grids because not every penalty is valid
# with every solver: the default 'lbfgs' solver only supports 'l2' and 'none',
# so the original single grid made every 'l1' / 'elasticnet' fit fail.
# Those two penalties are searched with the 'saga' solver instead.
pgrid_lgrg = [
 # L2 penalty with the default solver
 {**shared_grid, 'lgrg__penalty' : ['l2'], 'lgrg__C' : [1.0, 0.9]},
 # No penalty: C is ignored in this case, so it is not searched
 {**shared_grid, 'lgrg__penalty' : ['none']},
 # L1 penalty (saga usually needs more iterations to converge)
 {**shared_grid, 'lgrg__penalty' : ['l1'], 'lgrg__solver' : ['saga'],
  'lgrg__max_iter' : [1000], 'lgrg__C' : [1.0, 0.9]},
 # Elastic net additionally requires an explicit l1_ratio
 {**shared_grid, 'lgrg__penalty' : ['elasticnet'], 'lgrg__solver' : ['saga'],
  'lgrg__l1_ratio' : [0.5], 'lgrg__max_iter' : [1000], 'lgrg__C' : [1.0, 0.9]},
]

# Apply GridSearch to Pipeline to find the best parameters
gs_lgrg = GridSearchCV(pipe_lgrg, pgrid_lgrg, cv=5, n_jobs=-1, verbose=2)

# Fit the model
gs_lgrg.fit(train_X, train_y)

In [None]:
# Hyper-parameter combination selected by the grid search for the logistic regression pipeline
gs_lgrg.best_params_

In [None]:
# Score of the tuned logistic regression pipeline on the training split and on the held-out test split
lgrg_train_score = gs_lgrg.score(train_X, train_y)
lgrg_test_score = gs_lgrg.score(test_X, test_y)
print('Score of train set', lgrg_train_score)
print('Score of test set', lgrg_test_score)

In [None]:
## LR: predictions on the val set, confusion matrix on the held-out test split

# Predict labels for the validation messages and store them next to the data.
# The same 'preds' column is also written by the other model cells.
preds_lgrg = gs_lgrg.predict(val_X)
test['preds'] = preds_lgrg

# Generate the confusion matrix on the held-out test split, normalized over
# the true labels (normalize='true').
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; switch to ConfusionMatrixDisplay.from_estimator when upgrading.
matrix_lr = plot_confusion_matrix(gs_lgrg, test_X, test_y,
                                  cmap=plt.cm.Blues,
                                  normalize='true')

plt.title('Confusion matrix for LR classifier')
# plt.show() does not take the display object as an argument (the original
# positional argument only worked by accident on the inline backend), and
# one call is enough to render the figure.
plt.show()

# SVC

In [None]:
# TF-IDF vectorizer followed by a support vector classifier
pipe_svc = Pipeline(steps = [('tf', TfidfVectorizer()), ('svc', SVC())])

# Create Parameter Grid
# Two entries of the original grid were dropped:
#  * kernel='precomputed' expects a square, precomputed kernel matrix as input,
#    so every fit on the TF-IDF feature matrix failed;
#  * decision_function_shape only changes the shape of decision_function()
#    output, not the predictions or the score, so searching it merely doubled
#    the number of fits.
pgrid_svc = {
 'tf__max_features' : [1000, 2000, 3000],
 'tf__ngram_range' : [(1,1),(1,2)],
 'tf__use_idf' : [True, False],
 'svc__kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
 'svc__C' : [1.0, 0.9, 0.8, 0.7]
}

# Apply GridSearch to Pipeline to find the best parameters
gs_svc = GridSearchCV(pipe_svc, pgrid_svc, cv=5, n_jobs=-1, verbose=2)

# Fit the model
gs_svc.fit(train_X, train_y)

In [None]:
# Hyper-parameter combination selected by the grid search for the SVC pipeline
gs_svc.best_params_

In [None]:
# Score of the tuned SVC pipeline on the training split and on the held-out test split
svc_train_score = gs_svc.score(train_X, train_y)
svc_test_score = gs_svc.score(test_X, test_y)
print('Score of train set', svc_train_score)
print('Score of test set', svc_test_score)

In [None]:
## SVC: predictions on the val set, confusion matrix on the held-out test split

# Predict labels for the validation messages and store them next to the data.
# The same 'preds' column is also written by the other model cells.
preds_svc = gs_svc.predict(val_X)
test['preds'] = preds_svc

# Generate the confusion matrix on the held-out test split, normalized over
# the true labels (normalize='true').
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; switch to ConfusionMatrixDisplay.from_estimator when upgrading.
matrix_svc = plot_confusion_matrix(gs_svc, test_X, test_y,
                                   cmap=plt.cm.Blues,
                                   normalize='true')

plt.title('Confusion matrix for SVC classifier')
# plt.show() does not take the display object as an argument (the original
# positional argument only worked by accident on the inline backend), and
# one call is enough to render the figure.
plt.show()

# Choose the best model based on score

In [None]:
# The three fitted grid searches to compare
models = [gs_mnnb, gs_lgrg, gs_svc]

# Pair every model with its score on the held-out test split,
# ordered from the highest score to the lowest
scores = sorted(
    [(model.score(test_X, test_y), model) for model in models],
    key=lambda pair: pair[0],
    reverse=True,
)

print('Results for the three models: ')
for test_score, grid_search in scores:
    # estimator[1] is the second pipeline step, i.e. the classifier
    classifier = grid_search.estimator[1]
    print('The model {} has reached {} accuracy on test set'.format(classifier, round(test_score, 2)))


In [None]:
# `scores` is ordered best-first and every entry is a (score, model) pair,
# so the winning model is the last element of the first entry
top_entry = scores[0]
best_model = top_entry[-1]
print(best_model)