In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Relative path to the SMS spam CSV that the cells below read.
data_path = "../input/sms-spam-collection-dataset/spam.csv"

In [None]:
# Peek at the first raw lines to check the header, delimiter and encoding
# before parsing the file with pandas. Only the previewed lines are read:
# zip() stops after five items, so the rest of the file is never loaded.
with open(data_path, 'r', encoding="ISO-8859-1") as f:
  lines = [line for _, line in zip(range(5), f)]
  print(lines)

In [None]:
# Load only the two columns used in this notebook: v1 (the spam/ham label)
# and v2 (the message text). The file is decoded as ISO-8859-1.
data = pd.read_csv(data_path, encoding="ISO-8859-1", usecols=['v1', 'v2'])

In [None]:
# Quick look at the first rows of the loaded frame.
data.head()

# EDA

## Target Variable

In [None]:
# Class balance of the target column: bar chart plus the share of each class.
sns.countplot(x='v1', data=data)
plt.show()

# The mean of a boolean mask is the fraction of rows in that class; the ':.2%'
# format turns it into a real percentage so the value matches the "% of" label.
spam_share = (data['v1'] == 'spam').mean()
ham_share = (data['v1'] == 'ham').mean()
print(f"% of Spam Observations {spam_share:.2%}")
print(f"% of Non Spam Observations {ham_share:.2%}")

## Analyzing Text

In [None]:
def count_punctuation(x):
  """Count characters of ``x`` that are not ASCII letters, digits, '.', ',' or '/'.

  Space characters are removed first, so they are never counted. Every other
  character outside the allowed set (other whitespace and non-ASCII text
  included) contributes one to the result.
  """
  without_spaces = re.sub(r" ", "", x)
  return sum(1 for _ in re.finditer(r'[^A-Za-z0-9.,/]', without_spaces))

def count_capitals(x):
  """Count the runs of two or more consecutive uppercase ASCII letters in ``x``.

  Each "shouted" token such as ``FREE`` or ``WIN`` counts once, wherever it
  appears in the message. The pattern is deliberately not anchored with ``^``:
  an anchored pattern can only match at the very start of the string, which
  would cap the result at 1. Spaces are left in place so that neighbouring
  upper-case words ("FREE ENTRY") are counted as separate runs rather than
  being fused into one.

  Args:
    x: The message text.

  Returns:
    The number of non-overlapping upper-case runs of length >= 2.
  """
  return len(re.findall(r'[A-Z]{2,}', x))

In [None]:
# Class sizes; used as denominators for the per-class averages printed below.
spam_mask = data['v1'] == 'spam'
ham_mask = data['v1'] == 'ham'
num_spam = data[spam_mask].shape[0]
num_ham = data[ham_mask].shape[0]

# Hand-crafted numeric features derived from the raw message text. The column
# order (Punct Count, Cap Count, Text Len, target) is relied on by later cells.
messages = data['v2']
new_features = pd.DataFrame(data={"Punct Count": messages.apply(count_punctuation),
                                  "Cap Count": messages.apply(count_capitals),
                                  "Text Len": messages.apply(len)})
new_features['target'] = data['v1']

# Split the feature frame once per class instead of re-filtering for every metric.
spam_rows = new_features[new_features['target'] == 'spam']
ham_rows = new_features[new_features['target'] == 'ham']

average_spam_punct_count = spam_rows['Punct Count'].sum() / num_spam
average_ham_punct_count = ham_rows['Punct Count'].sum() / num_ham
print("Average Number of Punctuations for Spam: {:.3f}".format(average_spam_punct_count))
print("Average Number of Punctuations for Ham: {:.3f}".format(average_ham_punct_count), end='\n\n')

average_spam_cap_count = spam_rows['Cap Count'].sum() / num_spam
average_ham_cap_count = ham_rows['Cap Count'].sum() / num_ham
print("Average Number of Capitals for Spam: {:.3f}".format(average_spam_cap_count))
print("Average Number of Capitals for Ham: {:.3f}".format(average_ham_cap_count), end='\n\n')

average_spam_len = spam_rows['Text Len'].sum() / num_spam
average_ham_len = ham_rows['Text Len'].sum() / num_ham
print("Average Text Length for Spam: {:.3f}".format(average_spam_len))
print("Average Text Length for Ham: {:.3f}".format(average_ham_len))


# Predictive Modelling

## Using just the numerical attributes from text

In [None]:
# Preview the engineered feature columns together with the target label.
new_features.head()

In [None]:
# Normalizing numerical data
#
# MinMaxScaler rescales every column independently to [0, 1], so one scaler
# fitted on the three feature columns gives the same values as three separate
# single-column scalers.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so the min/max of rows later used for testing influence the training features.
numeric_cols = ['Punct Count', 'Cap Count', 'Text Len']
scaler = MinMaxScaler()
new_features[numeric_cols] = scaler.fit_transform(new_features[numeric_cols])

In [None]:
# Preview the feature frame again to inspect the values after min-max scaling.
new_features.head()

In [None]:
# Feature matrix: the three engineered numeric columns, selected by name
# (they are also the first three columns of the frame, in this order).
X = new_features[['Punct Count', 'Cap Count', 'Text Len']].to_numpy()

# Encode the string labels as integers; the fitted encoder is kept so that
# lb.classes_ can be used to map the integers back to label names.
lb = LabelEncoder()
target_labels = new_features['target'].tolist()
Y = lb.fit_transform(target_labels)

In [None]:
# Labels learned by the encoder; the label at position i is encoded as integer i.
lb.classes_

In [None]:
# Single stratified hold-out split. StratifiedKFold yields two folds, but only
# the final one is kept: the models below are trained on X_train and evaluated
# on X_test once, and no cross-validated score is computed.
skf = StratifiedKFold(n_splits=2)
*_, (train_index, test_index) = skf.split(X, Y)

# Report the split sizes rather than dumping the full index arrays.
print("TRAIN:", len(train_index), "TEST:", len(test_index))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = Y[train_index], Y[test_index]

In [None]:
# Baseline: logistic regression with default settings, fitted on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Hard class predictions (for the classification report) and per-class
# probabilities (for the AUC) on the held-out split.
preds = lr.predict(X_test)
pred_probs = lr.predict_proba(X_test)

In [None]:
# Per-class precision/recall/F1, then ROC AUC.
# pred_probs[:, 1] is the predicted probability of the class encoded as 1.
print(classification_report(y_test, preds))
print("AUC Score for Logistic Regression: {:.3f}".format(roc_auc_score(y_test, pred_probs[:, 1])))

## Using TF-IDF with Logistic Classifier

In [None]:
# Word bigrams and trigrams only. With integer values, min_df/max_df are
# absolute document counts: a term must occur in at least 5 and at most 600
# messages to enter the vocabulary.
# NOTE(review): the vocabulary and IDF weights are fitted on every message,
# including the ones later used for testing - confirm this is acceptable.
tfidf = TfidfVectorizer(ngram_range=(2, 3), max_df=600, min_df=5)
tfidf.fit(data['v2'].tolist())

In [None]:
# Vectorize every message with the fitted TF-IDF model (sparse result).
all_messages = data['v2'].tolist()
tfidf_feats = tfidf.transform(all_messages)
print(tfidf_feats.shape)

# Merge our numerical Text features(Punctuation/Capital Counts) with Tfidf features

# Densify the TF-IDF matrix, then append the numeric columns on its right.
X_tf = tfidf_feats.toarray()
X_all = np.hstack((X_tf, X))
print(X_all.shape)

In [None]:
# Single stratified hold-out split of the merged feature matrix.
# StratifiedKFold yields two folds, but only the final one is kept: the models
# below are trained on X_train and evaluated on X_test once, and no
# cross-validated score is computed.
skf = StratifiedKFold(n_splits=2)
*_, (train_index, test_index) = skf.split(X_all, Y)

# Report the split sizes rather than dumping the full index arrays.
print("TRAIN:", len(train_index), "TEST:", len(test_index))
X_train, X_test = X_all[train_index], X_all[test_index]
y_train, y_test = Y[train_index], Y[test_index]

In [None]:
# Logistic regression with default settings on the current training split;
# keeps both hard predictions and class probabilities for the test split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
preds = lr.predict(X_test)
pred_probs = lr.predict_proba(X_test)

In [None]:
# Per-class precision/recall/F1, then ROC AUC.
# pred_probs[:, 1] is the predicted probability of the class encoded as 1.
print(classification_report(y_test, preds))
print("AUC Score: {:.3f}".format(roc_auc_score(y_test, pred_probs[:, 1])))

## We definitely need to improve the recall for spam messages; the precision for ham messages can also be improved

## Trying a few more classifiers and optimizing hyperparameters

In [None]:
# Random forest on the same hold-out split. random_state pins the bootstrap
# sampling and feature sub-sampling so the report is reproducible across runs.
rf = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
pred_probs = rf.predict_proba(X_test)
print(classification_report(y_test, preds))

In [None]:
# 3-nearest-neighbours classifier on the same hold-out split.
# pred_probs is computed as well, but only the classification report is
# printed in this cell.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
preds = knn.predict(X_test)
pred_probs = knn.predict_proba(X_test)
print(classification_report(y_test, preds))

In [None]:
# Three-hidden-layer MLP on the same hold-out split. random_state fixes the
# weight initialisation and batch shuffling so the report is reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(128, 256, 512), random_state=42)
mlp.fit(X_train, y_train)
preds = mlp.predict(X_test)
pred_probs = mlp.predict_proba(X_test)
print(classification_report(y_test, preds))

## Let's take two of our better classifiers - MLP and Logistic Regression (we could also go for KNN) - and try to find the best hyperparameters for them before ensembling

In [None]:
# Optimizing hyperparameters for Logistic Regression classifier

# Search grid: inverse regularization strength C and the solver iteration cap.
lr_param_dict = {"C":[0.001, 0.01, 0.1, 1, 10],
                 "max_iter": [50, 100, 200, 500]
                 }
# One complete grid search is run per metric listed here.
scores = ['precision', 'recall']

# Using boiler plate code from Scikit-learn documentation
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # The macro-averaged variant of the metric is optimized; the CV strategy is
    # left at GridSearchCV's default and only the training split is searched.
    clf = GridSearchCV(
        LogisticRegression(), lr_param_dict, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # Mean and spread of the cross-validated score for every grid point;
    # the spread is reported as two standard deviations.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # clf.predict delegates to the best estimator found by the search,
    # evaluated here on the held-out test split.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

Best params for Logistic Regression classifier - C : 10, max_iter : 500

Skipping hyperparameter tuning for the MLP for now.

## Ensembling

In [None]:
# Soft-voting ensemble: averages the class probabilities predicted by the
# tuned logistic regression and the MLP. random_state fixes the MLP's weight
# initialisation so the ensemble's scores are reproducible across runs.
vc = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(C=10, max_iter=500)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(128, 256, 512), random_state=42)),
    ],
    voting='soft',
)
vc.fit(X_train, y_train)
preds = vc.predict(X_test)
pred_probs = vc.predict_proba(X_test)
print(classification_report(y_test, preds))

In [None]:
# ROC AUC of the ensemble, using the predicted probability of the class encoded as 1.
print("AUC Score for Ensemble: {:.3f}".format(roc_auc_score(y_test, pred_probs[:, 1])))

So, we have improved our recall a fair bit; optimizing the MLP hyperparameters should improve it further. We can also try KNN/SVC for further analysis.

Future Work - 
* Try a few more classifiers.
* Analyze important features/ feature selection.
* Error Analysis

Constructive criticism/suggestions are welcome.

Happy Kaggling!