# Get and Explore Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

Take a look at the data we have available to get a better understanding of what we are dealing with

In [None]:
# Location of the labelled e-mail CSV inside the Kaggle input directory.
dataset_path = '/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv'
dataset = pd.read_csv(dataset_path)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
dataset.info()

In [None]:
# Preview the first rows of the frame.
dataset.head()

> Drop the rows that contain missing values, since they would be problematic for training

In [None]:
# Discard every row with a missing value in any column; the cleaned frame
# is rebound to the same name, so later cells only see complete rows.
dataset = dataset.dropna()

> Split the data into train and test sets. We will use the train set for fitting our model to the data and then use the test set to make our predictions and see how well the model generalizes

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for the final evaluation.
# - random_state pins the shuffle so a fresh "Restart & Run All" reproduces
#   the same split (and therefore the same scores further down).
# - stratify keeps the class ratio of `label` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.email,
    dataset.label,
    test_size=0.1,
    random_state=42,
    stratify=dataset.label,
)

In [None]:
# Show the size of each partition, one shape per line.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Feature Engineering

> We create a custom sklearn transformer that performs stemming (the process of reducing inflected, or sometimes derived, words to their word stem, base, or root form) using an NLTK stemmer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.porter import *

class EmailStemming(BaseEstimator, TransformerMixin):
    """Stateless transformer: split each e-mail on whitespace and stem every token."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return one list of Porter-stemmed tokens per e-mail in ``X``.

        ``X`` is an iterable of strings; the result is a list of lists of
        strings in the same order.
        """
        porter = PorterStemmer()
        return [
            [porter.stem(token) for token in message.split()]
            for message in X
        ]

> This custom transformer builds a corpus from the most frequent words across all emails, and then creates a vector for each email that counts how many times each corpus word appears in that email

> For example:
> If the corpus contained [best, the, is, big] the email "He is the best of the best" would be vector [2, 2, 1, 0]

In [None]:
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureVectors(BaseEstimator, TransformerMixin):
    """Bag-of-words vectoriser for e-mails that are already split into tokens.

    Parameters
    ----------
    corpus : sequence of (token, count) pairs, optional
        Fixed vocabulary to use. Only the first element of each pair is read.
        When ``None`` (default) the vocabulary is learned in ``fit``.
    corpus_len : int, default=100
        How many of the most frequent tokens to keep when the vocabulary is
        learned from the data.

    Attributes
    ----------
    corpus_ : list of (token, count) pairs
        Vocabulary in effect after ``fit``. The constructor argument
        ``corpus`` is never overwritten, so refitting on new data (or cloning
        the estimator) recomputes the vocabulary instead of silently reusing
        the one from the previous fit.
    """

    def __init__(self, corpus=None, corpus_len=100):
        self.corpus = corpus
        self.corpus_len = corpus_len

    def fit(self, X, y=None):
        """Learn (or adopt) the vocabulary.

        ``X`` is an iterable of token sequences. ``y`` is ignored.
        Returns ``self``.
        """
        # `is None`, not `== None`: an array-like corpus would otherwise be
        # compared element-wise and fail the truth test.
        if self.corpus is None:
            token_counts = Counter(token for email in X for token in email)
            self.corpus_ = token_counts.most_common(self.corpus_len)
        else:
            self.corpus_ = list(self.corpus)
        return self

    def transform(self, X):
        """Count, for every e-mail in ``X``, the occurrences of each vocabulary token.

        Returns an integer array of shape ``(n_emails, n_vocabulary)``.
        Raises ``NotFittedError`` when no vocabulary is available.
        """
        # Fall back to a user-supplied corpus so that transform() still works
        # without a prior fit() when the vocabulary was given up front.
        corpus = getattr(self, "corpus_", self.corpus)
        if corpus is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "FeatureVectors has no vocabulary yet; call fit() first or "
                "pass `corpus` to the constructor."
            )

        vocabulary = [token for token, _ in corpus]
        rows = []
        for email in X:
            # One pass over the e-mail instead of one list.count() per
            # vocabulary token; a missing token reads as 0.
            counts = Counter(email)
            rows.append([counts[token] for token in vocabulary])
        # The reshape keeps the result 2-D even when X is empty.
        return np.array(rows, dtype=int).reshape(len(rows), len(vocabulary))

> We create a data processing pipeline, that will apply our custom transformers to the data passed to it

In [None]:
from sklearn.pipeline import Pipeline

# Vocabulary size used by the bag-of-words step.
corpus_len = 1000

# Stem first, then turn the stemmed tokens into count vectors.
pipeline_steps = [
    ('email_stemming', EmailStemming()),
    ('feature_vectors', FeatureVectors(corpus_len=corpus_len)),
]
preprocess_pipeline = Pipeline(steps=pipeline_steps)

# Fit the pipeline on the training e-mails and vectorise them in one go.
X_train_proc = preprocess_pipeline.fit_transform(X_train, y_train)

In [None]:
# Display the shapes of the processed features and of the labels side by side.
X_train_proc.shape, y_train.shape

# Train and Select Models

> Let's try some different machine learning algorithms to see which one fits our training dataset best

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Support-vector classifier scored with 10-fold cross-validation.
svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(
    estimator=svm_clf,
    X=X_train_proc,
    y=y_train,
    cv=10,
)
np.mean(svm_scores)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# L2-regularised logistic regression scored with 10-fold cross-validation.
log_clf = LogisticRegression(max_iter=1000, penalty='l2', fit_intercept=True)
log_scores = cross_val_score(
    estimator=log_clf,
    X=X_train_proc,
    y=y_train,
    cv=10,
)
np.mean(log_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest scored with 10-fold cross-validation.
# random_state pins the bootstrap sampling and feature sub-sampling, so the
# mean score is the same on every "Restart & Run All".
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_proc, y_train, cv=10)
forest_scores.mean()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (library defaults) scored with 10-fold cross-validation.
knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(
    estimator=knn_clf,
    X=X_train_proc,
    y=y_train,
    cv=10,
)
np.mean(knn_scores)

# Training Visualization

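> We train the model on progressively larger subsets of the training data to get a sense of its learning progress. The blue line is the F1 score on the validation set and the red line is the F1 score on the training subset. As we can see, the validation F1 score starts at around 40% and gradually improves until it converges with the red line near the end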

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y, start=20, step=10):
    """Plot train and validation F1 as the training subset grows.

    The data is split 80/20 (fixed seed) into a training and a validation
    part. The model is then refitted on the first ``start``,
    ``start + step``, ``start + 2 * step``, ... training rows, and the F1
    score (in percent) on that subset and on the whole validation part is
    recorded after each fit.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted in place at every step; it is left fitted on the last subset.
    X, y : features and binary labels, sliceable with ``[:m]``.
    start : int, default=20
        Size of the first training subset.
    step : int, default=10
        Number of rows added to the subset at each step.

    The curves are drawn on the current matplotlib axes; nothing is returned
    and ``plt.show()`` is left to the caller. The x-axis is the step index
    (0, 1, 2, ...), not the raw subset size.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)

    # These are scores (higher is better), hence the names.
    train_scores, val_scores = [], []
    for m in range(start, len(X_train), step):
        model.fit(X_train[:m], y_train[:m])
        train_scores.append(f1_score(y_train[:m], model.predict(X_train[:m])) * 100)
        val_scores.append(f1_score(y_val, model.predict(X_val)) * 100)

    plt.plot(train_scores, "r-+", linewidth=2, label="train")
    plt.plot(val_scores, "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    # Spell out how a step index maps to a subset size so the figure can be
    # read on its own.
    plt.xlabel(f"Training step (subset size = {start} + {step} x step)", fontsize=14)
    plt.ylabel("F1 (%)", fontsize=14)

In [None]:
# Start from a fresh, unfitted logistic-regression model for the curve.
log_clf = LogisticRegression(max_iter=1000, C=1.0, penalty='l2', fit_intercept=True)
plot_learning_curves(model=log_clf, X=X_train_proc, y=y_train)

# Restrict the view to x in [0, 100] and y in [0, 100].
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.show()

# Make predictions

> Logistic regression fits the training set well, so we will use it to make predictions on the test set after passing the test set through the processing pipeline

In [None]:
# The pipeline is already fitted on the training split, so the test e-mails
# are only transformed here.
X_test_proc = preprocess_pipeline.transform(X_test)

# Refit on the full training split, then predict the held-out e-mails.
log_clf = LogisticRegression(max_iter=1000, C=1.0, penalty='l2', fit_intercept=True)
y_pred = log_clf.fit(X_train_proc, y_train).predict(X_test_proc)

F1 score and Accuracy

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order. F1 and accuracy
# happen to give the same value either way, but the correct order keeps the
# cell right if an asymmetric metric (precision, recall) is added later.
print(f'F1 score: {f1_score(y_test, y_pred)}')
print(f'Accuracy score: {accuracy_score(y_test, y_pred)}')

# Thank you

Thank you very much for reading. Consider upvoting if you found something useful in this notebook