#### Loading Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

## General Libraries
import pandas as pd
import numpy as np
import re

## modules for 
## stemming and lemmatization 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

## some sklearn modules
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
## Read the SMS spam CSV, keep only its first two columns
## and rename them (v1 -> Class, v2 -> Text)
df = (
    pd.read_csv(
        "../input/sms-spam-collection-dataset/spam.csv",
        sep=",",
        encoding='latin-1',
    )
    .iloc[:, :2]
    .rename(columns={"v1": "Class", "v2": "Text"})
)
df.head()

### Preprocessing

In [None]:
## Shared text normalizers used by the cleaning step:
## ps -> Porter stemmer, wn -> WordNet lemmatizer
ps, wn = PorterStemmer(), WordNetLemmatizer()

In [None]:
## A function to clean the text
def clean_text(st, process="lemma"):
    """
        clean_text: turns a raw message into a space-separated string of cleaned tokens

        Every non-letter character is replaced by a space, the text is
        lower-cased and split on whitespace. For "stem" / "lemma" the English
        stop words are dropped and each remaining token is stemmed / lemmatized.

        args:
            st - input string
            process - takes either "stem" or "lemma"
                    - stem performs stemming and lemma performs lemmatization
                    - any other value skips stop-word removal and normalization

        returns:
            the cleaned tokens joined by single spaces
    """
    ## "[^A-Za-z]" instead of "[^A-z]": the A-z range also spans the ASCII
    ## punctuation that sits between "Z" and "a", which would otherwise
    ## be kept as part of the tokens
    tokens = re.sub("[^A-Za-z]", " ", st).lower().split()

    if process == "stem":
        normalize = ps.stem
    elif process == "lemma":
        normalize = wn.lemmatize
    else:
        ## unknown mode: hand back the plain tokens untouched
        return " ".join(tokens)

    ## build the stop-word set once per call instead of once per token;
    ## a set also gives constant-time membership checks
    stop_words = set(stopwords.words("english"))

    return " ".join(normalize(wrd) for wrd in tokens if wrd not in stop_words)

## Add one cleaned-text column per normalization strategy;
## the keyword argument is forwarded by apply to clean_text
df["Stemmed"] = df.Text.apply(clean_text, process="stem")
df["Lemmatized"] = df.Text.apply(clean_text, process="lemma")

df.head()

In [None]:
## initializing an empty dict for storing metrics
## key = "<text column>_<classifier>", value = list of per-fold accuracies
metrics = {
    "stem_NB":[], "lemma_NB":[],
    "stem_SGD":[], "lemma_SGD":[],
}

## The target does not depend on the experiment, so it is encoded once:
## second dummy column of Class (presumably the spam indicator - verify
## against the labels present in the data)
y = pd.get_dummies(df.Class).iloc[:,1]

## Loop runs through all keys of metrics
## picks the text column accordingly
## trains, tests and records the evaluation metric
for key in metrics:
    if "stem" in key: X = df.Stemmed
    elif "lemma" in key: X = df.Lemmatized

    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X, y):
        ## skf.split yields positional indices, so select with .iloc
        ## instead of relying on the index labels being 0..n-1
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        ## the vocabulary is learned on the training fold only
        vect = CountVectorizer(max_features=2500)
        X_train_dtm = vect.fit_transform(X_train)
        X_test_dtm = vect.transform(X_test)

        if "NB" in key:
            clf = MultinomialNB()
        elif "SGD" in key:
            ## fixed seed so the SGD scores are reproducible between runs
            clf = SGDClassifier(random_state=42)

        clf.fit(X_train_dtm, y_train)
        y_pred = clf.predict(X_test_dtm)

        metrics[key].append(accuracy_score(y_test, y_pred))

    scores = np.array(metrics[key])
    print('Mean accuracy: ', np.mean(scores, axis=0))
    print('Std for accuracy: ', np.std(scores, axis=0))
    print(f"{key} ------------------------------------: complete\n")

In [None]:
## metrics received: one row per experiment, one column per CV fold
metricsDf = pd.DataFrame(metrics).T
metricsDf

In [None]:
## a heatmap of metrics: shades every accuracy cell of metricsDf
## using the "viridis" colormap
metricsDf.style.background_gradient(cmap="viridis")

#### Final Split

As per the above analysis, stemming works better than lemmatization for this dataset, and the Naive Bayes classifier performs better than SGD.

In [None]:
### Final Splitting of X,y
X, y = df.Stemmed, pd.get_dummies(df.Class).iloc[:,1]

## Only the last of the 5 stratified folds is used as the train/test split,
## so it is picked directly instead of looping over every fold and
## re-fitting the vectorizer each time just to keep the final result.
skf = StratifiedKFold(n_splits=5)
train_index, test_index = list(skf.split(X, y))[-1]

## skf.split yields positional indices, so select with .iloc
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

## vocabulary is learned on the training part only and reused for the test part
vect = CountVectorizer(max_features=2500)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [None]:
## Multinomial Naive Bayes fitted on the training document-term matrix
## (fit returns the estimator itself, so it can be chained)
clf = MultinomialNB().fit(X_train_dtm, y_train)
clf

In [None]:
## Training Scores: score of clf on the same data it was fitted on
clf.score(X_train_dtm, y_train)

In [None]:
## Cross validation Scores: 5-fold cross validation of clf on the training data
## NOTE(review): X_train_dtm was vectorized before this call, so the
## vocabulary has already seen every validation fold
cross_val_score(clf, X_train_dtm, y_train, cv=5)

### Hyperparameter Tuning

In [None]:
## log prior probabilities of the classes, as stored on the fitted clf
clf.class_log_prior_

In [None]:
## candidate smoothing values (alpha) to try
params = {"alpha": [0.01, 0.1, 0.5, 1.0, 10.0]}

## exhaustive 5-fold search over params for the classifier,
## using all available cores
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_dtm, y_train)

In [None]:
## parameter setting selected by the grid search
grid_search.best_params_

In [None]:
## cross-validated score obtained with the selected parameters
grid_search.best_score_

In [None]:
## the estimator built with the selected parameters
grid_search.best_estimator_

#### Evaluation

In [None]:
## score of the tuned estimator on the training data
grid_search.best_estimator_.score(X_train_dtm, y_train)

In [None]:
## score of the tuned estimator on the held-out test data
grid_search.best_estimator_.score(X_test_dtm, y_test)

In [None]:
### Final Testing with test data
## best_estimator_ was already refitted on (X_train_dtm, y_train) by
## GridSearchCV (refit=True is its default), so it is used as is
fin_clf = grid_search.best_estimator_
print(f"Test Scores: {fin_clf.score(X_test_dtm, y_test)}")

y_pred = fin_clf.predict(X_test_dtm)
## accuracy_score takes (y_true, y_pred), matching the call in the CV loop
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")