In [1]:
import os

import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from src.Engine import Engine

ROOT = os.getcwd()
DATAPATH = os.path.join(ROOT,"data")
email_eng = Engine()
df, vocab = email_eng.run(DATAPATH)

cnt_vec = CountVectorizer()
X = cnt_vec.fit_transform(df["message"])
y = df["spam"]
vocab = cnt_vec.vocabulary_
rev = {j:i for i,j in vocab.items()}
nb = MultinomialNB()
param_grid = {"alpha": np.logspace(-2,1)}
grid = GridSearchCV(nb, param_grid=param_grid, scoring = "f1", return_train_score=True, n_jobs=-2)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [2]:
#grid.fit(X_train.toarray(), y_train)

In [4]:
sparse_grid = GridSearchCV(nb, param_grid=param_grid, scoring = "f1", return_train_score=True)
sparse_grid.fit(X_train, y_train)

In [5]:
#dense_pred = grid.predict(X_test.toarray())
sparse_pred = sparse_grid.predict(X_test)

In [6]:
from sklearn.metrics import classification_report
#print(classification_report(y_test, dense_pred))

              precision    recall  f1-score   support

       False       0.99      1.00      0.99      1301
        True       1.00      0.96      0.98       421

    accuracy                           0.99      1722
   macro avg       0.99      0.98      0.99      1722
weighted avg       0.99      0.99      0.99      1722



In [7]:
print(classification_report(y_test, sparse_pred))

              precision    recall  f1-score   support

       False       0.99      1.00      0.99      1301
        True       1.00      0.96      0.98       421

    accuracy                           0.99      1722
   macro avg       0.99      0.98      0.99      1722
weighted avg       0.99      0.99      0.99      1722



In [10]:
#import joblib
#joblib.dump(grid,"dense_multinomialNB_model.joblib")
#joblib.dump(sparse_grid, "sparse_multinomialNB_model.joblib")

['sparse_multinomialNB_model.joblib']

In [26]:
print(sparse_grid.cv_results_["mean_test_score"])


[0.97663106 0.97663106 0.97728765 0.97761917 0.97794641 0.97794641
 0.97794641 0.97827685 0.97762977 0.97765085 0.97831276 0.97831276
 0.97831276 0.97864321 0.97864321 0.97897256 0.97863563 0.97830519
 0.97830519 0.97830519 0.97797368 0.97797368 0.9773139  0.97698617
 0.9773242  0.97666444 0.97599927 0.9756527  0.9756527  0.97497879
 0.97396903 0.9736436  0.97161203 0.97062167 0.97025922 0.96884466
 0.96717358 0.96381054 0.95620921 0.94752494 0.94081585 0.93072989
 0.91701432 0.8991476  0.87726466 0.84612815 0.80463558 0.75721275
 0.70748101 0.64953041]


In [27]:
print(sparse_grid.cv_results_["mean_train_score"])

[0.99840701 0.99840701 0.99840701 0.99840701 0.99840701 0.99824752
 0.99824752 0.99824752 0.99824752 0.99824752 0.99824752 0.99800813
 0.99800813 0.99784845 0.99768864 0.99752883 0.99720903 0.99704909
 0.99672903 0.99648875 0.99624827 0.99584868 0.99536741 0.99496598
 0.99472511 0.99424243 0.99367868 0.99311641 0.99247199 0.99126252
 0.99053908 0.98997113 0.98916238 0.9882732  0.98681205 0.98485292
 0.98297007 0.9792689  0.97428563 0.9692503  0.96185763 0.95262053
 0.94037001 0.92591132 0.90449552 0.88007968 0.84387874 0.7898306
 0.73840565 0.68416461]
