In [25]:
import numpy as np
import pandas as pd
import spam_classifier as spm

In [26]:
# Load the training split and the dictionary that is later passed to the
# feature-construction and top-words helpers.
TRAIN_PATH = "data/ds6_train.tsv"
DICTIONARY_PATH = "dictionary.json"

training_df: pd.DataFrame = spm.load_data(TRAIN_PATH)
dictionary: dict = spm.load_dictionary(DICTIONARY_PATH)

# Split the frame into the raw message text and the label vector.
messages: pd.Series = training_df["text"]
labels: np.ndarray = training_df["label"].to_numpy()

In [27]:
# The training feature matrix is cached on disk so it does not have to be
# rebuilt on every run. Load the cache when it exists; otherwise (or when the
# cached matrix no longer has one row per training message) rebuild and re-save.
FEATURES_CACHE_PATH = "features_training_matrix.npy"
try:
    design_matrix: np.ndarray = np.load(FEATURES_CACHE_PATH)
    cache_is_valid = design_matrix.shape[0] == len(training_df)
except FileNotFoundError:
    # Fresh checkout: the cache has never been written.
    cache_is_valid = False

if not cache_is_valid:
    # Read the text straight from training_df rather than `messages`, so a
    # re-run of this cell always featurises the training split.
    design_matrix = spm.construct_matrix_data(training_df["text"], dictionary)
    np.save(FEATURES_CACHE_PATH, design_matrix)

In [28]:
# Sanity check: the label vector and the feature matrix should agree on row count.
labels_shape = labels.shape
features_shape = design_matrix.shape
print(f"labels have a shape {labels_shape} and the feature matrix has a shape {features_shape}")

labels have a shape (4457,) and the feature matrix has a shape (4457, 1385)


In [29]:
# Fit naive Bayes on the training features; a, b, c are the fitted parameters
# that the prediction and top-words helpers take as input.
a, b, c = spm.fit_naive_bayes(design_matrix, labels)
training_predictions = spm.make_prediction_nb(design_matrix, a, b, c)

# Accuracy = 1 - (sum of absolute prediction/label differences) / (number of predictions).
training_error_count = np.sum(np.abs(training_predictions - labels))
training_accuracy = 1 - training_error_count/len(training_predictions)
print(f"training accuracy is {training_accuracy}")

training accuracy is 0.9874354947273951


In [30]:
# Now let's see how well the fitted model does on the validation set.
valid_df: pd.DataFrame = spm.load_data("data/ds6_val.tsv")
valid_labels: np.ndarray = valid_df["label"].to_numpy()
# Use a split-specific name: rebinding the shared `messages` variable here would
# silently replace the training messages bound earlier in the notebook.
valid_messages: pd.Series = valid_df["text"]
valid_features_matrix = spm.construct_matrix_data(valid_messages, dictionary)
# a, b, c are the parameters fitted on the training split.
valid_predictions = spm.make_prediction_nb(valid_features_matrix, a, b, c)
valid_accuracy = 1 - np.sum(np.abs(valid_predictions - valid_labels))/len(valid_predictions)
print(f"valid set accuracy is {valid_accuracy}")

valid set accuracy is 0.9910233393177738


In [31]:
# The validation set also looks good, so evaluate on the held-out test set as well.
test_df: pd.DataFrame = spm.load_data("data/ds6_test.tsv")
test_labels: np.ndarray = test_df["label"].to_numpy()
# Use a split-specific name: rebinding the shared `messages` variable here would
# silently replace the messages bound by earlier cells.
test_messages: pd.Series = test_df["text"]
test_features_matrix = spm.construct_matrix_data(test_messages, dictionary)
# a, b, c are the parameters fitted on the training split.
test_predictions = spm.make_prediction_nb(test_features_matrix, a, b, c)
test_accuracy = 1 - np.sum(np.abs(test_predictions - test_labels))/len(test_predictions)
print(f"test set accuracy is {test_accuracy}")

test set accuracy is 0.989247311827957


In [32]:
# Ask the helper for the five dictionary words it ranks as most indicative of spam.
top_spam_words = spm.top_5_words_nb(a, b, dictionary)
print(f"the top 5 words that correspond to spam mail are: {top_spam_words}")

the top 5 words that correspond to spam mail are: ['won', 'prize', 'uk', 'claim', 'cs']


In [33]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the naive Bayes test predictions.
# Rows are true labels and columns are predicted labels, in sorted label order
# (presumably 0 = non-spam, 1 = spam; confirm against the dataset).
nb_test_confusion = confusion_matrix(test_labels, test_predictions)
nb_test_confusion

array([[490,   1],
       [  5,  62]])

1 false positive, 62 true positives

5 false negatives (i.e. undetected spam) and 490 true negatives


In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Comparison model: standardise the features, then fit an RBF-kernel SVM.
feature_scaler = StandardScaler()
rbf_svm = SVC(kernel="rbf", C=1.0, gamma="scale")
clf = make_pipeline(feature_scaler, rbf_svm)

train_labels: np.ndarray = training_df["label"].to_numpy()
# Leave fit() as the last expression so the notebook displays the fitted pipeline.
clf.fit(design_matrix, train_labels)

0,1,2
,steps,"[('standardscaler', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [35]:
# Mean accuracy of the fitted SVM pipeline on the data it was trained on.
svm_training_accuracy = clf.score(design_matrix, train_labels)
print("Accuracy on training set:", svm_training_accuracy)

Accuracy on training set: 0.9952883105227731


In [37]:
# Mean accuracy of the fitted SVM pipeline on the held-out test features.
svm_test_accuracy = clf.score(test_features_matrix, test_labels)
print("Accuracy on the test set:", svm_test_accuracy)

Accuracy on the test set: 0.9802867383512545


In [None]:
# Accuracy alone does not show which class the errors fall in, so look at the
# confusion matrix of the SVM test predictions (probably the better indicator).
test_pred = clf.predict(test_features_matrix)
svm_test_confusion = confusion_matrix(test_labels, test_pred)
svm_test_confusion

array([[491,   0],
       [ 11,  56]])

With the SVM, spam is roughly twice as likely to go undetected as with naive Bayes (11 vs. 5 missed spam messages), BUT at least non-spam is never misclassified.
The SVM would probably perform better with some hyperparameter tuning, but it doesn't look worth pursuing (marginal gains).