In [54]:
import pandas as pd
import csv
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Preprocessing steps

In [12]:
def split_into_lemmas(message):
    """Return the lemmas of the words in ``message`` as a list.

    The text is lower-cased, split into words by ``TextBlob``, and each
    word is replaced by its ``lemma`` attribute. Word order is preserved.

    Parameters
    ----------
    message : str
        Raw message text. Must support ``.lower()``.

    Returns
    -------
    list
        One lemma per word found by ``TextBlob``.
    """
    lowered = message.lower()
    lemmas = []
    for token in TextBlob(lowered).words:
        lemmas.append(token.lemma)
    return lemmas

In [26]:
# Load the training set: tab-separated, with quote characters treated as
# ordinary text (csv.QUOTE_NONE) rather than as field delimiters.
messages = pd.read_csv('train.csv', sep='\t', quoting=csv.QUOTE_NONE, index_col=False)

# Bag-of-words counts over lemmas. fit_transform learns the vocabulary and
# builds the count matrix in one pass, so each message goes through
# split_into_lemmas once instead of twice (a separate fit and transform
# would analyse the whole corpus two times).
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
messages_bow = bow_transformer.fit_transform(messages['message'])

# Re-weight the raw counts with TF-IDF. The fitted transformer object is
# kept in its own variable so it can be applied to other count matrices.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [39]:
# Read the validation split (tab-separated, quote characters taken literally).
valid = pd.read_csv(
    'validate.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)

# Only transform here: the vectoriser and the TF-IDF weights were fitted
# earlier and are reused unchanged on the validation messages.
valid_bow = bow_transformer.transform(valid['message'])
valid_tfidf = tfidf_transformer.transform(valid_bow)

# Naive Bayes Classifier

In [36]:
# Fit a multinomial Naive Bayes model (default settings) on the TF-IDF
# features of the training messages, using the 'label' column as target.
naive_bayes = MultinomialNB().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [60]:
# Predict with the Naive Bayes model on the training features and compare
# against the training labels.
all_predictions = naive_bayes.predict(X=messages_tfidf)
print(
    "Training accuracy:",
    accuracy_score(y_true=messages['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Training confusion matrix:\n",
    classification_report(y_true=messages['label'], y_pred=all_predictions),
)

Training accuracy: 0.9641117662137914
Training confusion matrix:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3380
        spam       1.00      0.73      0.84       521

    accuracy                           0.96      3901
   macro avg       0.98      0.87      0.91      3901
weighted avg       0.97      0.96      0.96      3901



In [61]:
# Predict with the Naive Bayes model on the validation features and compare
# against the validation labels.
all_predictions = naive_bayes.predict(X=valid_tfidf)
print(
    "Validation accuracy:",
    accuracy_score(y_true=valid['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Validation confusion matrix:\n",
    classification_report(y_true=valid['label'], y_pred=all_predictions),
)


Validation accuracy: 0.961768219832736
Validation confusion matrix:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       725
        spam       1.00      0.71      0.83       112

    accuracy                           0.96       837
   macro avg       0.98      0.86      0.91       837
weighted avg       0.96      0.96      0.96       837



# Logistic Regression

In [62]:
# Fit a logistic regression model (default settings) on the TF-IDF
# features of the training messages, using the 'label' column as target.
logistic_reg = LogisticRegression().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [63]:
# Predict with the logistic regression model on the training features and
# compare against the training labels.
all_predictions = logistic_reg.predict(X=messages_tfidf)
print(
    "Training accuracy:",
    accuracy_score(y_true=messages['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Training confusion matrix:\n",
    classification_report(y_true=messages['label'], y_pred=all_predictions),
)

Training accuracy: 0.9751345808766982
Training confusion matrix:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3380
        spam       1.00      0.82      0.90       521

    accuracy                           0.98      3901
   macro avg       0.99      0.91      0.94      3901
weighted avg       0.98      0.98      0.97      3901



In [64]:
# Predict with the logistic regression model on the validation features and
# compare against the validation labels.
all_predictions = logistic_reg.predict(X=valid_tfidf)
print(
    "Validation accuracy:",
    accuracy_score(y_true=valid['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Validation confusion matrix:\n",
    classification_report(y_true=valid['label'], y_pred=all_predictions),
)

Validation accuracy: 0.970131421744325
Validation confusion matrix:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       725
        spam       0.97      0.80      0.88       112

    accuracy                           0.97       837
   macro avg       0.97      0.90      0.93       837
weighted avg       0.97      0.97      0.97       837



# Support Vector Machines

In [65]:
# Fit a support vector classifier (default settings) on the TF-IDF
# features of the training messages, using the 'label' column as target.
support_vec = SVC().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [66]:
# Predict with the support vector classifier on the training features and
# compare against the training labels.
all_predictions = support_vec.predict(X=messages_tfidf)
print(
    "Training accuracy:",
    accuracy_score(y_true=messages['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Training confusion matrix:\n",
    classification_report(y_true=messages['label'], y_pred=all_predictions),
)

Training accuracy: 0.9979492437836452
Training confusion matrix:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3380
        spam       1.00      0.98      0.99       521

    accuracy                           1.00      3901
   macro avg       1.00      0.99      1.00      3901
weighted avg       1.00      1.00      1.00      3901



In [67]:
# Predict with the support vector classifier on the validation features and
# compare against the validation labels.
all_predictions = support_vec.predict(X=valid_tfidf)
print(
    "Validation accuracy:",
    accuracy_score(y_true=valid['label'], y_pred=all_predictions),
)
# NOTE(review): the label says "confusion matrix", but the value printed is
# the classification_report text (per-class precision / recall / F1), not
# the output of confusion_matrix.
print(
    "Validation confusion matrix:\n",
    classification_report(y_true=valid['label'], y_pred=all_predictions),
)

Validation accuracy: 0.985663082437276
Validation confusion matrix:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       725
        spam       0.99      0.90      0.94       112

    accuracy                           0.99       837
   macro avg       0.99      0.95      0.97       837
weighted avg       0.99      0.99      0.99       837

