In [59]:
import pandas as pd
import numpy as np
import csv
import mlflow
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Preprocessing steps

In [60]:
def split_into_lemmas(message):
    """Tokenize a message and reduce each token to its lemma.

    The text is lower-cased, split into words by TextBlob, and every word
    is replaced by its ``lemma`` attribute.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        One lemma per TextBlob word, in order of appearance.
    """
    blob = TextBlob(message.lower())
    lemmas = []
    for token in blob.words:
        lemmas.append(token.lemma)
    return lemmas

In [61]:
def eval_metrics(actual, pred):
    """Score predicted labels against the true labels, treating 'spam' as positive.

    Parameters
    ----------
    actual : array-like
        True labels.
    pred : array-like
        Predicted labels, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(acc, f1, auc)``: accuracy, F1 for the 'spam' class, and ROC AUC
        computed on the boolean "is spam" indicators of both inputs.
    """
    # Coerce to arrays so `== 'spam'` is always element-wise; on a plain
    # list or tuple the comparison would collapse to a single False.
    actual = np.asarray(actual)
    pred = np.asarray(pred)

    acc = accuracy_score(actual, pred)
    f1 = f1_score(actual, pred, pos_label='spam')
    # Both arguments are boolean indicators (hard labels), not scores.
    auc = roc_auc_score(actual == 'spam', pred == 'spam')
    return acc, f1, auc


# For dataset generated by seed = 42

In [62]:
# Load the tab-separated training split; QUOTE_NONE keeps quote characters
# inside messages as literal text.
messages = pd.read_csv('data/train.csv', sep='\t', quoting=csv.QUOTE_NONE, index_col=False)

# Bag-of-words counts over lemmas. fit_transform learns the vocabulary and
# builds the count matrix together, so the analyzer is not run over the
# corpus a second time as it is with fit(...) followed by transform(...).
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
messages_bow = bow_transformer.fit_transform(messages['message'])

# Re-weight the counts with TF-IDF; the fitted transformer is kept so other
# data can be projected with the same weights.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

KeyboardInterrupt: 

In [None]:
# Load the validation split and project it through the already-fitted
# bag-of-words and TF-IDF transformers (transform only, no re-fitting).
valid = pd.read_csv(
    'data/validate.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)
valid_bow = bow_transformer.transform(valid['message'])
valid_tfidf = tfidf_transformer.transform(valid_bow)

## Naive Bayes classifier

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
naive_bayes = MultinomialNB().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the Naive Bayes model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = naive_bayes.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    metric_names = ("acc", "f1", "auc")  # same order as eval_metrics' return tuple
    # Record every metric on the run first ...
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        mlflow.log_metric(metric_name, metric_value)
    # ... then echo them to the cell output.
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        print(metric_name + ":", metric_value)
    # Store the fitted classifier as an artifact of this run.
    mlflow.sklearn.log_model(naive_bayes, "NB_42")

acc: 0.961768219832736
f1: 0.8333333333333333
auc: 0.8571428571428572


## Logistic Regression

In [None]:
# Fit a logistic regression classifier (default settings) on the TF-IDF training features.
logistic_reg = LogisticRegression().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the logistic regression model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = logistic_reg.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    metric_names = ("acc", "f1", "auc")  # same order as eval_metrics' return tuple
    # Record every metric on the run first ...
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        mlflow.log_metric(metric_name, metric_value)
    # ... then echo them to the cell output.
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        print(metric_name + ":", metric_value)
    # Store the fitted classifier as an artifact of this run.
    mlflow.sklearn.log_model(logistic_reg, "Log_reg_42")

acc: 0.970131421744325
f1: 0.878048780487805
auc: 0.899716748768473


## Support Vector Machines

In [None]:
# Fit a support vector classifier (default settings) on the TF-IDF training features.
support_vec = SVC().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the SVM model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = support_vec.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    metric_names = ("acc", "f1", "auc")  # same order as eval_metrics' return tuple
    # Record every metric on the run first ...
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        mlflow.log_metric(metric_name, metric_value)
    # ... then echo them to the cell output.
    for metric_name, metric_value in zip(metric_names, valid_metrics):
        print(metric_name + ":", metric_value)
    # Store the fitted classifier as an artifact of this run.
    mlflow.sklearn.log_model(support_vec, "SVM_42")

acc: 0.985663082437276
f1: 0.9439252336448598
auc: 0.9502032019704434


# For dataset generated by seed = 69

In [63]:
!git pull data.dvc

fatal: invalid gitfile format: data.dvc
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [None]:
# Load the tab-separated training split; QUOTE_NONE keeps quote characters
# inside messages as literal text.
messages = pd.read_csv('data/train.csv', sep='\t', quoting=csv.QUOTE_NONE, index_col=False)

# Bag-of-words counts over lemmas. fit_transform learns the vocabulary and
# builds the count matrix together, so the analyzer is not run over the
# corpus a second time as it is with fit(...) followed by transform(...).
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
messages_bow = bow_transformer.fit_transform(messages['message'])

# Re-weight the counts with TF-IDF; the fitted transformer is kept so other
# data can be projected with the same weights.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [None]:
# Load the validation split and project it through the already-fitted
# bag-of-words and TF-IDF transformers (transform only, no re-fitting).
valid = pd.read_csv(
    'data/validate.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)
valid_bow = bow_transformer.transform(valid['message'])
valid_tfidf = tfidf_transformer.transform(valid_bow)

## Naive Bayes classifier

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
naive_bayes = MultinomialNB().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the Naive Bayes model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = naive_bayes.predict(valid_tfidf)
    # valid_metrics is the (acc, f1, auc) tuple returned by eval_metrics.
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    mlflow.log_metric("acc", valid_metrics[0])
    mlflow.log_metric("f1", valid_metrics[1])
    mlflow.log_metric("auc", valid_metrics[2])
    print("acc:", valid_metrics[0])
    print("f1:", valid_metrics[1])
    print("auc:", valid_metrics[2])
    # This section covers the seed-69 dataset, so the artifact is named for
    # seed 69 rather than reusing the seed-42 name.
    mlflow.sklearn.log_model(naive_bayes, "NB_69")

acc: 0.961768219832736
f1: 0.8333333333333333
auc: 0.8571428571428572


## Logistic Regression

In [None]:
# Fit a logistic regression classifier (default settings) on the TF-IDF training features.
logistic_reg = LogisticRegression().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the logistic regression model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = logistic_reg.predict(valid_tfidf)
    # valid_metrics is the (acc, f1, auc) tuple returned by eval_metrics.
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    mlflow.log_metric("acc", valid_metrics[0])
    mlflow.log_metric("f1", valid_metrics[1])
    mlflow.log_metric("auc", valid_metrics[2])
    print("acc:", valid_metrics[0])
    print("f1:", valid_metrics[1])
    print("auc:", valid_metrics[2])
    # This section covers the seed-69 dataset, so the artifact is named for
    # seed 69 rather than reusing the seed-42 name.
    mlflow.sklearn.log_model(logistic_reg, "Log_reg_69")

acc: 0.970131421744325
f1: 0.878048780487805
auc: 0.899716748768473


## Support Vector Machines

In [None]:
# Fit a support vector classifier (default settings) on the TF-IDF training features.
support_vec = SVC().fit(
    X=messages_tfidf,
    y=messages['label'],
)

In [None]:
# Evaluate the SVM model on the validation split inside its own MLflow run.
with mlflow.start_run():
    all_predictions = support_vec.predict(valid_tfidf)
    # valid_metrics is the (acc, f1, auc) tuple returned by eval_metrics.
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    mlflow.log_metric("acc", valid_metrics[0])
    mlflow.log_metric("f1", valid_metrics[1])
    mlflow.log_metric("auc", valid_metrics[2])
    print("acc:", valid_metrics[0])
    print("f1:", valid_metrics[1])
    print("auc:", valid_metrics[2])
    # This section covers the seed-69 dataset, so the artifact is named for
    # seed 69 rather than reusing the seed-42 name.
    mlflow.sklearn.log_model(support_vec, "SVM_69")

acc: 0.985663082437276
f1: 0.9439252336448598
auc: 0.9502032019704434
