In [1]:
import pandas as pd
import numpy as np
import csv
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Preprocessing steps

In [4]:
def split_into_lemmas(message):
    """Tokenize a message and return the lemma of every word.

    The text is lower-cased first, then split into words by TextBlob;
    each word's ``lemma`` attribute is collected in order.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        One lemma per word found by TextBlob, in the original word order.
    """
    blob = TextBlob(message.lower())
    lemmas = []
    for token in blob.words:
        lemmas.append(token.lemma)
    return lemmas

In [6]:
def eval_metrics(actual, pred, pos_label='spam', scores=None):
    """Compute accuracy, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``actual``.
    pos_label : default 'spam'
        Label treated as the positive class for F1 and AUC.
    scores : array-like, optional
        Continuous scores for the positive class (for example a probability
        column). When given, AUC is computed from these scores; when omitted,
        AUC is computed from the hard predictions, as before.

    Returns
    -------
    tuple
        ``(acc, f1, auc)``.
    """
    # Coerce to arrays so the `== pos_label` comparisons below are always
    # element-wise; on a plain Python list `==` would yield a single bool.
    actual = np.asarray(actual)
    pred = np.asarray(pred)

    acc = accuracy_score(actual, pred)
    f1 = f1_score(actual, pred, pos_label=pos_label)

    is_positive = actual == pos_label
    if scores is None:
        # AUC over hard 0/1 predictions (original behaviour).
        ranking = pred == pos_label
    else:
        ranking = np.asarray(scores)
    auc = roc_auc_score(is_positive, ranking)
    return acc, f1, auc


# For dataset generated by seed = 42

In [7]:
# Load the training split (tab-separated, no quote handling).
messages = pd.read_csv(
    'data/train.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)

# Bag-of-words counts using the lemma tokenizer; vocabulary learned from the training messages.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(messages['message'])
messages_bow = bow_transformer.transform(messages['message'])

# TF-IDF weighting fitted on the training counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

In [8]:
# Load the validation split and push it through the transformers fitted on the
# training data (transform only, no re-fitting).
valid = pd.read_csv(
    'data/validate.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)
valid_bow = bow_transformer.transform(valid['message'])
valid_tfidf = tfidf_transformer.transform(valid_bow)

## Naive Bayes classifier

In [7]:
# Multinomial Naive Bayes trained on the TF-IDF features of the training split.
naive_bayes = MultinomialNB()
naive_bayes = naive_bayes.fit(messages_tfidf, messages['label'])

In [14]:
with mlflow.start_run():
    # Score the validation split with the fitted Naive Bayes model.
    all_predictions = naive_bayes.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(naive_bayes, "NB_42")

acc: 0.961768219832736
f1: 0.8333333333333333
auc: 0.8571428571428572


## Logistic Regression

In [15]:
# Logistic regression (default settings) trained on the TF-IDF training features.
logistic_reg = LogisticRegression()
logistic_reg = logistic_reg.fit(messages_tfidf, messages['label'])

In [16]:
with mlflow.start_run():
    # Score the validation split with the fitted logistic regression model.
    all_predictions = logistic_reg.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(logistic_reg, "Log_reg_42")

acc: 0.970131421744325
f1: 0.878048780487805
auc: 0.899716748768473


## Support Vector Machines

In [17]:
# Support vector classifier with probability estimates enabled, trained on the
# TF-IDF training features.
support_vec = SVC(probability=True)
support_vec = support_vec.fit(messages_tfidf, messages['label'])

In [18]:
with mlflow.start_run():
    # Score the validation split with the fitted support vector classifier.
    all_predictions = support_vec.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(support_vec, "SVM_42")

acc: 0.985663082437276
f1: 0.9439252336448598
auc: 0.9502032019704434


# For dataset generated by seed = 69

In [63]:
!git pull data.dvc

fatal: invalid gitfile format: data.dvc
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [10]:
# Reload the training split from disk (tab-separated, no quote handling).
messages = pd.read_csv(
    'data/train.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)

# Re-learn the bag-of-words vocabulary on this version of the training messages.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(messages['message'])
messages_bow = bow_transformer.transform(messages['message'])

# Re-fit the TF-IDF weighting on the new counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

In [11]:
# Reload the validation split and transform it with the transformers fitted on
# the current training data (transform only, no re-fitting).
valid = pd.read_csv(
    'data/validate.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index_col=False,
)
valid_bow = bow_transformer.transform(valid['message'])
valid_tfidf = tfidf_transformer.transform(valid_bow)

## Naive Bayes classifier

In [29]:
# Multinomial Naive Bayes re-trained on the current TF-IDF training features.
naive_bayes = MultinomialNB()
naive_bayes = naive_bayes.fit(messages_tfidf, messages['label'])

In [30]:
with mlflow.start_run():
    # Score the validation split with the fitted Naive Bayes model.
    all_predictions = naive_bayes.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(naive_bayes, "NB_69")

acc: 0.9414575866188769
f1: 0.72
auc: 0.78125


## Logistic Regression

In [31]:
# Logistic regression (default settings) re-trained on the current TF-IDF training features.
logistic_reg = LogisticRegression()
logistic_reg = logistic_reg.fit(messages_tfidf, messages['label'])

In [32]:
with mlflow.start_run():
    # Score the validation split with the fitted logistic regression model.
    all_predictions = logistic_reg.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(logistic_reg, "Log_reg_69")

acc: 0.96415770609319
f1: 0.8484848484848485
auc: 0.8736206896551725


## Support Vector Machines

In [13]:
# Support vector classifier with probability estimates enabled, re-trained on
# the current TF-IDF training features.
support_vec = SVC(probability=True)
support_vec = support_vec.fit(messages_tfidf, messages['label'])

In [14]:
with mlflow.start_run():
    # Score the validation split with the fitted support vector classifier.
    all_predictions = support_vec.predict(valid_tfidf)
    valid_metrics = eval_metrics(valid['label'], all_predictions)
    acc, f1, auc = valid_metrics

    # Record all three metrics on the active run, then echo them.
    for key, value in (("acc", acc), ("f1", f1), ("auc", auc)):
        mlflow.log_metric(key, value)
    for label, value in (("acc:", acc), ("f1:", f1), ("auc:", auc)):
        print(label, value)

    # Store the fitted classifier as a run artifact.
    mlflow.sklearn.log_model(support_vec, "SVM_69")

acc: 0.9713261648745519
f1: 0.8823529411764706
auc: 0.9004064039408868




In [9]:
# After comparing the runs in the MLflow UI, reload the best model (the SVM
# logged as "SVM_69") and evaluate it once on the held-out test split.
# NOTE(review): the run id is hard-coded and the tracking server configured via
# mlflow.set_tracking_uri must be reachable, otherwise load_model raises
# MlflowException (connection refused).
best_model = mlflow.sklearn.load_model("runs:/a9bfc75b3fcd461ba87ef42650c6f4f3/SVM_69")

# Keep the test data under its own names so the validation frame and features
# are not silently overwritten.
test = pd.read_csv('data/test.csv', sep='\t', quoting=csv.QUOTE_NONE, index_col=False)

# Only the classifier was logged to MLflow, so features must come from the
# fitted transformers still in the kernel.
# NOTE(review): these must be the ones fitted on the seed=69 training data that
# the loaded model was trained with — confirm kernel state before running.
test_bow = bow_transformer.transform(test['message'])
test_tfidf = tfidf_transformer.transform(test_bow)

# Predict with the reloaded model rather than whichever `support_vec` happens
# to be in memory.
test_predictions = best_model.predict(test_tfidf)
test_metrics = eval_metrics(test['label'], test_predictions)
print("acc:", test_metrics[0])
print("f1:", test_metrics[1])
print("auc:", test_metrics[2])

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/runs/get failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/runs/get?run_uuid=a9bfc75b3fcd461ba87ef42650c6f4f3&run_id=a9bfc75b3fcd461ba87ef42650c6f4f3 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7efb5a41f970>: Failed to establish a new connection: [Errno 111] Connection refused'))