Student Emails
====

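Classify student emails into demand categories, first from their subject lines and then from their bodies, and compare a few classifiers along the way.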

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

from libEmails import read

First we'll try to classify the email subject lines into their demand categories.

First we need to clean the data up a bit: we must choose the minimum number of emails required per demand type (demand types with fewer emails than this will be discarded):

In [None]:
# Sampling strategy passed to the libEmails readers through their `sampling`
# keyword argument.
# NOTE(review): what "undersample" actually does is defined in libEmails.read,
# not in this notebook -- presumably it rebalances the classes; confirm.
sampling = "undersample"

In [None]:
# Minimum number of emails a demand type needs in order to be kept.
min_num_labels = 75

# With return_vectorizer=True the reader yields three items: the training
# split, the test split and the vectorizer used to build the bag of words.
subject_train, subject_test, vectorizer = read.read_email_subjects(
    min_num_labels,
    sampling=sampling,
    verbose=True,
    return_vectorizer=True,
)
X_train, categories_train = subject_train
X_test, categories_test = subject_test

Now we have a bag of words (`X_train`) and some labels (`categories_train`); we can use these to train a few classifiers:

In [None]:
def classifiers():
    """Build a fresh, unfitted instance of every model being compared.

    Returns:
        A list holding, in order, a ``MultinomialNB``, a
        ``RandomForestClassifier`` and an ``SGDClassifier`` configured with
        modified Huber loss and an L1 penalty.
    """
    naive_bayes = MultinomialNB()
    forest = RandomForestClassifier()
    # predict_proba is called on every model further down; modified_huber is
    # one of the SGD losses for which scikit-learn provides it.
    sgd = SGDClassifier(
        loss="modified_huber",
        penalty="l1",
        alpha=0.0001,
        max_iter=100,
    )
    return [naive_bayes, forest, sgd]


# Train each candidate model on the subject-line bag of words.
subject_clfs = classifiers()
for model in subject_clfs:
    model.fit(X_train, categories_train)

In [None]:
def print_info(clf, values, labels):
    """Print a classification report and a cross-validated score for ``clf``.

    Args:
        clf: Classifier to report on. It must already be fitted, since
            ``predict`` is called on it directly.
        values: Feature matrix to evaluate on.
        labels: True labels matching ``values``.

    NOTE(review): ``cross_validate`` fits clones of ``clf`` on folds of
    ``values``/``labels``, so the balanced accuracy printed here describes
    models retrained on the data passed in rather than the fitted ``clf``
    itself -- confirm this is the intended measurement.
    """
    separator = "-" * 79

    print(f"{type(clf).__name__}:")
    predictions = clf.predict(values)
    print(metrics.classification_report(labels, predictions))

    fold_scores = cross_validate(
        clf, values, labels, scoring="balanced_accuracy"
    )["test_score"]
    mean_score = fold_scores.mean()
    score_spread = fold_scores.std()
    print(f"Balanced accuracy {mean_score:.4f}+-{score_spread:.4f}\n{separator}")


# Report how each subject-line classifier does on the held-out test split
for model in subject_clfs:
    print_info(model, X_test, categories_test)

We can also run the classifiers on arbitrary strings (enter an empty line to stop):

In [None]:
# Interactive check: classify every line typed in with each subject-line
# classifier. iter(input, "") keeps calling input() until it returns an empty
# string, so an empty line ends the session.
for subject in iter(input, ""):
    features: csr_matrix = vectorizer.transform([subject])
    for model in subject_clfs:
        # A single document goes in, so exactly one prediction comes out.
        (label,) = model.predict(features)
        # First (and only) row: one probability per class.
        class_probs = model.predict_proba(features)[0]
        formatted_probs = " ".join(f"{p:.4f}" for p in class_probs)
        print(f"{type(model).__name__}:\n\t{label}\n\t{formatted_probs}")
        print("-" * 79)

We can also attempt to classify the email bodies similarly; we will put the email body and subject line into a single bag of words.

We will need to use a smaller minimum number of emails per demand type, since this dataset only contains 200 emails.

In [None]:
# Lower threshold for the email-body dataset.
min_num_labels = 13

body_train, body_test = read.read_email_body(
    min_num_labels,
    sampling=sampling,
    verbose=True,
)
X_train, categories_train = body_train
X_test, categories_test = body_test

In [None]:
# Only one model is tried on the bodies: an SGDClassifier with scikit-learn's
# default settings, trained on the body training split.
body_clfs = [SGDClassifier()]
for model in body_clfs:
    model.fit(X_train, categories_train)

In [None]:
# Report how each body classifier does on the body test split
for model in body_clfs:
    print_info(model, X_test, categories_test)