# LAB 7: Error analysis

Objectives
* Construct a linear text classifier using SGDClassifier
* Evaluate its performance and categorize the errors that it makes
* Examine the model's coefficients and decision function values
* Interpret model results using LIME

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Enable tqdm's pandas integration (registers the progress_* methods on pandas objects).
tqdm.pandas()

---

## Load data

In [None]:
# Read the lab's train/test splits from S3. {"anon": True} is forwarded to the
# filesystem layer as a storage option (presumably requesting anonymous,
# credential-free access -- confirm against the s3fs docs).
train = pd.read_parquet(
    "s3://ling583/lab7-train.parquet", storage_options={"anon": True}
)
test = pd.read_parquet("s3://ling583/lab7-test.parquet", storage_options={"anon": True})

In [None]:
import spacy

# Only `nlp.tokenizer` is called in `tokenize` below, so the remaining pipeline
# components are excluded when the model is loaded.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    """Split ``text`` into a list of normalized token strings.

    Runs only the spaCy tokenizer over ``text`` and returns the ``norm_``
    form of every token, skipping whitespace, punctuation and number-like
    tokens.
    """
    kept = []
    for tok in nlp.tokenizer(text):
        if tok.is_space or tok.is_punct or tok.like_num:
            continue
        kept.append(tok.norm_)
    return kept

In [None]:
import multiprocessing as mp

In [None]:
with mp.Pool() as p:
    # Tokenize every document in parallel. `imap` yields results in input
    # order, so the token lists correspond to the rows positionally.
    #
    # The Series is built on each frame's own index: a Series with a default
    # RangeIndex would be aligned by label on assignment and silently yield
    # NaN / misplaced rows for any frame whose index is not exactly 0..n-1.
    train["tokens"] = pd.Series(
        list(p.imap(tokenize, tqdm(train["text"]), chunksize=100)),
        index=train.index,
    )
    test["tokens"] = pd.Series(
        list(p.imap(tokenize, tqdm(test["text"]), chunksize=100)),
        index=test.index,
    )

The labels are: GPOL = domestic politics, GSPO = sports, GVIO = war/civil war, GJOB = labor issues

In [None]:
# Number of training documents per topic label (shows the class balance).
train["topics"].value_counts()

---

## Baseline classifier

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline

In [None]:
# Baseline: raw token counts fed straight into an SGDClassifier with default
# settings. `identity` (presumably cytoolz's, via the star import) is used as
# the analyzer so each document is consumed as its pre-computed token list.
baseline = make_pipeline(
    CountVectorizer(analyzer=identity),
    SGDClassifier(),
).fit(train["tokens"], train["topics"])

# Score the baseline on the held-out test set.
base_predicted = baseline.predict(test["tokens"])
print(classification_report(y_true=test["topics"], y_pred=base_predicted))

----

## Hyperparameter search

Find an optimal set of hyperparameters for a Tfidf+SGDClassifier model

In [None]:
import mlflow
from dask_ml.model_selection import RandomizedSearchCV
from logger import log_search
from scipy.stats.distributions import loguniform, randint, uniform

In [None]:
from warnings import simplefilter

# Silence FutureWarning messages for the rest of the session.
simplefilter(action="ignore", category=FutureWarning)

In [None]:
from dask.distributed import Client

# NOTE(review): "xxxxx" looks like a placeholder port -- replace it with the
# address of your running Dask scheduler before executing this cell.
client = Client("tcp://127.0.0.1:xxxxx")
client

In [None]:
# Select the "lab-7" MLflow experiment for this lab's runs.
mlflow.set_experiment("lab-7")
# Pipeline to tune: token counts -> TF-IDF weighting -> SGDClassifier.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity), TfidfTransformer(), SGDClassifier()
)

In [None]:
%%time

# Randomized search over the pipeline's hyperparameters, scored by macro F1.
# NOTE(review): no random_state is set, so the sampled candidates (and the
# best parameters found) can differ from run to run.
search = RandomizedSearchCV(
    sgd,
    {
        # scipy's randint(low, high) excludes `high`: integers 1..19
        "countvectorizer__min_df": randint(1, 20),
        # scipy's uniform(loc, scale) covers [loc, loc + scale]: 0.5..1.0
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "tfidftransformer__use_idf": [True, False],
        # log-uniform, i.e. uniform in the exponent between 1e-6 and 1e-2
        "sgdclassifier__alpha": loguniform(1e-6, 1e-2),
    },
    n_iter=50,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
# log_search comes from the local `logger` module; presumably it records the
# search results to the MLflow experiment selected earlier -- confirm there.
log_search(search)

---

## Compare optimized model to baseline

In [None]:
# TODO: `x` is a fill-in placeholder (it is not defined anywhere in this
# notebook). Replace each `x` with the corresponding best value found by the
# hyperparameter search (min_df, max_df and alpha are three different values),
# and adjust use_idf if the search preferred False.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=x, max_df=x),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=x),
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

In [None]:
# Macro-averaged F1 on the test set for the baseline and the optimized model.
base_f1 = f1_score(test["topics"], base_predicted, average="macro")
sgd_f1 = f1_score(test["topics"], predicted, average="macro")

In [None]:
# Baseline F1, optimized F1, and the absolute improvement.
base_f1, sgd_f1, sgd_f1 - base_f1

In [None]:
# Improvement relative to the baseline's remaining headroom: the fraction of
# the gap between the baseline F1 and a perfect score of 1 that was closed.
(sgd_f1 - base_f1) / (1 - base_f1)

In [None]:
from scipy.stats import binom_test, wilcoxon

In [None]:
# Per-document comparison of the two models on the test set:
#   +1 -> only the optimized model is correct
#   -1 -> only the baseline is correct
#    0 -> both are correct or both are wrong
diff = (predicted == test["topics"]).astype(int) - (base_predicted == test["topics"]).astype(int)

# Counts of wins, losses and ties, in that order.
tuple((diff == outcome).sum() for outcome in (1, -1, 0))

In [None]:
# One-sided sign test on the discordant documents: successes = only the
# optimized model is right, failures = only the baseline is right.
# NOTE(review): scipy.stats.binom_test was deprecated in SciPy 1.10 and removed
# in 1.12; on newer SciPy use binomtest(k, n, alternative="greater").pvalue.
binom_test([sum(diff == 1), sum(diff == -1)], alternative="greater")

In [None]:
# One-sided Wilcoxon signed-rank test on the per-document differences.
wilcoxon(diff, alternative="greater")

**TO DO:** Summarize your results: how much better is the optimized model? Is it significantly better than the baseline?

-----

## Save model

In [None]:
import cloudpickle

In [None]:
# Version of the model that is fitted on raw text rather than on the
# pre-computed token lists: the vectorizer calls `tokenize` itself, so the
# saved pipeline can be applied directly to text.
# TODO: `x` is a fill-in placeholder (not defined in this notebook). Replace
# each `x` with the corresponding best value from the hyperparameter search.
sgd = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=tokenize, min_df=x, max_df=x),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=x),
)
sgd.fit(train["text"], train["topics"])
predicted = sgd.predict(test["text"])
print(classification_report(test["topics"], predicted))

In [None]:
# Serialize the fitted pipeline to disk. The context manager ensures the file
# is flushed and closed even if pickling raises, instead of leaving the handle
# from a bare open() to be cleaned up whenever the garbage collector runs.
with open("sgd.model", "wb") as model_file:
    cloudpickle.dump(sgd, model_file)