# LAB 7: Error analysis

Objectives
* Construct a  linear text classifier using SGDClassifier
* Evaluate its performance and categorize the errors that it makes
* Examine the model's coefficients and decision function values
* Interpret model results using LIME

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

---

## Load data

In [None]:
# Both splits live in the same public bucket, so they share anonymous-access options.
s3_options = {"anon": True}
train = pd.read_parquet("s3://ling583/lab7-train.parquet", storage_options=s3_options)
test = pd.read_parquet("s3://ling583/lab7-test.parquet", storage_options=s3_options)

In [None]:
import cloudpickle
from sklearn.metrics import classification_report, f1_score, plot_confusion_matrix

In [None]:
# Load the previously trained pipeline. The context manager closes the file handle
# even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
with open("sgd.model", "rb") as model_file:
    sgd = cloudpickle.load(model_file)

In [None]:
# Baseline: predicted topic for every test document, scored against the gold labels.
predicted = sgd.predict(test["text"])
baseline_report = classification_report(test["topics"], predicted)
print(baseline_report)

---

## Decision function

In [None]:
# Class labels come from the classifier step of the pipeline; the decision scores
# are computed by the full pipeline on the raw test text.
classifier = sgd.named_steps['sgdclassifier']
labels = classifier.classes_
scores = sgd.decision_function(test["text"])

In [None]:
labels

In [None]:
scores[0,:]

In [None]:
test['topics'].iloc[0]

In [None]:
test['text'].iloc[0]

In [None]:
highest = scores.max(axis=1)

In [None]:
highest[0]

In [None]:
highest.argmax()

In [None]:
# Decision scores for the document with the largest top score (most confident prediction).
scores[highest.argmax()]

In [None]:
# Gold topic of the document with the largest top score.
test['topics'].iloc[highest.argmax()]

In [None]:
# Text of the document with the largest top score.
test['text'].iloc[highest.argmax()]

In [None]:
highest.argmin()

In [None]:
# Decision scores for the document with the smallest top score (least confident prediction).
scores[highest.argmin()]

In [None]:
# Gold topic of the document with the smallest top score.
test['topics'].iloc[highest.argmin()]

In [None]:
# Text of the document with the smallest top score.
test['text'].iloc[highest.argmin()]

In [None]:
scores[0:5,:]

In [None]:
scores.sort(axis=1)
scores[0:5,:]

In [None]:
margin = scores[:,3]-scores[:,2]

In [None]:
margin.max(), margin.min()

In [None]:
# Score only the documents whose margin exceeds 5.
high_margin = margin > 5
print(classification_report(test["topics"][high_margin], predicted[high_margin]))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Sweep candidate margin thresholds. Documents whose margin is not above the threshold
# are counted as "unknown"; macro F1 is computed on the remaining documents.
thresh = np.linspace(-2, 3, 50)
x, y = [], []
for t in thresh:
    confident = margin > t  # compute the mask once per threshold
    x.append(100 * (1 - confident.mean()))  # vectorized share of unknowns, in percent
    y.append(f1_score(test["topics"][confident], predicted[confident], average="macro"))

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('% Unknowns')
ax.set_ylabel('F1');

In [None]:
# Score only the documents whose margin exceeds 1.75.
above_cutoff = margin > 1.75
print(classification_report(test["topics"][above_cutoff], predicted[above_cutoff]))

**TO DO:** Summarize your results for this section. What could we do if we wanted to label as many examples as possible while still keeping F1 above 0.99?

---

## Model parameters

In [None]:
coef = sgd.named_steps['sgdclassifier'].coef_
labels, coef

In [None]:
coef.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists, falling back on older releases.
vectorizer = sgd.named_steps['countvectorizer']
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

In [None]:
coef[0,:].argmax()

In [None]:
# Largest coefficient for the first class. The index is derived from the data
# rather than copied from a previous run's output.
coef[0, coef[0,:].argmax()]

In [None]:
# Vocabulary entry with the largest coefficient for the first class.
vocab[coef[0,:].argmax()]

In [None]:
# For each class, list the features with the largest and the smallest coefficients.
# `ranked[i]` holds feature indices ordered by ascending coefficient for class i.
ranked = np.argsort(coef, axis=1)
n_show = 10
for i, label in enumerate(labels):
    print(label)
    # Last n_show positions, walked backwards: largest coefficient first.
    top = ranked[i, :-n_show - 1:-1]
    # Positions n_show-1 .. 0, ending on the most negative coefficient.
    # (The previous range(10, 0, -1) skipped position 0 and showed position 10 instead.)
    bottom = ranked[i, n_show - 1::-1]
    for j in concat([top, bottom]):
        print(f'  {vocab[j]:15s} {coef[i, j]:6.3f}')
    print()

In [None]:
from kwic import kwic

In [None]:
kwic('newsroom', train['text'])

In [None]:
kwic('zaire', train['text'])

**TO DO:** What can you conclude about the model from looking at the coefficients? Is there evidence of overfitting? How could we improve the results?