### Imports

In [None]:
# debugging
from IPython.core.debugger import set_trace

# file system navigation
from pathlib import Path

# data transformation
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

# ml algorithms and evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn import model_selection
from scipy.stats.distributions import uniform, randint

# sklearn specific stuff
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# nlp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.pipeline import TextCategorizer
from spacy.util import minibatch, compounding
from spacy.util import decaying

# misc
import random
import copy

### Custom functions

In [None]:
def get_cv_scores(model, X, y, n_cv=10):
    """Print cross-validated precision and recall for a classifier.

    Runs ``cross_val_score`` once per metric and prints the mean and the
    standard deviation of the per-split scores. Nothing is returned.

    Args:
        model: Estimator passed straight to ``cross_val_score``.
        X: Feature data.
        y: Target labels.
        n_cv: Value forwarded as ``cv`` (number of splits).
    """
    precision_scores = cross_val_score(model, X, y, cv=n_cv, scoring="precision")
    recall_scores = cross_val_score(model, X, y, cv=n_cv, scoring="recall")
    # Both metrics report one plain standard deviation so the two "+/-" values
    # are comparable (recall used to be printed as 2 * std).
    print(f"Average precision score for {n_cv} splits: {precision_scores.mean():.2f} (+/- {precision_scores.std():.2f})")
    print(f"Average recall score for {n_cv} splits: {recall_scores.mean():.2f} (+/- {recall_scores.std():.2f})")

In [None]:
def get_cv_score_auc(model, X, y, n_cv=10):
    """Print the mean and standard deviation of cross-validated ROC AUC.

    Args:
        model: Estimator passed straight to ``cross_val_score``.
        X: Feature data.
        y: Target labels.
        n_cv: Value forwarded as ``cv`` (number of splits).
    """
    auc_scores = cross_val_score(
        model,
        X,
        y,
        scoring="roc_auc",
        cv=n_cv,
    )
    print(f"Average auc score for {n_cv} splits: {auc_scores.mean():.2f} (+/- {auc_scores.std():.2f})")

In [None]:
def get_cv_auc(model, X, y, n_cv=10):
    """Cross-validate ``model`` on ROC AUC and return a representative fitted estimator.

    Prints the mean and standard deviation of the per-split test AUC, then
    returns the fitted estimator from the split whose AUC is closest to
    that mean.

    Args:
        model: Estimator passed straight to ``cross_validate``.
        X: Feature data.
        y: Target labels.
        n_cv: Value forwarded as ``cv`` (number of splits).

    Returns:
        The fitted estimator of the split with the most "average" test AUC.
    """
    cv_results = cross_validate(
        model, X, y,
        cv=n_cv,
        scoring="roc_auc",
        n_jobs=-1,
        return_estimator=True,
        return_train_score=False,
    )
    auc_scores = cv_results["test_score"]
    mean_auc = auc_scores.mean()

    # Index of the split whose score deviates least from the mean
    closest_split = int(np.argmin(np.abs(auc_scores - mean_auc)))
    representative_estimator = cv_results["estimator"][closest_split]

    print(f"Average auc score for {n_cv} splits: {mean_auc:.2f} (+/- {auc_scores.std():.2f})")

    return representative_estimator

In [None]:
def plot_precision_recall(model, X, y):
    """Draw the precision-recall curve of a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X: Samples to score.
        y: True labels for ``X``.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    precision, recall, _ = metrics.precision_recall_curve(y, positive_scores)

    # Step outline plus a matching shaded area underneath it
    plt.step(recall, precision, where="post", color='b', alpha=0.2)
    plt.fill_between(recall, precision, step="post", color='b', alpha=0.2)

    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title("Precision-Recall curve")
    plt.show()

### Load preprocessed training data

In [None]:
# Preprocessed training set lives under <cwd>/data/processed/
data = pd.read_parquet(
    Path.cwd().joinpath("data", "processed", "train_data.parquet")
)

In [None]:
# Peek at the first rows to sanity-check the loaded frame
data.head()

In [None]:
# (rows, columns) of the training frame
data.shape

### Baseline model

In [None]:
# Keep only the two numeric features and the target for the baseline model
data_base = data[["claps", "reading_time", "interesting"]]

#### Visualize data distribution for numerical features

In [None]:
# Column positions (within data_base) plotted on the x and y axes
x_index = 0
y_index = 1
target_names = ["not interesting", "interesting"]

colors = ["red", "green"]

# One scatter series per class. The label value is the position of the class
# in target_names, so iterate over the classes themselves instead of relying
# on zip() truncating a range as long as the whole data set.
for label, (target_name, color) in enumerate(zip(target_names, colors)):
    class_rows = data_base[data_base["interesting"] == label]  # filter once per class
    plt.scatter(np.array(class_rows.iloc[:, x_index]),
                np.array(class_rows.iloc[:, y_index]),
                label=target_name,
                c=color)

plt.xlabel(data_base.columns[x_index])
plt.ylabel(data_base.columns[y_index])
plt.legend(loc="upper left")
plt.show()

#### Save figure

In [None]:
# `os` and `wd` were never defined in this notebook; build the target from the
# working directory with pathlib, the same way the training data is located.
# NOTE(review): savefig writes the *current* figure. If the plot has already
# been shown and closed by a previous cell, this presumably saves an empty
# canvas. Confirm and, if so, call it right after plotting, before plt.show().
plt.savefig(Path.cwd() / "output" / "base_classifier.png")

#### Baseline classification model using author, claps and reading time

In [None]:
# Feature matrix (claps, reading time) and target column for the numeric baseline
X_num = data_base[["claps", "reading_time"]]
y_num = data_base["interesting"]

In [None]:
# 20-fold CV of a default random forest on the numeric features. Uses y_num,
# the target defined together with X_num (`y` does not exist yet at this point).
rf_model = get_cv_auc(RandomForestClassifier(), X_num, y_num, n_cv=20)

In [None]:
# Four stratified random 70/30 splits with a fixed seed
s = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.3, random_state=42)

In [None]:
# Refit the baseline on every shuffle split and report the AUC on its test part.
# split() yields positional indices, so both X and y are indexed with .iloc;
# label-based y_num[...] is only correct when the index happens to be 0..n-1.
for i, (i_train, i_test) in enumerate(s.split(X_num, y_num)):
    print("Training model number  ", i)
    rf_model.fit(X_num.iloc[i_train, :], y_num.iloc[i_train])
    print("AUC on the test set:")
    test_probabilities = rf_model.predict_proba(X_num.iloc[i_test, :])[:, 1]
    print(metrics.roc_auc_score(y_num.iloc[i_test], test_probabilities))

### Text based models

In [None]:
# Raw article text as the only feature, same target as before
X_text = data["text"]
y = data["interesting"]

Create a true hold-out set to simulate future articles coming in

In [None]:
# The first 70 rows are used for model building, everything after is held out
X_70, X_100 = X_text[:70], X_text[70:]
y_70, y_100 = y[:70], y[70:]

In [None]:
# Renumber both parts from 0 so the positional indices produced by the
# splitters can be used directly. drop=True discards the old index in one
# step and also works when the index carries a name.
X_70 = X_70.reset_index(drop=True)
X_100 = X_100.reset_index(drop=True)

In [None]:
# Same renumbering for the targets so they stay aligned with X_70 / X_100.
# drop=True discards the old index in one step and also works when it is named.
y_70 = y_70.reset_index(drop=True)
y_100 = y_100.reset_index(drop=True)

In [None]:
# Stratified 70/30 split of the full text data with a fixed seed
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

#### CountVectorizer + random forest

##### Default values

In [None]:
# Bag-of-words features with library defaults
vectorizer = CountVectorizer()

In [None]:
# Vectorizer feeding a default random forest
pipe = make_pipeline(vectorizer, RandomForestClassifier())

In [None]:
# 20-fold CV on the training part; keeps the fold estimator closest to the mean AUC
estimator = get_cv_auc(pipe, X_text_train, y_train, n_cv=20)

In [None]:
# Precision-recall curve of that estimator on the test part
plot_precision_recall(estimator, X_text_test, y_test)

##### Some optimization, i.e. preprocessing and feature selection

In [None]:
# Named steps so the search can address vectorizer options as "vec__<option>"
pipe = Pipeline(steps=[("vec", CountVectorizer()), ("rf", RandomForestClassifier())])

# Search space: lists are sampled uniformly, scipy distributions are drawn from
params = {
    "vec__stop_words": ["english", None],
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vec__max_df": uniform(loc=0.8, scale=0.2),
    "vec__min_df": uniform(loc=0.0, scale=0.2),
    "vec__max_features": randint(low=1000, high=9000),
}

# 8 sampled configurations, each scored by 10-fold ROC AUC
grid = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=8,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,
    return_train_score=False,
)

In [None]:
# Run the search; the result is assigned to `_` to suppress the cell output
_ = grid.fit(X_text_train, y_train)

In [None]:
# NOTE(review): these are means over *all* entries of cv_results_, i.e.
# presumably over every sampled configuration, not the score of the best
# one (grid.best_score_) - confirm this is the number you want to report.
avg_auc = grid.cv_results_["mean_test_score"].mean()
std_auc = grid.cv_results_["std_test_score"].mean()
print(f"Average auc score: {avg_auc:.2f} (+/- {std_auc:.2f})")
# pd.DataFrame.from_dict(grid.cv_results_).sort_values("rank_test_score")

In [None]:
# Precision-recall curve of the best configuration on the test part
plot_precision_recall(grid.best_estimator_, X_text_test, y_test)

#### TfidfVectorizer + random forest

##### Default values

In [None]:
# TF-IDF features with library defaults
vectorizer = TfidfVectorizer()

In [None]:
# Vectorizer feeding a default random forest
pipe = make_pipeline(vectorizer, RandomForestClassifier())

In [None]:
# 20-fold CV on the training part; keeps the fold estimator closest to the mean AUC
estimator = get_cv_auc(pipe, X_text_train, y_train, n_cv=20)

In [None]:
# Precision-recall curve of that estimator on the test part
plot_precision_recall(estimator, X_text_test, y_test)

In [None]:
# Hand-picked TF-IDF settings: English stop words, 1- to 3-grams,
# document-frequency bounds and a vocabulary cap
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 3), max_df=0.8, min_df=0.2, max_features=5000)

pipe = make_pipeline(vectorizer, RandomForestClassifier())

# Same 20-fold CV procedure as for the default configuration
estimator = get_cv_auc(pipe, X_text_train, y_train, n_cv=20)

In [None]:
# Precision-recall curve of the hand-tuned pipeline on the test part
plot_precision_recall(estimator, X_text_test, y_test)

##### Grid

In [None]:
# Named steps so the search can address vectorizer options as "vec__<option>"
pipe = Pipeline(steps=[("vec", TfidfVectorizer()), ("rf", RandomForestClassifier())])

# Search space: lists are sampled uniformly, scipy distributions are drawn from
params = {
    "vec__stop_words": ["english", None],
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vec__max_df": uniform(loc=0.8, scale=0.2),
    "vec__min_df": uniform(loc=0.0, scale=0.2),
    "vec__max_features": randint(low=1000, high=9000),
}

# 8 sampled configurations, each scored by 10-fold ROC AUC
grid = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=8,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,
    return_train_score=False,
)

In [None]:
# Run the search; the result is assigned to `_` to suppress the cell output
_ = grid.fit(X_text_train, y_train)

In [None]:
# NOTE(review): these are means over *all* entries of cv_results_, i.e.
# presumably over every sampled configuration, not the score of the best
# one (grid.best_score_) - confirm this is the number you want to report.
avg_auc = grid.cv_results_["mean_test_score"].mean()
std_auc = grid.cv_results_["std_test_score"].mean()
print(f"Average auc score: {avg_auc:.2f} (+/- {std_auc:.2f})")
# pd.DataFrame.from_dict(grid.cv_results_).sort_values("rank_test_score")

In [None]:
# Precision-recall curve of the best configuration on the test part
plot_precision_recall(grid.best_estimator_, X_text_test, y_test)

#### Advanced tokenization and lemmatization using spaCy

In [None]:
# Load the small English spaCy model for the tokenization demo below
nlp = spacy.load('en_core_web_sm')

In [None]:
# Process the article stored under index label 0
# (the first article, assuming the default integer index - TODO confirm)
doc = nlp(X_text[0])

In [None]:
# Show surface form, lemma and stop-word flag for every token
for token in doc:
    print(token.text, token.lemma_, token.is_stop)

In [None]:
# NOTE(review): replaces the `nlp` object loaded above with the model behind
# the "en" shortcut - confirm both loads are really needed
nlp = spacy.load("en")

#### Language model

[Instructions from SpaCy documentation](https://spacy.io/usage/training#section-textcat)

In [None]:
class CustomSpacyClassifier():
    """Wrapper around spaCy's text categorizer with a scikit-learn style interface.

    Exposes ``get_params``, ``fit`` and ``predict_proba`` so an instance can
    be handed to ``sklearn.model_selection.cross_validate``.

    Attributes are documented next to their initialisation in ``__init__``.
    """

    # Dropout applied by ``fit`` when the caller does not pass ``drop_rate``
    # (e.g. when ``cross_validate`` calls ``fit(X, y)`` without extra arguments).
    DEFAULT_DROP_RATE = 0.2

    def __init__(self):
        self._estimator_type = "classifier"

        self.nlp = None         # spaCy pipeline, loaded fresh on every fit()
        self.label = None       # name of the single category being predicted
        self.train_data = None  # (text, {"cats": {label: bool}}) tuples of the last fit()

    def get_params(self, deep=True):
        """Return the (empty) set of constructor parameters.

        Args:
            deep: Accepted for interface compatibility; unused.
        """
        return dict()

    def add_textcat(self, label):
        """Ensure the pipeline has a ``textcat`` component and register ``label`` on it.

        Args:
            label: Category name to add to the text categorizer.
        """
        self.label = label
        if "textcat" in self.nlp.pipe_names:
            # component already present: fetch it so the label can be added
            textcat = self.nlp.get_pipe("textcat")
        else:
            textcat = self.nlp.create_pipe("textcat")
            self.nlp.add_pipe(textcat, last=True)
        textcat.add_label(label)

    def fit(self, X, y, n_iter=10, **kwargs):
        """Train a fresh text categorizer on ``X`` / ``y``.

        Args:
            X: Iterable of texts.
            y: Iterable of labels; each is converted with ``bool``.
            n_iter: Number of passes over the training data.
            **kwargs: ``drop_rate`` (float) sets the dropout used in every
                update; defaults to ``DEFAULT_DROP_RATE`` when omitted.

        Returns:
            self, so calls can be chained as with scikit-learn estimators.
        """
        # Always start from a freshly loaded model so repeated fits are independent
        self.nlp = spacy.load("en")
        self.add_textcat("interesting")
        self.train_data = [
            (text, {"cats": {self.label: bool(target)}}) for text, target in zip(X, y)
        ]

        # Optional: a missing drop_rate must not make fit() fail
        drop_rate = kwargs.get("drop_rate", self.DEFAULT_DROP_RATE)

        other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "textcat"]
        with self.nlp.disable_pipes(*other_pipes):  # only train textcat
            optimizer = self.nlp.begin_training()
            for i in range(n_iter):
                print(f"EPOCH {i+1}")
                losses = {}
                batches = minibatch(self.train_data, size=compounding(4., 16., 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self.nlp.update(texts, annotations, sgd=optimizer, drop=drop_rate,
                                    losses=losses)
                loss = losses["textcat"]
                print(f"LOSS: {loss}")
                print("")

        return self

    def predict_proba(self, X):
        """Score every text in ``X``.

        Args:
            X: Iterable of texts.

        Returns:
            Array of shape (n_samples, 2): column 1 holds the category score
            produced by the pipeline, column 0 its complement.
        """
        p1_scores = np.array(
            [self.nlp(sample_text).cats[self.label] for sample_text in X],
            dtype=np.float64,
        )

        # column_stack keeps the (n, 2) shape even for an empty input
        return np.column_stack([1. - p1_scores, p1_scores])

In [None]:
# Single wrapper instance; it is refit for every split below
clf = CustomSpacyClassifier()

In [None]:
# Six stratified random 80/20 splits with a fixed seed
s = model_selection.StratifiedShuffleSplit(n_splits=6, test_size=0.2, random_state=42)

Train several custom classifiers and evaluate their performance on the true hold-out set

In [None]:
# Collects (fitted classifier copy, AUC on the split's test part) pairs
fitted_clfs = []

for i, (i_train, i_test) in enumerate(s.split(X_70, y_70)):
    print("Training model number  ", i)
    print("")
    print("Training IDs: ", i_train)
    print("Test IDs: ", i_test)

    clf.fit(X_70[i_train], y_70[i_train], n_iter=5, drop_rate=0.4)
    # clf is refit on the next iteration, so keep an independent snapshot
    fitted_clf = copy.deepcopy(clf)

    split_scores = clf.predict_proba(X_70[i_test])[:, 1]
    test_auc = metrics.roc_auc_score(y_70[i_test], split_scores)
    fitted_clfs.append((fitted_clf, test_auc))

    print("AUC on the test set: ", test_auc)
    print("")

In [None]:
# Summarise the per-split AUC values collected above
scores = [score for _, score in fitted_clfs]
print("Mean AUC: ", np.mean(scores))
print("Std deviation of AUC: ", np.std(scores))

In [None]:
# One row of hold-out scores per fitted classifier
preds = np.array([c.predict_proba(X_100)[:, 1] for c, _ in fitted_clfs])

In [None]:
# Ensemble by averaging the classifiers' scores per article, then score against the hold-out labels
print("AUC on the hold out set: ", metrics.roc_auc_score(y_100, preds.mean(axis=0)))

In [None]:
# 2-fold ROC AUC cross-validation of the spaCy wrapper in a single process,
# keeping the fitted estimator of each fold
auc_cv = cross_validate(clf, X_text_train, y_train,
                            scoring="roc_auc",
                            cv=2,
                            n_jobs=1,
                            return_train_score=False,
                            return_estimator=True)

In [None]:
# Display the raw cross-validation result
auc_cv

In [None]:
# Fresh pipeline for the step-by-step training below
nlp = spacy.load("en")

In [None]:
# Make sure the pipeline ends with a text categorizer component
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat")
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe("textcat")

In [None]:
# Register the single category the model should predict
textcat.add_label("interesting")

In [None]:
# Pair every training text with its annotation dict: {"cats": {"interesting": <bool>}}
TRAIN_DATA = [(example, {"cats": {"interesting": bool(label)}}) for example, label in zip(X_text_train, y_train)]

In [None]:
# Number of training epochs for the loops below
n_iter = 10

In [None]:
# dropout = decaying(0.6, 0.2, 1e-4)

In [None]:
# Inspect the first 20 values of a decaying dropout schedule.
# The schedule is created here because its only other definition in the
# notebook is commented out, which made next(dropout) raise a NameError.
dropout = decaying(0.6, 0.2, 1e-4)
for _ in range(20):
    print(next(dropout))

In [None]:
# Generator from spaCy's compounding(4., 16., 1.05); presumably a batch-size
# schedule growing from 4 towards 16 - its first values are printed below
size=compounding(4., 16., 1.05)

In [None]:
# Print the first 20 values produced by the `size` generator
i = 0
while i < 20:
    print(next(size))
    i += 1

In [None]:
# Dry run: build the batch generator once per epoch and report the size of its first batch
for i in range(n_iter):
    print(f"EPOCH {i+1}")
    batches = minibatch(TRAIN_DATA, size=compounding(4., 16., 1.5))
    first_batch = next(batches)
    print(len(first_batch))

In [None]:
# Every component except the text categorizer is switched off during training
other_pipes = [name for name in nlp.pipe_names if name != "textcat"]

with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        print(f"EPOCH {i+1}")
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4., 16., 1.001))
        for batch in batches:
            # split the (text, annotation) pairs into two parallel tuples
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                sgd=optimizer,
                drop=0.3,
                losses=losses,
            )
        loss = losses["textcat"]
        print(f"LOSS: {loss}")
        print("")
        

#### Evaluate

In [None]:
# Category score of every test article
test_scores = [nlp(sample_text).cats["interesting"] for sample_text in X_text_test]

In [None]:
# Scratch check: scores of the first two test articles as numpy floats
te = [np.float64(nlp(sample_text).cats["interesting"]) for sample_text in X_text_test[0:2]]

In [None]:
# Scratch check: expand each score into a [1 - score, score] row
te2 = np.array([[1. - score, score] for score in te])

In [None]:
# Display the two-column array
te2

In [None]:
# Display the first score as a numpy float
np.float64(te[0])

In [None]:
# ROC AUC of the manually trained model on the test part
metrics.roc_auc_score(y_test, test_scores)

In [None]:
# Precision-recall curve of the manually trained spaCy model on the test part
precision, recall, _ = metrics.precision_recall_curve(y_test, test_scores)

step_kwargs = {"step": "post"}
# Step outline plus a matching shaded area underneath it
plt.step(recall, precision, where="post", color='b', alpha=0.2)
plt.fill_between(recall, precision, color='b', alpha=0.2, **step_kwargs)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title("Precision-Recall curve")
plt.show()

### Archive

In [None]:
class CustomEvaluator():
    """Simple class holding data and functionality related to evaluating a classifier's performance.

    The texts, true labels and predicted scores are stored side by side in
    ``self.df``; ``self.group_means`` holds the mean score per label value.
    After ``get_scores`` has run, ``self.score_df`` holds one row of metrics
    per threshold.
    """

    def __init__(self, texts, labels, scores):
        """Store the evaluation data and pre-compute the mean score per label.

        Args:
            texts: Evaluated texts.
            labels: True labels, aligned with ``texts``.
            scores: Predicted scores, aligned with ``texts``.
        """
        self.df = pd.DataFrame({"text": texts, "label": labels, "score": scores})
        # Average only the score column: a mean of the free-text column is
        # meaningless and makes recent pandas versions raise.
        self.group_means = self.df.groupby(by="label")[["score"]].mean()

    def get_scores(self, thresholds=(0.25, 0.5, 0.75)):
        """Print precision, recall, F-score and accuracy for each threshold.

        A sample counts as predicted positive when its score is strictly
        greater than the threshold. The resulting table is printed and stored
        in ``self.score_df``; nothing is returned.

        Args:
            thresholds: A single number or an iterable of numbers.
        """
        if isinstance(thresholds, (int, float)):
            thresholds = [thresholds]
        thresholds = list(thresholds)

        truth = np.asarray(self.df["label"].astype(bool))
        scores = np.asarray(self.df["score"])

        # Every confusion-matrix cell starts at a tiny epsilon so the ratios
        # below never divide by zero.
        eps = 1e-8
        tps, fps, fns, tns = [], [], [], []
        for t in thresholds:
            pred = scores > t
            tps.append(eps + float(np.sum(truth & pred)))    # True positives
            fps.append(eps + float(np.sum(~truth & pred)))   # False positives
            fns.append(eps + float(np.sum(truth & ~pred)))   # False negatives
            tns.append(eps + float(np.sum(~truth & ~pred)))  # True negatives

        precisions = [tp / (tp + fp) for tp, fp in zip(tps, fps)]
        recalls = [tp / (tp + fn) for tp, fn in zip(tps, fns)]
        f_scores = [2 * (p * r) / (p + r) for p, r in zip(precisions, recalls)]
        accuracies = [(tp + tn) / (tp + fp + fn + tn) for tp, fp, fn, tn in zip(tps, fps, fns, tns)]

        score_df = pd.DataFrame({"threshold": thresholds,
                                 "precision": precisions,
                                 "recall": recalls,
                                 "f_score": f_scores,
                                 "accuracy": accuracies})

        print(score_df)
        self.score_df = score_df

        return

In [None]:
# Bundle test texts, true labels and predicted scores for threshold-based evaluation
test_evaluator = CustomEvaluator(X_text_test, y_test, test_scores)

In [None]:
# Display the per-label means computed in the constructor
test_evaluator.group_means

In [None]:
# Print the metrics table for the default thresholds
test_evaluator.get_scores()