In [1]:
# debugging
from IPython.core.debugger import set_trace

# file system navigation
import os

# data transformation
import pandas as pd
import numpy as np

# nlp
import spacy
from spacy.pipeline import TextCategorizer
from spacy.util import minibatch, compounding
from spacy.util import decaying

# ml
from sklearn.model_selection import train_test_split

# misc
import random

In [2]:
# Working directory; used as the root for the data/ and output/ paths in this notebook.
wd = os.getcwd()

#### Load and split data

In [3]:
# Load the processed training set from <wd>/data/processed/train_data.csv.
data = pd.read_csv(os.path.join(wd, "data", "processed", "train_data.csv"))

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,author,claps,reading_time,link,title,text,interesting
0,Justin Lee,8300,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T...",0
1,Conor Dewey,1400,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...,0
2,William Koehrsen,2800,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...,1
3,Irhum Shafkat,2000,15,https://towardsdatascience.com/intuitively-und...,Intuitively Understanding Convolutions for Dee...,The advent of powerful and versatile deep lear...,1
4,Sam Drozdov,2300,6,https://uxdesign.cc/an-intro-to-machine-learni...,An intro to Machine Learning for designers – U...,There is an ongoing debate about whether or no...,0


#### Baseline model using claps and reading time

In [5]:
# Baseline subset: the claps and reading_time features plus the "interesting" target.
data_base = data[["claps", "reading_time", "interesting"]]

In [None]:
# Features and target for the baseline model.
X = data_base[["claps", "reading_time"]]
y = data_base["interesting"]

# 70/30 split, stratified on the target. The seed is fixed so that the split
# (and therefore the baseline score) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42)

In [6]:
# Preview the first rows of the baseline subset.
data_base.head()

Unnamed: 0,claps,reading_time,interesting
0,8300,11,0
1,1400,7,0
2,2800,11,1
3,2000,15,1
4,2300,6,0


In [None]:
# Display the baseline frame without a .head() limit.
data_base

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

x_index = 0
y_index = 1
target_names = ["not interesting", "interesting"]

colors = ["red", "green"]

for label, color in zip(range(len(data_base["interesting"])), colors):
    plt.scatter(np.array(data_base[data_base["interesting"]==label].iloc[:, x_index]), 
                np.array(data_base[data_base["interesting"]==label].iloc[:, y_index]),
                label=target_names[label],
                c=color)

plt.xlabel(data_base.columns[x_index])
plt.ylabel(data_base.columns[y_index])
plt.legend(loc="upper right")
plt.savefig(os.path.join(wd, "output", "base_classifier.png"))
plt.show()

In [None]:
# Same scatter plot as the previous cell, with the legend in the upper left.
x_index = 0
y_index = 1
target_names = ["not interesting", "interesting"]

colors = ["red", "green"]

# One scatter series per class label. The label value (0 or 1) is the position
# in target_names / colors, so iterate over those lists rather than over the
# number of rows in the frame.
for label, (name, color) in enumerate(zip(target_names, colors)):
    subset = data_base[data_base["interesting"] == label]
    plt.scatter(np.array(subset.iloc[:, x_index]),
                np.array(subset.iloc[:, y_index]),
                label=name,
                c=color)

plt.xlabel(data_base.columns[x_index])
plt.ylabel(data_base.columns[y_index])
plt.legend(loc="upper left")
# Save in the same cell and before show(): with the inline backend the figure
# is gone once the cell has been rendered, so a savefig() in a later cell
# would overwrite the file with an empty image.
plt.savefig(os.path.join(wd, "output", "base_classifier.png"))
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting baseline, constructed with the library's default hyperparameters.
gbm = GradientBoostingClassifier()

In [None]:
# Fit the baseline on the current X_train / y_train split.
gbm.fit(X_train, y_train)

In [None]:
# Score the fitted baseline on the held-out split.
gbm.score(X_test, y_test)

#### Build text based model

In [None]:
# Features and target for the text model. This rebinds X, y and the
# train/test names that the baseline cells used.
X = data["text"]
y = data["interesting"]
# 70/30 split, stratified on the target. The seed is fixed so that training
# and evaluation see the same documents on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# Preview the first training texts.
X_train.head()

In [None]:
# Preview the matching training labels.
y_train.head()

#### Train model

[Instructions from SpaCy documentation](https://spacy.io/usage/training#section-textcat)

In [None]:
# Load the spaCy pipeline registered under the name "en".
# NOTE(review): presumably the English model via its shortcut link; confirm it is installed for the spaCy version in use.
nlp = spacy.load("en")

In [None]:
# Reuse the text categorizer when the pipeline already has one, so labels can
# be added to it; otherwise create it and append it as the last component.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat = nlp.create_pipe("textcat")
    nlp.add_pipe(textcat, last=True)

In [None]:
# Register the single category the text classifier is trained to score.
textcat.add_label("interesting")

In [None]:
# Pair each training text with its annotation dict: the 0/1 label becomes a
# boolean under cats -> "interesting".
TRAIN_DATA = [
    (text, {"cats": {"interesting": bool(flag)}})
    for text, flag in zip(X_train, y_train)
]

In [None]:
# Number of epochs (full passes over TRAIN_DATA) used by the loops below.
n_iter = 10

In [None]:
# Dropout schedule built with spacy.util.decaying(0.6, 0.2, 1e-4).
# NOTE(review): the training loop passes a fixed drop=0.3 and never reads this iterator.
dropout = decaying(0.6, 0.2, 1e-4)

In [None]:
# Print the next 20 values of the dropout schedule.
for i in range(1, 21):
    print(next(dropout))

In [None]:
# Batch-size schedule built with spacy.util.compounding(4., 16., 1.05).
# NOTE(review): the training loop builds its own compounding(4., 16., 1.001) schedule instead of using this one.
size=compounding(4., 16., 1.05)

In [None]:
# Print the next 20 values of the batch-size schedule.
for i in range(1, 21):
    print(next(size))

In [None]:
# For every epoch, build a fresh batch generator and report the length of the
# first batch it yields.
for i in range(n_iter):
    print(f"EPOCH {i+1}")
    batches = minibatch(TRAIN_DATA, size=compounding(4., 16., 1.5))
    print(len(next(batches)))

In [None]:
# Train only the text categorizer: every other pipeline component is disabled
# for the duration of the with-block.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        print(f"EPOCH {i+1}")
        losses = {}
        # Reshuffle each epoch so the model does not see the examples in the
        # same order (and the same batches) on every pass.
        random.shuffle(TRAIN_DATA)
        # Batch size is drawn from a compounding schedule, rebuilt every epoch.
        batches = minibatch(TRAIN_DATA, size=compounding(4., 16., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.3,
                       losses=losses)
        # Loss accumulated by nlp.update() over all batches of this epoch.
        loss = losses["textcat"]
        print(f"LOSS: {loss}")
        print("")
        

#### Evaluate

In [None]:
# Score every test text. nlp.pipe() runs the texts through the pipeline in
# batches instead of one nlp() call per document.
test_scores = [doc.cats["interesting"] for doc in nlp.pipe(X_test)]

In [None]:
# Bundle test texts, true labels and predicted scores for evaluation.
# NOTE: CustomEvaluator is defined in a later cell of this notebook; on a fresh
# kernel that cell must be executed before this one.
test_evaluator = CustomEvaluator(X_test, y_test, test_scores)

In [None]:
# Per-label means computed by the evaluator on the test split.
test_evaluator.group_means

In [None]:
# Print precision, recall, F-score and accuracy at the default thresholds (test split).
test_evaluator.get_scores()

In [None]:
# Score every training text. nlp.pipe() runs the texts through the pipeline in
# batches instead of one nlp() call per document.
train_scores = [doc.cats["interesting"] for doc in nlp.pipe(X_train)]

In [None]:
# Bundle training texts, true labels and predicted scores for evaluation.
# NOTE: CustomEvaluator is defined in a later cell of this notebook; on a fresh
# kernel that cell must be executed before this one.
train_evaluator = CustomEvaluator(X_train, y_train, train_scores)

In [None]:
# Per-label means on the training split. Left as the cell's last expression so
# it renders as a table, like the matching test-split cell.
train_evaluator.group_means

In [None]:
# Print precision, recall, F-score and accuracy at the default thresholds (training split).
train_evaluator.get_scores()

In [None]:
class CustomEvaluator():
    """Hold texts, true labels and classifier scores, and compute metrics on them.

    NOTE: the "Evaluate" cells earlier in this notebook instantiate this class,
    so on a fresh kernel this cell has to be executed before them.

    Attributes:
        df: DataFrame with the columns "text", "label" and "score".
        group_means: DataFrame holding the mean "score" for each "label" value.
        score_df: DataFrame with one row of metrics per threshold; only
            available after get_scores() has been called.
    """

    # Every confusion-matrix cell starts at this tiny value instead of 0 so the
    # metric ratios never divide by zero. Side effect: a ratio whose true
    # counts are all zero evaluates to 0.5 rather than being undefined.
    _EPS = 1e-8

    def __init__(self, texts, labels, scores):
        """Store the evaluation data and pre-compute the per-label mean score.

        Args:
            texts: the evaluated documents.
            labels: the true labels; truthy values count as the positive class.
            scores: the classifier scores, aligned with ``texts`` and ``labels``.
        """
        self.df = pd.DataFrame({"text": texts, "label": labels, "score": scores})
        # Average only the numeric score column: a mean over the free-text
        # column is meaningless and raises a TypeError on pandas >= 2.0.
        self.group_means = self.df.groupby(by="label")[["score"]].mean()

    def get_scores(self, thresholds=(0.25, 0.5, 0.75)):
        """Print precision, recall, F-score and accuracy for each threshold.

        A sample is predicted positive when its score is strictly greater than
        the threshold. The resulting table is printed and stored in
        ``self.score_df``.

        Args:
            thresholds: a single number or an iterable of numbers.

        Returns:
            None.
        """
        # Accept a bare number (int, float or numpy scalar) as one threshold.
        if isinstance(thresholds, (int, float, np.number)):
            thresholds = [thresholds]
        thresholds = list(thresholds)

        truth = self.df["label"].astype(bool)

        tps, fps, fns, tns = [], [], [], []
        for t in thresholds:
            pred = self.df["score"] > t
            tps.append(self._EPS + float((truth & pred).sum()))    # True positives
            fps.append(self._EPS + float((~truth & pred).sum()))   # False positives
            fns.append(self._EPS + float((truth & ~pred).sum()))   # False negatives
            tns.append(self._EPS + float((~truth & ~pred).sum()))  # True negatives

        precisions = [tp / (tp + fp) for tp, fp in zip(tps, fps)]
        recalls = [tp / (tp + fn) for tp, fn in zip(tps, fns)]
        f_scores = [2 * (p * r) / (p + r) for p, r in zip(precisions, recalls)]
        accuracies = [(tp + tn) / (tp + fp + fn + tn)
                      for tp, fp, fn, tn in zip(tps, fps, fns, tns)]

        score_df = pd.DataFrame({"threshold": thresholds,
                                 "precision": precisions,
                                 "recall": recalls,
                                 "f_score": f_scores,
                                 "accuracy": accuracies})

        print(score_df)
        self.score_df = score_df

        return