### Imports and settings

In [None]:
# file system navigation
from pathlib import Path

# data transformation
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

# ml algorithms and evaluation metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn import model_selection
from sklearn.model_selection import StratifiedShuffleSplit
from scipy.stats.distributions import uniform, randint

# sklearn specifics
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# nlp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.pipeline import TextCategorizer
from spacy.util import minibatch, compounding
from spacy.util import decaying

# misc
import random
import copy
import pickle
import re
from collections import namedtuple
from typing import List
from helpers import *

In [None]:
DATA_PATH = Path.cwd() / "data" / "processed"
OUTPUT_PATH = Path.cwd() / "reports" / "images"

### Helper visualizations

Generate a plot of interest in Machine Learning over time based on Google trends data.

In [None]:
google_trends_ml = pd.read_csv(Path.cwd() / "data" / "trends-ml.csv",
                               skiprows=3,
                               header=None,
                               names=["date", "interest"],
                               parse_dates=["date"])

In [None]:
google_trends_ml.head()

In [None]:
interest_plot = google_trends_ml.plot(x="date",
                      y="interest",
                      legend=False)
interest_plot.set_xlabel("Date")
interest_plot.set_ylabel("Relative interest")
interest_plot;

In [None]:
fig = interest_plot.get_figure()
fig.savefig(Path.cwd() / "reports" / "images" / "interest-in-ml.png")

### Data preparation

#### Load preprocessed data and split into train and test

In [None]:
data = pd.read_parquet(DATA_PATH / "train_data.parquet")

In [None]:
X = data[["claps", "reading_time", "text"]]
y = np.array(data["interesting"])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3, random_state=42,
                                                    stratify=y)

Initialize custom evaluator and shuffle split generator to use across all modeling approaches below.

In [None]:
evaluator = CustomEvaluator(target_precision=0.8)
sss = model_selection.StratifiedShuffleSplit(n_splits=6, test_size=0.2, random_state=42)

#### Generate a vocabulary of words specific to the corpus

In [None]:
# Folder holding the embedding / word-list resources.
EMBEDDING_PATH = Path.cwd() / "resources" / "embeddings"

# Read the word list into a single-column frame.
# keep_default_na=False stops pandas from turning list entries such as "null", "nan" or "NA"
# into missing values, which `str(word)` below would then collapse into the single key "nan".
top_10k = pd.read_table(EMBEDDING_PATH / "google-10000-english" / "google-10000-english.txt",
                        header=None,
                        keep_default_na=False)

# Map each lower-cased word to its 1-based row position in the file
# (presumably the frequency rank — confirm against the word-list source).
top_10k_dict = {str(word).lower(): rank + 1 for rank, word in top_10k.iloc[:, 0].items()}

In [None]:
texts = data["text"].copy()

In [None]:
texts = texts.apply(lambda x: x.lower())
texts = texts.apply(lambda x: clean_apostrophe(x))
texts = texts.apply(lambda x: remove_punctuation(x))
texts = texts.apply(lambda x: fix_specific(x))
#texts = texts.apply(lambda x: clean_numbers(x))

In [None]:
sentences = texts.apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
oov = check_coverage(vocab, top_10k_dict)

In [None]:
specific_vocab = [w for w, _ in oov[:1000]]

### Data exploration

#### Summary statistics

Take a first look at the data and generate summary tables for the report

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data["interesting"].value_counts() / data.shape[0]

In [None]:
summary_numeric = (data[["claps", "reading_time"]]
                   .describe()
                   .round(2))
summary_numeric

In [None]:
save_html(df=summary_numeric, name="summary_numeric", out=OUTPUT_PATH)

In [None]:
summary_object = data[["author", "title", "text"]].describe()
summary_object

In [None]:
save_html(df=summary_object, name="summary_object", out=OUTPUT_PATH)

#### Exploratory visualization

In [None]:
data_base = data[["claps", "reading_time", "interesting"]]

In [None]:
# Column positions (within data_base) of the features shown on the x and y axis.
x_index = 0
y_index = 1

# Display name and color per class; the list position equals the value of `interesting` (0 / 1).
target_names = ["not interesting", "interesting"]
colors = ["red", "green"]

# Iterate over the classes themselves; the original looped over range(len(data)) and only
# stopped after two iterations because zip() truncates to the shorter `colors` list.
for label, (target_name, color) in enumerate(zip(target_names, colors)):
    # filter once per class instead of once per axis
    class_rows = data_base[data_base["interesting"] == label]
    plt.scatter(np.array(class_rows.iloc[:, x_index]),
                np.array(class_rows.iloc[:, y_index]),
                label=target_name,
                c=color)

plt.xlabel(data_base.columns[x_index])
plt.ylabel(data_base.columns[y_index])
plt.legend(loc="upper right")

# OUTPUT_PATH is the same reports/images folder, defined once in the settings cell
plt.savefig(OUTPUT_PATH / "base_classifier.png")

### Baseline model

Create baseline models using just the numerical features `claps` and `reading_time`, based on three classes of classification models:

- Random forests
- Support vector machines
- Logistic regression

In [None]:
num_cols = ["claps", "reading_time"]
X_train_num, X_test_num = np.array(X_train[num_cols]), np.array(X_test[num_cols])

#### Random Forest 

In [None]:
rf = RandomForestClassifier(n_estimators=20,
                            min_samples_leaf=3,
                            random_state=42)

In [None]:
fitted_rfs = fit_ensemble(rf, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_rfs, evaluator, X_test_num, y_test)

In [None]:
base_results_rf = evaluate_ensemble(fitted_rfs, evaluator, X_test_num, y_test, return_res=True, method="baseline_rf")

#### Support Vector Machine

In [None]:
svc = SVC(gamma="auto",
          probability=True)

In [None]:
fitted_svcs = fit_ensemble(svc, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_svcs, evaluator, X_test_num, y_test)

In [None]:
base_results_svc = evaluate_ensemble(fitted_svcs, evaluator, X_test_num, y_test, return_res=True, method="baseline_svc")

#### Logistic regression

In [None]:
lr = LogisticRegression(solver="liblinear", random_state=42)

In [None]:
fitted_lrs = fit_ensemble(lr, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_lrs, evaluator, X_test_num, y_test)

In [None]:
base_results_lr = evaluate_ensemble(fitted_lrs, evaluator, X_test_num, y_test, return_res=True, method="baseline_lr")

#### Collect and save baseline results

In [None]:
base_results = pd.concat([base_results_rf, base_results_svc, base_results_lr], axis=0)

In [None]:
base_results

In [None]:
save_html(df=base_results, name="summary_baseline_results", out=OUTPUT_PATH, index=False)

### Text based models

Prepare feature array for training text based models by extracting just the column containing the blog posts' text from `X_train`.

In [None]:
text_col = "text"
X_train_text, X_test_text = np.array(X_train[text_col]), np.array(X_test[text_col])

#### CountVectorizer + Classifier

Scikit-learn's `CountVectorizer` is the simplest approach to turning the blog posts' texts into numerical matrices. It just counts the number of occurrences of each token in the text and creates a sparse matrix holding these counts for all posts.

##### Default values

First, let's do everything with default values to get a general feeling for how this approach performs.

In [None]:
count_vectorizer_specific = CountVectorizer(vocabulary=specific_vocab)

In [None]:
count_vectorizer = CountVectorizer()

###### Random Forest

With full vocab

In [None]:
pipe_countvec_rf = make_pipeline(count_vectorizer, RandomForestClassifier(n_estimators=10, random_state=1, n_jobs=-1))

In [None]:
fitted_countvec_rf = fit_ensemble(pipe_countvec_rf, sss, X_train_text, y_train)

In [None]:
res_countvec_rf_full = evaluate_ensemble(fitted_countvec_rf, evaluator, X_test_text, y_test,
                                         return_res=True, method="countvec rf full v1")

Only top k words specific to the corpus

In [None]:
pipe_countvec_rf_specific = make_pipeline(count_vectorizer_specific, RandomForestClassifier(n_estimators=10,
                                                                                            random_state=1,
                                                                                            n_jobs=-1))
fitted_countvec_rf_specific = fit_ensemble(pipe_countvec_rf_specific, sss, X_train_text, y_train)
res_countvec_rf_specific = evaluate_ensemble(fitted_countvec_rf_specific, evaluator, X_test_text, y_test, return_res=True, method="countvec rf specific v1")

###### Support Vector Machine

In [None]:
pipe_countvec_svc = make_pipeline(count_vectorizer, SVC(gamma="auto", probability=True))

In [None]:
fitted_countvec_svc = fit_ensemble(pipe_countvec_svc, sss, X_train_text, y_train)

In [None]:
res_countvec_svc_full = evaluate_ensemble(fitted_countvec_svc, evaluator, X_test_text, y_test, return_res=True, method="countvec svc full")

In [None]:
pipe_countvec_svc_specific = make_pipeline(count_vectorizer_specific, SVC(gamma="auto", probability=True))
fitted_countvec_svc_specific = fit_ensemble(pipe_countvec_svc_specific, sss, X_train_text, y_train)
res_countvec_svc_specific = evaluate_ensemble(fitted_countvec_svc_specific, evaluator, X_test_text, y_test, return_res=True, method="countvec svc specific")

##### Some optimization (preprocessing, feature selection, model tuning) using randomized search

In [None]:
# Pipeline: token counts restricted to the corpus-specific vocabulary, fed into a random forest.
pipe = Pipeline([
    ("vec", CountVectorizer(vocabulary=specific_vocab)),
    ("rf", RandomForestClassifier())
    ])

# Search space for RandomizedSearchCV: lists are sampled uniformly, scipy distributions are drawn from.
# NOTE(review): the scikit-learn docs state that max_df, min_df and max_features are ignored when a
# fixed `vocabulary` is passed to the vectorizer — confirm whether they should stay in this space.
params = {"vec__stop_words": ["english", None],
          "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
          "vec__max_df": uniform(loc=0.8, scale=0.2),
          "vec__min_df": uniform(loc=0.0, scale=0.2),
          "vec__max_features": randint(low=1000, high=9000),
          "rf__n_estimators": randint(low=10, high=40),
          "rf__max_depth": randint(low=2, high=8),
          "rf__min_samples_leaf": randint(low=1, high=10),
          # "auto" dropped: for classifiers it was an alias of "sqrt" and it was removed in scikit-learn 1.3
          "rf__max_features": [0.5, "sqrt"]}

# Settings shared by the three searches, kept in one place so they cannot drift apart.
search_kwargs = dict(n_iter=10,
                     scoring="roc_auc",
                     n_jobs=-1,
                     cv=5,
                     return_train_score=False)

# Three independent searches; no random_state is set, so each one samples different candidates.
grid_1 = RandomizedSearchCV(pipe, params, **search_kwargs)

grid_2 = RandomizedSearchCV(pipe, params, **search_kwargs)

grid_3 = RandomizedSearchCV(pipe, params, **search_kwargs)

In [None]:
_ = grid_1.fit(X_train_text, y_train)
_ = grid_2.fit(X_train_text, y_train)
_ = grid_3.fit(X_train_text, y_train)

In [None]:
estimator_1 = grid_1.best_estimator_.fit(X_train_text, y_train)
estimator_2 = grid_2.best_estimator_.fit(X_train_text, y_train)
estimator_3 = grid_3.best_estimator_.fit(X_train_text, y_train)

In [None]:
estimator_1

In [None]:
evaluate_ensemble([estimator_1], evaluator, X_test_text, y_test) 

In [None]:
estimator_2

In [None]:
evaluate_ensemble([estimator_2], evaluator, X_test_text, y_test) 

In [None]:
estimator_3

In [None]:
evaluate_ensemble([estimator_3], evaluator, X_test_text, y_test)

In [None]:
evaluate_ensemble([estimator_1, estimator_2, estimator_3], evaluator, X_test_text, y_test)

#### TfidfVectorizer + Classifier

The next approach to feature extraction from the text data that I want to try is the `Term-Frequency-Inverse-Document-Frequency` technique implemented in scikit-learn's `TfidfVectorizer`. This method starts from the same count matrix as the `CountVectorizer` but down-weights each token according to how many documents of the corpus it appears in, so that tokens occurring in almost every post contribute less than tokens that are specific to a few posts.

##### Default values

Full vocab

In [None]:
tfidf_vectorizer = TfidfVectorizer()
pipe_tfidf = make_pipeline(tfidf_vectorizer, RandomForestClassifier(n_estimators=10,
                                                                    random_state=1,
                                                                    n_jobs=-1))
fitted_tfidf = fit_ensemble(pipe_tfidf, sss, X_train_text, y_train)
res_tfidf_rf_full = evaluate_ensemble(fitted_tfidf, evaluator, X_test_text, y_test, return_res=True, method="tfidf rf full v1")

Only specific vocab

In [None]:
tfidf_vectorizer_specific = TfidfVectorizer(vocabulary=specific_vocab)
pipe_tfidf_specific = make_pipeline(tfidf_vectorizer_specific, RandomForestClassifier(n_estimators=10,
                                                                                      random_state=1,
                                                                                      n_jobs=-1))
fitted_tfidf_specific = fit_ensemble(pipe_tfidf_specific, sss, X_train_text, y_train)
res_tfidf_rf_specific = evaluate_ensemble(fitted_tfidf_specific, evaluator, X_test_text, y_test, return_res=True, method="tfidf rf specific v1")

##### Support Vector Machine

In [None]:
pipe_tfidf_svc = make_pipeline(tfidf_vectorizer, SVC(gamma="auto",
                                                     C=0.8,
                                                     probability=True))

In [None]:
fitted_tfidf_svc = fit_ensemble(pipe_tfidf_svc, sss, X_train_text, y_train)

In [None]:
res_tfidf_svc_full = evaluate_ensemble(fitted_tfidf_svc, evaluator, X_test_text, y_test, return_res=True, method="tfidf svc full")

In [None]:
# Same SVC setup as the full-vocabulary variant, but on the corpus-specific vocabulary only.
pipe_tfidf_svc_specific = make_pipeline(tfidf_vectorizer_specific, SVC(gamma="auto",
                                                                       C=0.8,
                                                                       probability=True))
fitted_tfidf_svc_specific = fit_ensemble(pipe_tfidf_svc_specific, sss, X_train_text, y_train)
# label fixed: this row was previously reported as "tfidf svc full", duplicating the full-vocabulary run
res_tfidf_svc_specific = evaluate_ensemble(fitted_tfidf_svc_specific, evaluator, X_test_text, y_test,
                                           return_res=True, method="tfidf svc specific")

##### Save results

In [None]:
# Stack the one-row result frames of all eight CountVectorizer / TfidfVectorizer runs.
# The undefined `res_tfidf_full` (NameError) is removed and the previously missing
# `res_tfidf_svc_specific` is added, so every evaluated model appears exactly once.
results_tfidf_countvec = pd.concat([res_countvec_rf_full, res_countvec_rf_specific,
                                    res_countvec_svc_full, res_countvec_svc_specific,
                                    res_tfidf_rf_full, res_tfidf_rf_specific,
                                    res_tfidf_svc_full, res_tfidf_svc_specific])

In [None]:
results_tfidf_countvec

In [None]:
save_html(df=results_tfidf_countvec, name="summary_countvec_tfidf_results", out=OUTPUT_PATH, index=False)

##### Grid search

In [None]:
# Pipeline: tf-idf features over the full vocabulary, fed into a random forest.
pipe = Pipeline([
    ("vec", TfidfVectorizer()),
    ("rf", RandomForestClassifier())
    ])

# Search space for RandomizedSearchCV: lists are sampled uniformly, scipy distributions are drawn from.
params = {"vec__stop_words": ["english", None],
          "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
          "vec__max_df": uniform(loc=0.8, scale=0.2),
          "vec__min_df": uniform(loc=0.0, scale=0.2),
          "vec__max_features": randint(low=1000, high=9000),
          "rf__n_estimators": randint(low=10, high=40),
          "rf__max_depth": randint(low=2, high=8),
          "rf__min_samples_leaf": randint(low=1, high=10),
          # "auto" dropped: for classifiers it was an alias of "sqrt" and it was removed in scikit-learn 1.3
          "rf__max_features": [0.5, "sqrt"]}

# Settings shared by the three searches.
search_kwargs = dict(n_iter=10,
                     scoring="roc_auc",
                     n_jobs=-1,
                     cv=5,
                     return_train_score=False)

# Each search gets its own seed. With the identical random_state=1 used before, all three
# searches sampled exactly the same ten candidates, so ensembling them added no diversity
# in hyperparameters.
grid_1_tfidf = RandomizedSearchCV(pipe, params, random_state=1, **search_kwargs)

grid_2_tfidf = RandomizedSearchCV(pipe, params, random_state=2, **search_kwargs)

grid_3_tfidf = RandomizedSearchCV(pipe, params, random_state=3, **search_kwargs)

In [None]:
_ = grid_1_tfidf.fit(X_train_text, y_train)
_ = grid_2_tfidf.fit(X_train_text, y_train)
_ = grid_3_tfidf.fit(X_train_text, y_train)

In [None]:
evaluate_ensemble([grid_1_tfidf.best_estimator_], evaluator, X_test_text, y_test)

In [None]:
evaluate_ensemble([grid_2_tfidf.best_estimator_], evaluator, X_test_text, y_test)

In [None]:
evaluate_ensemble([grid_3_tfidf.best_estimator_], evaluator, X_test_text, y_test)

In [None]:
evaluate_ensemble([grid_1_tfidf.best_estimator_, grid_2_tfidf.best_estimator_, grid_3_tfidf.best_estimator_],
                  evaluator,
                  X_test_text,
                  y_test)

#### Pretrained word embeddings + neural network 

The below is heavily based on these two Kaggle Kernels:
- [Processing text when using word embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook)
- [Comparing word embeddings](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings/notebook)

##### Config

In [None]:
MAX_FEATURES = 10000 # how many unique words to use (i.e num rows in embedding vector)
MAX_LEN = 1000 # max number of words in a blog post
EMBEDDING_PATH = Path.cwd() / "resources" / "embeddings"
EMBEDDING_FOLDER = EMBEDDING_PATH / "glove.840B.300d"
SPECIFIC_ONLY = True

##### Purpose specific imports

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

##### Read embeddings

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Arguments:
    word - the token (first field of an embedding file line)
    *arr - the remaining fields, converted to a float32 numpy array

    Return:
    tuple - (word, vector), suitable for feeding into dict()
    """
    vector = np.asarray(arr, dtype="float32")
    return word, vector

In [None]:
# Build the {word: float32 vector} lookup from the GloVe text file. The parsed dict is cached as a
# pickle next to the text file and reused whenever that cache exists.
embeddings_cache = EMBEDDING_FOLDER / "embeddings_index.pkl"

if embeddings_cache.is_file():
    # NOTE: unpickling executes code embedded in the file — only load a cache you created yourself
    with open(embeddings_cache, "rb") as handle:
        embeddings_index = pickle.load(handle)
else:
    # The context manager closes the text file (the previous bare open() inside the generator
    # never did); the explicit encoding avoids decoding with a platform-dependent default.
    with open(EMBEDDING_FOLDER / "glove.840B.300d.txt", encoding="utf-8") as glove_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in glove_file)
    with open(embeddings_cache, "wb") as handle:
        pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

##### Preprocess the text data to work well with embeddings

In [None]:
X_train = pd.Series(X_train_text.copy())
X_test = pd.Series(X_test_text.copy())

In [None]:
# Clean the training texts step by step: lower-case, normalize apostrophes, fix punctuation,
# then apply the corpus-specific fixes (cleaning helpers come from `helpers`).
X_train = X_train.apply(lambda x: x.lower())
X_train = X_train.apply(lambda x: clean_apostrophe(x))
# Bug fix: this result used to be assigned to X_test, which replaced the hold-out texts with
# training texts and left the training set without the punctuation fix.
X_train = X_train.apply(lambda x: fix_punctuation(x))
X_train = X_train.apply(lambda x: fix_specific(x))

Apply the same transformations to the test set

In [None]:
X_test = X_test.apply(lambda x: x.lower())
X_test = X_test.apply(lambda x: clean_apostrophe(x))
X_test = X_test.apply(lambda x: fix_punctuation(x))
X_test = X_test.apply(lambda x: fix_specific(x))

##### Tokenize text 

In [None]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(X_train))

In [None]:
# Convert each text into a list of integer word indices using the fitted tokenizer.
# NOTE(review): the inputs here are the raw X_train_text / X_test_text arrays, not the preprocessed
# X_train / X_test Series built in the cells above (which the tokenizer was fitted on), so the
# cleaning steps never reach the model input — confirm which is intended before changing it.
X_train = tokenizer.texts_to_sequences(X_train_text)
X_test = tokenizer.texts_to_sequences(X_test_text)

In [None]:
X_train = pad_sequences(X_train, maxlen=MAX_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_LEN)

##### Process embeddings into a matrix of size `(max_features, embed_size)`

In [None]:
# Stack all pretrained vectors into one 2-D array to get their overall mean / std, which are used
# to randomly initialize rows for words without a pretrained vector.
# np.stack requires a real sequence: a dict view triggers a FutureWarning on NumPy >= 1.16 and a
# TypeError on current releases, hence the list().
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

In [None]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

if SPECIFIC_ONLY:
    # Keep only words from the corpus-specific vocabulary. A set gives O(1) membership tests;
    # the list was scanned once for every word in the tokenizer vocabulary.
    specific_vocab_set = set(specific_vocab)
    word_index = {word: i for word, i in word_index.items() if word in specific_vocab_set}

In [None]:
# Number of rows in the embedding matrix. Keras' Tokenizer assigns indices starting at 1
# (0 is reserved), so a vocabulary of n words needs n + 1 rows; capped at MAX_FEATURES.
nb_words = min(MAX_FEATURES, len(tokenizer.word_index) + 1)

# Start from random vectors drawn with the mean / std of the pretrained embeddings, then
# overwrite the rows of all words that have a pretrained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Bound against the actual matrix size: comparing with MAX_FEATURES raised an IndexError
    # whenever the vocabulary was smaller than MAX_FEATURES.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

##### Define model

In [None]:
# Architecture: pretrained embeddings -> bidirectional GRU -> max pooling over time ->
# small dense layer with dropout -> sigmoid output for the binary target.
inp = Input(shape=(MAX_LEN,))
# Take the vocabulary size from the weight matrix itself so that the layer and its initial
# weights always agree, also when the matrix has fewer than MAX_FEATURES rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(GRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.4)(x)
x = Dense(1, activation="sigmoid")(x)


model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None"
model.summary()

##### Fit model

In [None]:
from keras.engine.training import Model as keras_model

In [None]:
fitted_keras = fit_ensemble(model, sss, X_train, y_train, print_progress=True,
                            batch_size=16, epochs=3, verbose=0, validation_data=(X_test, y_test))

In [None]:
model.fit(X_train, y_train,
          batch_size=8,
          epochs=5,
          validation_data=(X_test, y_test))

##### Evaluate model

In [None]:
from keras.engine.training import Model as keras_model

In [None]:
def evaluate_ensemble(fitted:List, eval:"CustomEvaluator", X_test:np.ndarray,
                      y_test:np.ndarray, return_res:bool=False, method:str="default") -> pd.DataFrame:
    """Evaluate the performance of a set of classifiers trained on different subsets of the training set

    The positive-class probabilities of all models are averaged and the averaged prediction is
    scored on the hold out set.

    Arguments:
    fitted - non-empty list of either named tuples (fields `clf`, `train_auc`, `oob_auc`) or
             bare fitted estimators exposing `predict_proba`
    eval - object of class CustomEvaluator used to evaluate the performance on the hold out set
    X_test - numpy array of the features / texts for the hold out set for final evaluation
    y_test - numpy array of labels for the hold out set for final evaluation
    return_res - if True, return a one-row summary instead of only printing the evaluation
    method - label stored in the `method` column of the summary

    Return:
    pd.DataFrame - if requested, return pandas dataframe summarizing the results (None otherwise).
                   Train / cv AUC are NaN when `fitted` holds bare estimators without these scores.

    Raises:
    ValueError - if `fitted` is empty
    TypeError - if a wrapped model is neither a `predict_proba` estimator nor a keras model
    """

    if len(fitted) == 0:
        raise ValueError("`fitted` must contain at least one fitted model")

    # Only the named tuples carry train / out-of-bag scores; bare estimators leave these as None.
    train_scores = None
    oob_scores = None

    if hasattr(fitted[0], "clf"):
        train_scores = [m.train_auc for m in fitted]
        oob_scores = [m.oob_auc for m in fitted]

        if hasattr(fitted[0].clf, "predict_proba"):
            preds_test = np.array([m.clf.predict_proba(X_test)[:, 1] for m in fitted])
        elif isinstance(fitted[0].clf, keras_model):
            # keras returns shape (n_samples, 1); flatten to match the predict_proba branch
            preds_test = np.array([np.ravel(m.clf.predict(X_test)) for m in fitted])
        else:
            # previously fell through and failed later with an unbound `preds_test`
            raise TypeError(f"Unsupported model type: {type(fitted[0].clf).__name__}")

        print(f"Mean Train AUC: {np.mean(train_scores):.2f} (+/- {np.std(train_scores):.2f})")
        print(f"Mean OOB AUC: {np.mean(oob_scores):.2f} (+/- {np.std(oob_scores):.2f})")
        print("")

    else:
        preds_test = np.array([m.predict_proba(X_test)[:, 1] for m in fitted])

    # ensemble prediction = mean over the individual models
    mean_preds = preds_test.mean(axis=0)

    print("Performance on hold out set:")
    if return_res:
        test_auc = eval.score(y_test, mean_preds, return_res)
        # NaN instead of a NameError when no train / cv scores are available
        mean_train_auc = np.mean(train_scores) if train_scores is not None else np.nan
        mean_cv_auc = np.mean(oob_scores) if oob_scores is not None else np.nan
        return pd.DataFrame({"method": method,
                             "mean train auc" : mean_train_auc,
                             "mean cv auc" : mean_cv_auc,
                             "mean test auc" : test_auc}, index=[0])
    else:
        eval.score(y_test, mean_preds)

In [None]:
evaluate_ensemble(fitted_keras, evaluator, X_test, y_test)

#### SpaCy language model

[Instructions from SpaCy documentation](https://spacy.io/usage/training#section-textcat)

In [None]:
class CustomSpacyClassifier():
    """ Wrapper for spaCy's text classification that enables integration with sklearn.metrics.cross_validate

    Exposes `get_params`, `fit` and `predict_proba` for a single binary label.
    """

    def __init__(self):
        self._estimator_type = "classifier"

        self.nlp = None         # spaCy pipeline, loaded in fit()
        self.label = None       # name of the text category, set by add_textcat()
        self.train_data = None  # list of (text, {"cats": {label: bool}}) tuples built in fit()

    def get_params(self, deep=True):
        """Return the constructor parameters (there are none)."""
        return dict()

    def add_textcat(self, label):
        """Make sure the pipeline has a `textcat` component and register `label` on it."""
        self.label = label
        if "textcat" not in self.nlp.pipe_names:
            textcat = self.nlp.create_pipe("textcat")
            self.nlp.add_pipe(textcat, last=True)
        # otherwise, get it, so we can add labels to it
        else:
            textcat = self.nlp.get_pipe("textcat")
        textcat.add_label(label)

    def fit(self, X, y, n_iter=10, **kwargs):
        """Train the text categorizer on texts `X` with binary labels `y`.

        Arguments:
        X - iterable of texts
        y - iterable of labels, converted with bool()
        n_iter - number of passes over the training data
        **kwargs - `drop_rate` (dropout used in nlp.update); optional, defaults to 0.2

        Return:
        self - the fitted classifier, following the scikit-learn convention
        """

        self.nlp = spacy.load("en")
        self.add_textcat("interesting")
        self.train_data = [(e, {"cats": {self.label: bool(l)}}) for e, l in zip(X, y)]

        # optional now: a missing drop_rate used to raise a KeyError
        drop_rate = kwargs.get("drop_rate", 0.2)

        other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "textcat"]
        with self.nlp.disable_pipes(*other_pipes):  # only train textcat
            optimizer = self.nlp.begin_training()
            for i in range(n_iter):
                print(f"EPOCH {i+1}")
                losses = {}
                # reshuffle every epoch, as in spaCy's training example, so the batches
                # are not identical from one epoch to the next
                random.shuffle(self.train_data)
                batches = minibatch(self.train_data, size=compounding(4., 16., 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self.nlp.update(texts, annotations, sgd=optimizer, drop=drop_rate,
                               losses=losses)
                # .get(): `losses` stays empty when there was no training data
                loss = losses.get("textcat", 0.0)
                print(f"LOSS: {loss}")
                print("")

        return self

    def predict_proba(self, X):
        """Return an array of shape (n_samples, 2) holding [P(not label), P(label)] per text."""
        if self.nlp is None:
            raise RuntimeError("CustomSpacyClassifier must be fitted before calling predict_proba")

        # use the registered label instead of repeating the hard-coded string
        p1_scores = [np.float64(self.nlp(sample_text).cats[self.label]) for sample_text in X]

        # reshape keeps the (n, 2) layout for an empty input as well
        return np.array([[1. - score, score] for score in p1_scores], dtype=np.float64).reshape(-1, 2)

In [None]:
nlp = spacy.load("en")

In [None]:
clf_spacy = CustomSpacyClassifier()

In [None]:
fitted_spacy = fit_ensemble(clf_spacy, sss, X_train_text, y_train, n_iter=5, drop_rate=0.6)

In [None]:
evaluate_ensemble(fitted_spacy, evaluator, X_test_text, y_test)

In [None]:
for m in fitted_spacy:
    evaluate_ensemble([m], evaluator, X_test_text, y_test)

#### Fastai

In [None]:
from fastai import *
from fastai.text import *

In [None]:
data = pd.read_csv(Path.cwd() / "data" / "shared" / "train_data_fastai.csv",
                   header=None)

In [None]:
data.head()

##### Train and evaluate using custom metric 

In [None]:
data.columns = ["label", "text"]

In [None]:
# Working directory for the fastai experiments (data bunches and saved models).
PATH = Path.cwd() / "data" / "shared" / "fastai"
EXP_PATH = PATH / "exp"
# pathlib instead of os.makedirs: `os` is not imported explicitly in the visible cells and
# only resolved through the star imports.
EXP_PATH.mkdir(parents=True, exist_ok=True)

###### Train and Evaluate the Language Model (Metric : Accuracy)

In [None]:
data_lm = (TextList.from_df(df=data, path=EXP_PATH, cols="text")
             .random_split_by_pct()
             .label_for_lm()
             .databunch())

In [None]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.3)

In [None]:
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [None]:
learn.fit_one_cycle(2, 5e-2, moms=(0.8,0.7))

In [None]:
learn.save("fit_head")

In [None]:
learn.load("fit_head");

In [None]:
learn.unfreeze()
learn.fit_one_cycle(3, 5e-3, moms=(0.8,0.7))

In [None]:
learn.save("fine_tuned")

In [None]:
learn.save_encoder("fine_tuned_enc")

###### Train and Evaluate the Classifier (Metric = F_beta)

In [None]:
data_clf = (TextList.from_df(df=data, path=EXP_PATH, cols=["text"], vocab=data_lm.vocab)
               .random_split_by_pct()
               .label_from_df(cols="label")
               .databunch(bs=8))

My initial objective was to achieve `precision = 0.95` and `recall = 0.75`. As `0.75 / 0.95` is approx. `0.8`, I will use the fbeta score with `beta = 0.8` to evaluate my classifier. Let's calculate the benchmark first:  

In [None]:
def f_beta(beta, pr, rc):
    """Compute the F-beta score from a precision and a recall value.

    Arguments:
    beta - weight of recall relative to precision (beta < 1 favours precision)
    pr - precision
    rc - recall

    Return:
    float - (1 + beta^2) * pr * rc / (beta^2 * pr + rc); 0.0 when the denominator is zero
            (e.g. precision and recall both 0) instead of raising ZeroDivisionError
    """
    beta2 = beta**2
    denominator = beta2*pr + rc
    if denominator == 0:
        return 0.0
    return (1+beta2) * pr*rc / denominator

In [None]:
f_beta(0.8, 0.95, 0.75)

In [None]:
class FBetaBinary(Callback):
    """F-beta metric for binary classification, accumulated over all batches of an epoch.

    A sample is predicted positive when its softmax probability for class 1 exceeds `thresh`.
    The `sigmoid` argument is stored but not used by any method of this class.
    """

    def __init__(self, beta=1, eps=1e-9, sigmoid=True, thresh=0.5):
        self.thresh = thresh
        self.sigmoid = sigmoid
        self.eps = eps
        self.beta2 = beta**2

    def on_epoch_begin(self, **kwargs):
        # reset the running counts: true positives, predicted positives, actual positives
        self.TP = 0
        self.total_y_pred = 0
        self.total_y_true = 0

    def on_batch_end(self, last_output, last_target, **kwargs):
        # class probabilities from the raw model output, then thresholded class-1 decisions
        probs = last_output.softmax(dim=1)
        predicted_pos = (probs[:, 1]>self.thresh).float()
        actual_pos = last_target.float()

        hits = (predicted_pos==1) * (actual_pos==1)
        self.TP += hits.float().sum()
        self.total_y_pred += (predicted_pos==1).float().sum()
        self.total_y_true += (actual_pos==1).float().sum()

    def on_epoch_end(self, **kwargs):
        # eps keeps the divisions defined when a count is zero
        precision = self.TP / (self.total_y_pred+self.eps)
        recall = self.TP / (self.total_y_true+self.eps)
        self.metric = (precision*recall) / (precision*self.beta2+recall+self.eps) * (1+self.beta2)

In [None]:
metrics = []

for t in np.arange(0.1, 0.4, 0.05):
    metrics.append(FBetaBinary(beta=0.8, thresh=t))

In [None]:
learn = text_classifier_learner(data_clf, drop_mult=0.5)
learn.load_encoder("fine_tuned_enc")
learn.metrics = metrics
learn.freeze()

In [None]:
learn.lr_find()
learn.recorder.plot();

In [None]:
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7))

In [None]:
learn.save("first")

In [None]:
learn.load("first");

In [None]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

In [None]:
learn.save("second")

In [None]:
learn.load("second");

In [None]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

In [None]:
learn.save("three")

In [None]:
learn.load("three");

In [None]:
learn.unfreeze()
learn.fit_one_cycle(2, slice(5e-4/(2.6**4),5e-4), moms=(0.8,0.7))