### Imports and settings

In [None]:
# file system navigation
from pathlib import Path

# data transformation
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

# ml algorithms and evaluation metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn import model_selection
from sklearn.model_selection import StratifiedShuffleSplit
import scipy
from scipy.stats.distributions import uniform, randint

# sklearn specifics
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.exceptions import DataConversionWarning
from sklearn import warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# nlp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.pipeline import TextCategorizer
from spacy.util import minibatch, compounding
from spacy.util import decaying

# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# misc
import random
import copy
import pickle
import re
from collections import namedtuple
from typing import List

In [None]:
from helpers import *

In [None]:
# Project folders, resolved relative to the directory the notebook is started from.
DATA_PATH = Path.cwd() / "data" / "shared"  # location of the preprocessed parquet input
OUTPUT_PATH = Path.cwd() / "reports" / "images-and-tables"  # target passed to save_html / save_pickle
MODEL_PATH = Path.cwd() / "models"  # persisted estimators and the fastai working folder
SEED = 1  # random seed; also embedded in the `method` label of every saved result ("... v{SEED}")

In [None]:
# Make sure both output folders exist before any cell tries to write into them.
for folder in (OUTPUT_PATH, MODEL_PATH):
    folder.mkdir(parents=True, exist_ok=True)

### Helper visualizations

Generate a plot of interest in Machine Learning over time based on Google trends data.

In [None]:
# Load the Google Trends export. The first three lines of the file are skipped and
# the two columns are named explicitly because the file is read without a header row;
# the "date" column is parsed into datetimes so it can serve as the time axis.
google_trends_ml = pd.read_csv(Path.cwd() / "data" / "trends-ml.csv",
                               skiprows=3,
                               header=None,
                               names=["date", "interest"],
                               parse_dates=["date"])

In [None]:
google_trends_ml.head()

In [None]:
# Line chart of relative search interest over time (single series, so no legend).
interest_plot = google_trends_ml.plot(x="date", y="interest", legend=False)
interest_plot.set(xlabel="Date", ylabel="Relative interest");

In [None]:
# Persist the trend plot for the report. The setup cells only create OUTPUT_PATH and
# MODEL_PATH, so the separate images folder is created here on first use.
images_dir = Path.cwd() / "reports" / "images"
images_dir.mkdir(parents=True, exist_ok=True)
fig = interest_plot.get_figure()
fig.savefig(images_dir / "interest-in-ml.png")

### Data preparation

#### Load preprocessed data and split into train and test

In [None]:
data = pd.read_parquet(DATA_PATH / "train_data.parquet")

In [None]:
# Features: the two numeric columns plus the raw post text; target: the "interesting" flag.
X = data[["claps", "reading_time", "text"]]
y = np.array(data["interesting"])

# 70/30 split, stratified on the target so both parts keep the same class balance,
# and seeded so the split is reproducible for a given SEED.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3, random_state=SEED,
                                                    stratify=y)

Initialize custom evaluator and shuffle split generator to use across all modeling approaches below.

In [None]:
# Six stratified 80/20 shuffle splits of the training data, seeded for reproducibility.
sss = StratifiedShuffleSplit(n_splits=6, test_size=0.2, random_state=SEED)
# Project-specific evaluator from `helpers`, configured with a target precision of 0.8.
evaluator = CustomEvaluator(target_precision=0.8)

#### Generate a vocabulary of words specific to the corpus

We use a list of the 10k most frequent words in the English language, obtained from [this repository](https://github.com/first20hours/google-10000-english), to identify the words that are specific to the corpus of blog posts.

In [None]:
TOP_WORDS_PATH = Path.cwd() / "resources" / "top_words"
# One word per line, no header. na_filter=False keeps every entry as a literal string;
# with the default NA handling, words such as "null" or "nan" would be parsed as
# missing values and end up in the dictionary as the string "nan".
top_10k = pd.read_table(TOP_WORDS_PATH / "google-10000-english" / "google-10000-english.txt",
                        header=None,
                        na_filter=False)
# Map each lower-cased word to its 1-based rank (the row position in the file).
top_10k_dict = {str(word).lower(): rank + 1 for rank, word in top_10k.iloc[:, 0].items()}

In [None]:
texts = data["text"].copy()

In [None]:
# Normalise the raw texts in four passes: lower-case, then the three cleaning
# helpers from `helpers`, applied element-wise in this order.
texts = (texts
         .apply(lambda text: text.lower())
         .apply(clean_apostrophe)
         .apply(remove_punctuation)
         .apply(fix_specific))

In [None]:
# Whitespace-tokenise every cleaned text into a list of words.
sentences = texts.apply(lambda x: x.split()).values
# NOTE(review): build_vocab / check_coverage come from `helpers`; presumably they
# build a word-count vocabulary and return the words not covered by the top-10k
# dictionary — confirm against the helper implementations.
vocab = build_vocab(sentences)
oov = check_coverage(vocab, top_10k_dict)

In [None]:
# Keep only the first element of each pair in `oov`
# (presumably (word, count) tuples — confirm in helpers.check_coverage).
specific_vocab = [w for w, _ in oov]

### Data exploration

#### Summary statistics

Take a first look at the data and generate summary tables for the report

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data["interesting"].value_counts() / data.shape[0]

In [None]:
# Descriptive statistics for the two numeric features, rounded to two decimals for the report.
summary_numeric = (data[["claps", "reading_time"]]
                   .describe()
                   .round(2))
summary_numeric

In [None]:
save_html(df=summary_numeric, name="summary_numeric", out=OUTPUT_PATH)

In [None]:
summary_object = data[["author", "title", "text"]].describe()
summary_object

In [None]:
save_html(df=summary_object, name="summary_object", out=OUTPUT_PATH)

#### Exploratory visualization

In [None]:
data_base = data[["claps", "reading_time", "interesting"]]

In [None]:
x_index = 0
y_index = 1
target_names = ["not interesting", "interesting"]

colors = ["red", "green"]

# Draw one scatter series per class. The class label doubles as the position in
# `target_names`, so iterate over those labels directly rather than over
# range(len(data)), which only worked because zip() stopped after two colours.
for label, color in zip(range(len(target_names)), colors):
    class_rows = data_base[data_base["interesting"] == label]
    plt.scatter(np.array(class_rows.iloc[:, x_index]),
                np.array(class_rows.iloc[:, y_index]),
                label=target_names[label],
                c=color)

plt.xlabel(data_base.columns[x_index])
plt.ylabel(data_base.columns[y_index])
plt.legend(loc="upper right")

# The setup cells only create OUTPUT_PATH and MODEL_PATH, so create the images
# folder on demand before saving.
images_dir = Path.cwd() / "reports" / "images"
images_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(images_dir / "base_classifier.png")

### Baseline model

Create a baseline model that uses only the numerical features `claps` and `reading_time`, comparing three classes of classification models:

- Random forests
- Support vector machines
- Logistic regression

In [None]:
# Numeric-only feature matrices for the baseline models, as plain numpy arrays.
num_cols = ["claps", "reading_time"]
X_train_num, X_test_num = np.array(X_train[num_cols]), np.array(X_test[num_cols])

#### Random Forest 

In [None]:
# Small, seeded random forest used as the first baseline on the numeric features.
rf = RandomForestClassifier(n_estimators=20,
                            min_samples_leaf=3,
                            random_state=SEED)

In [None]:
fitted_rfs = fit_ensemble(rf, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_rfs, evaluator, X_test_num, y_test)

In [None]:
base_results_rf = evaluate_ensemble(fitted_rfs, evaluator, X_test_num, y_test, return_res=True, method="baseline_rf")

#### Support Vector Machine

In [None]:
# probability=True makes the SVC fit an internal, randomised cross-validation to
# calibrate probabilities, so the seed is fixed here as well, consistent with the
# text-based SVC pipelines further down.
svc = SVC(gamma="auto",
          probability=True,
          random_state=SEED)

In [None]:
fitted_svcs = fit_ensemble(svc, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_svcs, evaluator, X_test_num, y_test)

In [None]:
base_results_svc = evaluate_ensemble(fitted_svcs, evaluator, X_test_num, y_test, return_res=True, method="baseline_svc")

#### Logistic regression

In [None]:
lr = LogisticRegression(solver="liblinear", random_state=SEED)

In [None]:
fitted_lrs = fit_ensemble(lr, sss, X_train_num, y_train)

In [None]:
evaluate_ensemble(fitted_lrs, evaluator, X_test_num, y_test)

In [None]:
base_results_lr = evaluate_ensemble(fitted_lrs, evaluator, X_test_num, y_test, return_res=True, method="baseline_lr")

#### Collect and save baseline results

In [None]:
base_results = pd.concat([base_results_rf, base_results_svc, base_results_lr], axis=0)

In [None]:
base_results

In [None]:
save_html(df=base_results, name="summary_baseline_results", out=OUTPUT_PATH, index=False)

### Text based models

Prepare feature array for training text based models by extracting just the column containing the blog posts' text from `X_train`.

In [None]:
# Raw post texts as 1-D numpy arrays; these feed every text-based model below.
text_col = "text"
X_train_text, X_test_text = np.array(X_train[text_col]), np.array(X_test[text_col])

#### CountVectorizer + Classifier

Scikit-learn's `CountVectorizer` is the simplest approach to turning the blog posts' texts into numerical matrices. It counts the number of occurrences of each token in a text and creates a sparse matrix holding these counts for all posts.

##### Default values

First, let's do everything with default values to get a general feeling for how this approach performs.

In [None]:
count_vectorizer_specific = CountVectorizer(vocabulary=specific_vocab)

In [None]:
count_vectorizer = CountVectorizer()

###### Random Forest

With full vocab

In [None]:
# Token counts over the full vocabulary feeding a small, seeded random forest.
pipe_countvec_rf = make_pipeline(
    count_vectorizer,
    RandomForestClassifier(n_estimators=10, random_state=SEED, n_jobs=-1),
)
fitted_countvec_rf = fit_ensemble(pipe_countvec_rf, sss, X_train_text, y_train)
res_countvec_rf_full = evaluate_ensemble(
    fitted_countvec_rf, evaluator, X_test_text, y_test,
    return_res=True, method=f"countvec rf full v{SEED}",
)

In [None]:
save_pickle(res_countvec_rf_full, OUTPUT_PATH)

Only top k words specific to the corpus

In [None]:
# Same model as the full-vocabulary run, restricted to the corpus-specific vocabulary.
# The forest is seeded with SEED (not a literal 1) so that the "v{SEED}" label on the
# saved result really identifies the seed that produced it.
pipe_countvec_rf_specific = make_pipeline(count_vectorizer_specific, RandomForestClassifier(n_estimators=10,
                                                                                            random_state=SEED,
                                                                                            n_jobs=-1))
fitted_countvec_rf_specific = fit_ensemble(pipe_countvec_rf_specific, sss, X_train_text, y_train)
res_countvec_rf_specific = evaluate_ensemble(fitted_countvec_rf_specific, evaluator, X_test_text, y_test,
                                             return_res=True, method=f"countvec rf specific v{SEED}")

In [None]:
save_pickle(res_countvec_rf_specific, OUTPUT_PATH)

###### Support Vector Machine

We use scikit-learn's `StandardScaler` here as the SVC's default kernel (`rbf`) expects normalized features.

In [None]:
# Counts -> variance scaling (with_mean=False keeps the matrix sparse) -> seeded RBF SVC.
pipe_countvec_svc = make_pipeline(
    count_vectorizer,
    StandardScaler(with_mean=False),
    SVC(gamma="auto", probability=True, random_state=SEED),
)
fitted_countvec_svc = fit_ensemble(pipe_countvec_svc, sss, X_train_text, y_train)
res_countvec_svc_full = evaluate_ensemble(
    fitted_countvec_svc, evaluator, X_test_text, y_test,
    return_res=True, method=f"countvec svc full v{SEED}",
)

In [None]:
save_pickle(res_countvec_svc_full, OUTPUT_PATH)

In [None]:
# Same SVC pipeline, but counting only the corpus-specific vocabulary.
pipe_countvec_svc_specific = make_pipeline(
    count_vectorizer_specific,
    StandardScaler(with_mean=False),
    SVC(gamma="auto", probability=True, random_state=SEED),
)
fitted_countvec_svc_specific = fit_ensemble(pipe_countvec_svc_specific, sss, X_train_text, y_train)
res_countvec_svc_specific = evaluate_ensemble(
    fitted_countvec_svc_specific, evaluator, X_test_text, y_test,
    return_res=True, method=f"countvec svc specific v{SEED}",
)

In [None]:
save_pickle(res_countvec_svc_specific, OUTPUT_PATH)

##### Grid search on best default models

###### Full vocab + SVC

In [None]:
# Pipeline to tune: token counts -> variance scaling -> RBF SVC with probability output.
pipe = Pipeline([
    ("vec", CountVectorizer()),
    ("std", StandardScaler(with_mean=False)),
    ("svc", SVC(probability=True))
    ])
# Search space: lists are sampled uniformly, scipy distributions are sampled via rvs().
params = {"vec__stop_words": ["english", None],
          "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
          "vec__max_df": uniform(loc=0.8, scale=0.2),     # document-frequency ceiling in [0.8, 1.0]
          "vec__min_df": uniform(loc=0.0, scale=0.2),     # document-frequency floor in [0.0, 0.2]
          "vec__max_features": randint(low=1000, high=20000),
          "svc__C": scipy.stats.expon(scale=1.0),
          "svc__gamma": ["auto", "scale"],
          "svc__kernel": ["rbf"],
          "svc__class_weight": ["balanced", None]}

# Randomised (not exhaustive) search: 50 sampled configurations, 5-fold CV, ROC AUC.
# random_state makes both the sampled configurations and therefore the selected
# best estimator reproducible for a given SEED.
grid = RandomizedSearchCV(pipe,
                          params,
                          n_iter=50,
                          scoring="roc_auc",
                          cv=5,
                          random_state=SEED,
                          return_train_score=False)

In [None]:
grid_fitted = grid.fit(X_train_text, y_train)

In [None]:
joblib.dump(grid_fitted.best_estimator_, MODEL_PATH / "countvec_full_svc_grid_best.pkl", compress=1)

In [None]:
# Reload the persisted best estimator, pin the SVC seed, refit it on the shared
# shuffle splits and evaluate it on the held-out test set.
countvec_svc_full_grid_best = joblib.load(MODEL_PATH / "countvec_full_svc_grid_best.pkl")
countvec_svc_full_grid_best = countvec_svc_full_grid_best.set_params(svc__random_state=SEED)
fitted_countvec_svc_full_grid_best = fit_ensemble(countvec_svc_full_grid_best, sss,
                                                  X_train_text, y_train, print_progress=True)
# The label uses the same "best params countvec ..." prefix as the specific-vocab run.
# That way both tuned results match the "best_params_countvec*.pkl" pattern used to
# collect them, and neither is picked up by the "countvec*.pkl" pattern that collects
# the default-parameter runs.
# NOTE(review): assumes save_pickle derives the file name from `method` — confirm in helpers.
res_countvec_svc_full_best = evaluate_ensemble(fitted_countvec_svc_full_grid_best, evaluator, X_test_text, y_test,
                                               return_res=True, method=f"best params countvec svc full v{SEED}")

In [None]:
countvec_svc_full_grid_best.get_params()

In [None]:
save_pickle(res_countvec_svc_full_best, OUTPUT_PATH)

###### Specific vocab + SVC

In [None]:
# Same pipeline as for the full vocabulary, but counting only the corpus-specific words.
pipe = Pipeline([
    ("vec", CountVectorizer(vocabulary=specific_vocab)),
    ("std", StandardScaler(with_mean=False)),
    ("svc", SVC(probability=True))
    ])
# max_df, min_df and max_features are not searched here: CountVectorizer ignores
# all three when a fixed `vocabulary` is supplied, so sampling them would only
# spend search iterations on settings that cannot change the features.
# NOTE(review): ngram_range is kept, but n-grams longer than one token can only
# match vocabulary entries that contain spaces — check whether it is worth searching.
params = {"vec__stop_words": ["english", None],
          "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
          "svc__C": scipy.stats.expon(scale=1.0),
          "svc__gamma": ["auto", "scale"],
          "svc__kernel": ["rbf"],
          "svc__class_weight": ["balanced", None]}

# Randomised search: 50 sampled configurations, 5-fold CV, ROC AUC; seeded so the
# sampled configurations are reproducible for a given SEED.
grid = RandomizedSearchCV(pipe,
                          params,
                          n_iter=50,
                          scoring="roc_auc",
                          cv=5,
                          random_state=SEED,
                          return_train_score=False)

In [None]:
grid_fitted = grid.fit(X_train_text, y_train)

In [None]:
joblib.dump(grid_fitted.best_estimator_, MODEL_PATH / "countvec_specific_svc_grid_best.pkl", compress=1)

In [None]:
# Reload the persisted best specific-vocab estimator and pin the SVC seed in one
# step (set_params returns the estimator itself), then refit and evaluate it.
countvec_svc_specific_grid_best = (
    joblib
    .load(MODEL_PATH / "countvec_specific_svc_grid_best.pkl")
    .set_params(svc__random_state=SEED)
)
fitted_countvec_svc_specific_grid_best = fit_ensemble(
    countvec_svc_specific_grid_best, sss, X_train_text, y_train, print_progress=True,
)
res_countvec_svc_specific_best = evaluate_ensemble(
    fitted_countvec_svc_specific_grid_best, evaluator, X_test_text, y_test,
    return_res=True, method=f"best params countvec svc specific v{SEED}",
)

In [None]:
countvec_svc_specific_grid_best.get_params()

In [None]:
save_pickle(res_countvec_svc_specific_best, OUTPUT_PATH)

###### Create summary of grid search results and save to html

In [None]:
# Collect the saved tuned-model results by file-name pattern.
# NOTE(review): results are read from OUTPUT_PATH / "raw" although save_pickle is
# called with OUTPUT_PATH — presumably save_pickle writes into a "raw" subfolder;
# confirm in helpers.
list_countvec_grid = read_results(OUTPUT_PATH / "raw", "best_params_countvec*.pkl")

In [None]:
res = pd.concat(list_countvec_grid)

In [None]:
# Mean and standard deviation of the three AUC columns per base method.
res_groupby_method = res.groupby("base_method")
res_analysis_table = res_groupby_method.agg({c : ["mean", "std"] for c in ["mean train auc", "mean cv auc", "mean test auc"]})

In [None]:
out = res_analysis_table.sort_values([("mean test auc", "mean"), ("mean cv auc", "mean")], axis=0, ascending=False)
out.index.name = None

In [None]:
# Replace the column labels with shorter ones for the report. This relies on the
# aggregated columns being in exactly the order train / cv / test x mean / std.
index = pd.MultiIndex.from_product([["train auc", "cv auc", "test auc"], ["mean", "std"]])
out.columns = index

In [None]:
out

In [None]:
# Write the tuned-model summary table to HTML, keeping the method names (the index).
save_html(df=out,
          name="summary_countvec_svc_best_params",
          out=OUTPUT_PATH,
          index=True)

#### TfidfVectorizer + Classifier

The next approach to feature extraction from the text data that I want to try is the term frequency–inverse document frequency (TF-IDF) technique implemented in scikit-learn's `TfidfVectorizer`. This method starts from the same count matrix as the `CountVectorizer` but down-weights each token according to the number of documents in the corpus in which it appears, so that words common to most posts contribute less than words that are characteristic of only a few posts.

##### Default values

In [None]:
# Two TF-IDF vectorizers with default settings: one learns its vocabulary from the
# training texts, the other is restricted to the corpus-specific vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer_specific = TfidfVectorizer(vocabulary=specific_vocab)

###### Random Forest

Full vocab

In [None]:
# TF-IDF features over the full vocabulary feeding a small, seeded random forest.
pipe_tfidf = make_pipeline(
    tfidf_vectorizer,
    RandomForestClassifier(n_estimators=10, random_state=SEED, n_jobs=-1),
)
fitted_tfidf = fit_ensemble(pipe_tfidf, sss, X_train_text, y_train)
res_tfidf_rf_full = evaluate_ensemble(
    fitted_tfidf, evaluator, X_test_text, y_test,
    return_res=True, method=f"tfidf rf full v{SEED}",
)

In [None]:
save_pickle(res_tfidf_rf_full, OUTPUT_PATH)

Only specific vocab

In [None]:
# Same random forest, with TF-IDF restricted to the corpus-specific vocabulary.
pipe_tfidf_specific = make_pipeline(
    tfidf_vectorizer_specific,
    RandomForestClassifier(n_estimators=10, random_state=SEED, n_jobs=-1),
)
fitted_tfidf_specific = fit_ensemble(pipe_tfidf_specific, sss, X_train_text, y_train)
res_tfidf_rf_specific = evaluate_ensemble(
    fitted_tfidf_specific, evaluator, X_test_text, y_test,
    return_res=True, method=f"tfidf rf specific v{SEED}",
)

In [None]:
save_pickle(res_tfidf_rf_specific, OUTPUT_PATH)

###### Support Vector Machine

In [None]:
# TF-IDF -> variance scaling (sparse-safe) -> seeded RBF SVC with probability output.
pipe_tfidf_svc = make_pipeline(
    tfidf_vectorizer,
    StandardScaler(with_mean=False),
    SVC(gamma="auto", probability=True, random_state=SEED),
)
fitted_tfidf_svc = fit_ensemble(pipe_tfidf_svc, sss, X_train_text, y_train)
res_tfidf_svc_full = evaluate_ensemble(
    fitted_tfidf_svc, evaluator, X_test_text, y_test,
    return_res=True, method=f"tfidf svc full v{SEED}",
)

In [None]:
save_pickle(res_tfidf_svc_full, OUTPUT_PATH)

In [None]:
# Same SVC pipeline, with TF-IDF restricted to the corpus-specific vocabulary.
pipe_tfidf_svc_specific = make_pipeline(
    tfidf_vectorizer_specific,
    StandardScaler(with_mean=False),
    SVC(gamma="auto", probability=True, random_state=SEED),
)
fitted_tfidf_svc_specific = fit_ensemble(pipe_tfidf_svc_specific, sss, X_train_text, y_train)
res_tfidf_svc_specific = evaluate_ensemble(
    fitted_tfidf_svc_specific, evaluator, X_test_text, y_test,
    return_res=True, method=f"tfidf svc specific v{SEED}",
)

In [None]:
save_pickle(res_tfidf_svc_specific, OUTPUT_PATH)

##### Collect results from running the above with different seeds

In [None]:
list_countvec = read_results(OUTPUT_PATH / "raw", "countvec*.pkl")

In [None]:
list_tfidf = read_results(OUTPUT_PATH / "raw", "tfidf*.pkl")

In [None]:
# Stack all CountVectorizer results followed by all TF-IDF results into one frame.
results_tfidf_countvec = pd.concat([*list_countvec, *list_tfidf])

In [None]:
results_tfidf_countvec

In [None]:
# Appendix table: every individual run, best test AUC first (ties broken by CV AUC),
# without the grouping column and without the frame's index.
save_html(df=results_tfidf_countvec.drop("base_method", axis=1).sort_values(["mean test auc", "mean cv auc"], axis=0, ascending=False),
          name="countvec_tfidf_results_default_all_appendix",
          out=OUTPUT_PATH,
          index=False)

##### Summarize results and save overview table

In [None]:
res = results_tfidf_countvec.sort_values(["mean test auc", "mean cv auc"], axis=0, ascending=False)

In [None]:
# Mean and standard deviation of the three AUC columns per base method
# (i.e. aggregated over the runs collected above).
res_groupby_method = res.groupby("base_method")
res_analysis_table = res_groupby_method.agg({c : ["mean", "std"] for c in ["mean train auc", "mean cv auc", "mean test auc"]})

In [None]:
out = res_analysis_table.sort_values([("mean test auc", "mean"), ("mean cv auc", "mean")], axis=0, ascending=False)
out.index.name = None

In [None]:
index = pd.MultiIndex.from_product([["train auc", "cv auc", "test auc"], ["mean", "std"]])
out.columns = index

In [None]:
out

In [None]:
save_html(df=out,
          name="summary_countvec_tfidf_results_default",
          out=OUTPUT_PATH,
          index=True)

#### Pretrained word embeddings + neural network 

The below is heavily based on these two Kaggle Kernels:
- [Processing text when using word embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook)
- [Comparing word embeddings](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings/notebook)

##### Config

In [None]:
# Rough size measures per post: number of space-separated chunks ("words") and
# number of newline-separated chunks ("paragraphs").
text_len = data["text"].apply(lambda x: len(x.split(" ")))
text_para_len = data["text"].apply(lambda x: len(x.split("\n")))

In [None]:
print(f"There are {len(vocab)} distinct words in the blog posts.")
print()
print("Summary statistics on the number of words across all blog posts:")
print(f"   Mean: {text_len.mean():.2f}")
print(f"   Median: {text_len.median():.2f}")
print(f"   Maximum: {text_len.max()}")
print(f"   Minimum: {text_len.min()}")
print(f"   Standard deviation: {text_len.std():.2f}")
print()
print("Summary statistics on the number of paragraphs across all blog posts:")
print(f"   Mean: {text_para_len.mean():.2f}")
print(f"   Median: {text_para_len.median():.2f}")
print(f"   Maximum: {text_para_len.max()}")
print(f"   Minimum: {text_para_len.min()}")
print(f"   Standard deviation: {text_para_len.std():.2f}")

In [None]:
MAX_FEATURES = 10000 # tokenizer vocabulary cap; also used as the number of rows of the embedding layer
MAX_LEN = 1000 # sequence length every post is padded/truncated to (NOTE: reassigned to 2000 in the fastai section below)
EMBEDDING_PATH = Path.cwd() / "resources" / "embeddings"
EMBEDDING_FOLDER = EMBEDDING_PATH / "glove.840B.300d" # folder holding the GloVe text file and its pickle cache
SPECIFIC_ONLY = False # if True, only corpus-specific words get their pretrained vector copied in
L2_REG = None # L2 regularisation factor for the model; None (or 0) disables regularisation

##### Read embeddings

In [None]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a (token, vector) pair.

    Parameters
    ----------
    word
        First field of the line; returned unchanged.
    *arr
        Remaining fields, i.e. the vector components (numbers or numeric strings).

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a float32 numpy array built from ``arr``.
    """
    vector = np.asarray(arr, dtype="float32")
    return word, vector

In [None]:
# Parsing the GloVe text file is slow, so the parsed {word: vector} dict is cached
# as a pickle next to it and reused on later runs.
embeddings_cache = EMBEDDING_FOLDER / "embeddings_index.pkl"
if embeddings_cache.is_file():
    # NOTE: pickle.load can execute arbitrary code — only load a cache file that
    # was written by this notebook, never one obtained from an untrusted source.
    with open(embeddings_cache, "rb") as handle:
        embeddings_index = pickle.load(handle)
else:
    # Read with an explicit encoding (the platform default is not UTF-8 everywhere)
    # and inside a context manager so the file handle is closed again.
    with open(EMBEDDING_FOLDER / "glove.840B.300d.txt", encoding="utf8") as glove_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in glove_file)
    with open(embeddings_cache, "wb") as handle:
        pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

##### Preprocess the text data to work well with embeddings

In [None]:
# Work on copies of the raw text arrays, wrapped as Series so .apply can be used.
# Note that this rebinds X_train / X_test, which held the feature frames until now.
X_train = pd.Series(X_train_text.copy())
X_test = pd.Series(X_test_text.copy())

In [None]:
X_train = X_train.apply(lambda x: x.lower())
X_train = X_train.apply(lambda x: clean_apostrophe(x))
X_test = X_train.apply(lambda x: fix_punctuation(x))
X_train = X_train.apply(lambda x: fix_specific(x))

Apply the same transformations to the test set

In [None]:
X_test = X_test.apply(lambda x: x.lower())
X_test = X_test.apply(lambda x: clean_apostrophe(x))
X_test = X_test.apply(lambda x: fix_punctuation(x))
X_test = X_test.apply(lambda x: fix_specific(x))

##### Tokenize the texts 

In [None]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(X_train))

In [None]:
X_train = tokenizer.texts_to_sequences(X_train_text)
X_test = tokenizer.texts_to_sequences(X_test_text)

In [None]:
# Bring every sequence to exactly MAX_LEN entries so they form a rectangular array.
X_train = pad_sequences(X_train, maxlen=MAX_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_LEN)

##### Process embeddings into a matrix of size `(max_features, embed_size)`

In [None]:
# np.stack needs a real sequence; a dict view is rejected (or deprecated, depending
# on the numpy version), so materialise the vectors as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / std over all components; used to initialise rows without a pretrained vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

In [None]:
word_index = tokenizer.word_index

# Optionally keep only the corpus-specific words. Membership is tested against a
# set: testing against the list would rescan it for every word in the index.
if SPECIFIC_ONLY:
    specific_words = set(specific_vocab)
    word_index = {word: i for word, i in word_index.items() if word in specific_words}

In [None]:
# The Keras tokenizer numbers words from 1 (index 0 is reserved), so a vocabulary
# of n words needs n + 1 rows; the matrix is capped at MAX_FEATURES rows.
nb_words = min(MAX_FEATURES, len(tokenizer.word_index) + 1)
# Rows without a pretrained vector keep a random draw that matches the overall
# embedding statistics; seeded so the initialisation is reproducible for a given SEED.
rng = np.random.RandomState(SEED)
embedding_matrix = rng.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the actual number of rows, not MAX_FEATURES: the two differ
    # whenever the vocabulary is smaller than the cap.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

##### Define model

In [None]:
from keras import regularizers
# Shared L2 regulariser for the layers below; None when L2_REG is unset or 0.
reg = regularizers.l2(L2_REG) if L2_REG else None

In [None]:
# Architecture: pretrained embeddings -> bidirectional GRU -> max over time ->
# small dense layer with dropout -> single sigmoid unit (probability of the positive class).
inp = Input(shape=(MAX_LEN,))
# Take the input dimension from the matrix itself so the layer and its initial
# weights can never disagree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], embeddings_regularizer=reg)(inp)
x = Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=reg))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu", kernel_regularizer=reg)(x)
x = Dropout(0.6)(x)
x = Dense(1, activation="sigmoid")(x)


model = Model(inputs=inp, outputs=x)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

##### Fit model

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Early stopping (with restore_best_weights) selects the model on its validation
# data, so that data must not be the test set — otherwise the test AUC reported
# below is optimistically biased. Hold out a stratified slice of the training data
# instead, using the first split of the shared, seeded shuffle splitter.
train_idx, val_idx = next(sss.split(X_train, y_train))
model.fit(X_train[train_idx], y_train[train_idx], batch_size=16, epochs=10,
          validation_data=(X_train[val_idx], y_train[val_idx]),
          callbacks=[EarlyStopping(min_delta=0.005, patience=2, restore_best_weights=True)])

##### Evaluate model

In [None]:
# ROC AUC on the training and test sets, computed from the model's sigmoid outputs.
train_auc = metrics.roc_auc_score(y_train, model.predict(X_train))
test_auc = metrics.roc_auc_score(y_test, model.predict(X_test))

In [None]:
# One-row result frame; the seed is part of the method label so runs can be told apart.
results_keras = pd.DataFrame({"method": f"embeddings keras v{SEED}", "train auc": train_auc, "test auc": test_auc},
                             index=[0])

In [None]:
results_keras

In [None]:
save_pickle(results_keras, OUTPUT_PATH)

##### Collect results and save for report

In [None]:
list_keras = read_results(OUTPUT_PATH / "raw", "*keras*.pkl")

In [None]:
full_results_keras = pd.concat(list_keras)

In [None]:
full_results_keras

In [None]:
save_html(df=full_results_keras.drop("base_method", axis=1).sort_values(["test auc"], axis=0, ascending=False),
          name="keras_results_all_appendix",
          out=OUTPUT_PATH,
          index=False)

Aggregate results and save for report

In [None]:
# Mean and standard deviation of train / test AUC per base method.
res = full_results_keras.groupby("base_method").agg({c : ["mean", "std"] for c in ["train auc", "test auc"]})

In [None]:
out = res.sort_values([("test auc", "mean")], axis=0, ascending=False)
out.index.name = None

In [None]:
index = pd.MultiIndex.from_product([["train auc", "test auc"], ["mean", "std"]])
out.columns = index

In [None]:
out

In [None]:
save_html(df=out,
          name="keras_results_summary",
          out=OUTPUT_PATH,
          index=True)

#### Language model + neural network

In [None]:
from fastai.datasets import URLs
from fastai.text import TextList
from fastai.basic_data import DatasetType
from fastai.text.learner import text_classifier_learner
from fastai.text.learner import language_model_learner

In [None]:
# Working folder handed to fastai; created on first use.
PATH = MODEL_PATH / "fastai"
PATH.mkdir(parents=True, exist_ok=True)

In [None]:
MIN_N_TOKENS = 0  # texts with fewer space-separated tokens are dropped before training the classifier
BPTT = 1000  # passed as `bptt` to both the language-model and the classifier learner
MAX_LEN = 2000  # passed as `max_len` to the classifier learner (NOTE: rebinds the MAX_LEN used for the keras model above)
SPLIT_TEXTS = False  # if True, texts are exploded into chunks via helpers.explode_texts

Optionally, we split the long texts into chunks at newline characters, train on those chunks, and put the per-chunk results back together at the end.

In [None]:
# Text + label frames for the train and test parts.
X_train_full = pd.DataFrame({"text": X_train_text, "label": y_train})
X_test_full = pd.DataFrame({"text": X_test_text, "label": y_test})

# Build one combined frame with an `is_test` flag that marks the held-out rows.
if SPLIT_TEXTS:
    # NOTE(review): explode_texts comes from `helpers`; presumably one row per text chunk — confirm.
    X_train_full_exploded = explode_texts(X_train_full)
    X_test_full_exploded = explode_texts(X_test_full)

    X_train_full_exploded["is_test"] = False
    X_test_full_exploded["is_test"] = True

    data_proc = pd.concat([X_train_full_exploded, X_test_full_exploded], axis=0)
else:
    X_train_full["is_test"] = False
    X_test_full["is_test"] = True
    data_proc = pd.concat([X_train_full, X_test_full], axis=0)
    
# Number of space-separated tokens per row; reused later to filter short texts.
n_tokens = data_proc["text"].apply(lambda x: len(x.split(" ")))

print("Summary statistics on the number of words across all blog posts:")
print(f"   Mean: {n_tokens.mean():.2f}")
print(f"   Median: {n_tokens.median():.2f}")
print(f"   Maximum: {n_tokens.max()}")
print(f"   Minimum: {n_tokens.min()}")
print(f"   Standard deviation: {n_tokens.std():.2f}")

In [None]:
data_proc.head()

In [None]:
data_proc.shape

##### Train and Evaluate the Language Model (Metric : Accuracy)

In [None]:
# Language-model data: all texts in data_proc (train and test rows alike), with a
# random 20% held out for validating the language model.
# NOTE(review): the random split is not seeded here, so the LM validation set
# differs between runs — check whether the fastai version in use accepts a seed.
data_lm = (TextList.from_df(df=data_proc, path=PATH, cols="text")
             .random_split_by_pct(0.2)
             .label_for_lm()
             .databunch())

In [None]:
# Language-model learner initialised from the pretrained model referenced by URLs.WT103.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, bptt=BPTT, drop_mult=0.5)

In [None]:
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [None]:
lr = 5e-2

In [None]:
learn.fit_one_cycle(2, lr, moms=(0.8, 0.7))

In [None]:
learn.save("lm_fit_head")

In [None]:
learn.load("lm_fit_head");

In [None]:
learn.unfreeze()
learn.fit_one_cycle(4, lr/10, moms=(0.8, 0.7))

In [None]:
learn.save("lm_fine_tuned")

In [None]:
learn.save_encoder("lm_fine_tuned_enc")

##### Train and Evaluate the Classifier (Metric = ROC AUC)

###### Setup

In [None]:
# Keep only rows with at least MIN_N_TOKENS space-separated tokens (all rows for the default of 0).
clf_df = data_proc.copy()[n_tokens >= MIN_N_TOKENS]

In [None]:
clf_df.shape

In [None]:
# Classifier data: reuses the language model's vocabulary, uses the `is_test` flag
# to define the validation part, and takes the target from the "label" column.
data_clf = (TextList.from_df(df=clf_df, path=PATH, cols=["text"], vocab=data_lm.vocab)
               .split_from_df("is_test")
               .label_from_df(cols="label")
               .databunch(bs=8))

In [None]:
clf_df.head()

In [None]:
data_clf.show_batch()

###### Training

In [None]:
# Classifier learner that starts from the fine-tuned language-model encoder saved above.
# Note: this rebinds `learn`, which referred to the language-model learner until now.
learn = text_classifier_learner(data_clf, bptt=BPTT, max_len=MAX_LEN, drop_mult=0.3)
learn.load_encoder("lm_fine_tuned_enc")
learn.freeze()

In [None]:
learn.lr_find()
learn.recorder.plot();

In [None]:
lr = 5e-3

In [None]:
learn.fit_one_cycle(1, lr, moms=(0.8, 0.7))

In [None]:
learn.save("clf_first_stage")

In [None]:
learn.load("clf_first_stage");

In [None]:
# Second stage: freeze_to(-2), then one more one-cycle epoch with a sliced
# learning-rate range (lower bound = lr / 2.6**4).
# NOTE(review): presumably this unfreezes the last two layer groups (gradual
# unfreezing) — confirm against the fastai documentation for the version in use.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))

In [None]:
learn.save("clf_second_stage")

In [None]:
learn.load("clf_second_stage");

In [None]:
# Third stage: freeze_to(-3) and one epoch with the learning-rate range reduced by a factor of 5.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice((lr/5)/(2.6**4), lr/5), moms=(0.8, 0.7))

In [None]:
learn.save("clf_third_stage")

In [None]:
learn.load("clf_third_stage");

In [None]:
# Final stage: unfreeze everything and train two epochs with the learning-rate range reduced by a factor of 10.
learn.unfreeze()
learn.fit_one_cycle(2, slice((lr/10)/(2.6**4), lr/10), moms=(0.8, 0.7))

###### Evaluation

In [None]:
# Score every test text individually: element [2] of learn.predict's result,
# position 1 of it (presumably the probability of class 1 — confirm against fastai).
p1_test = clf_df.loc[clf_df.is_test == True, "text"].apply(lambda t: learn.predict(t)[2].numpy()[1])

In [None]:
# Same per-text scoring for the training rows.
p1_train = clf_df.loc[clf_df.is_test == False, "text"].apply(lambda t: learn.predict(t)[2].numpy()[1])

In [None]:
# TODO: the batched `get_preds` calls below do not produce results consistent with
# calling `predict` on each sample explicitly. This has not been investigated yet,
# so the slower per-sample version above is used for now.
#p1_test = learn.get_preds()[0][:, 1].numpy()
#p1_train = learn.get_preds(ds_type=DatasetType.Train)[0][:, 1].numpy()

In [None]:
# Report train / test performance of the fine-tuned classifier.
# NOTE(review): `results_lm_finetune` is only created in the else branch, so the
# cells below that display and save it fail with a NameError when SPLIT_TEXTS is True.
if SPLIT_TEXTS:
    print("TRAIN PERFORMANCE")
    evaluate_exploded(clf_df[clf_df.is_test == False], p1_train, y_train)
    print()
    print("TEST PERFORMANCE")
    evaluate_exploded(clf_df[clf_df.is_test == True], p1_test, y_test)
else:
    print("TRAIN PERFORMANCE")
    train_auc = metrics.roc_auc_score(clf_df.loc[clf_df.is_test == False, "label"], p1_train)
    print(f"AUC: {train_auc:.2f}")
    print()
    print("TEST PERFORMANCE")
    test_auc = metrics.roc_auc_score(clf_df.loc[clf_df.is_test == True, "label"], p1_test)
    print(f"AUC: {test_auc:.2f}")
    # One-row result frame, labelled with the seed like the other approaches.
    results_lm_finetune = pd.DataFrame({"method": f"lm fine tuning v{SEED}", "train auc": train_auc, "test auc": test_auc},
                         index=[0])

In [None]:
results_lm_finetune

In [None]:
save_pickle(results_lm_finetune, OUTPUT_PATH)

Collect results and save for report

In [None]:
# Collect the saved language-model results by file-name pattern.
# NOTE(review): every other read_results call in this notebook reads from
# OUTPUT_PATH / "raw"; this one reads from OUTPUT_PATH directly — confirm which
# folder save_pickle actually writes to.
list_lm_finetune = read_results(OUTPUT_PATH, "*lm*.pkl")

In [None]:
full_results_lm_finetune = pd.concat(list_lm_finetune)

In [None]:
full_results_lm_finetune

In [None]:
save_html(df=full_results_lm_finetune.drop("base_method", axis=1).sort_values(["test auc"], axis=0, ascending=False),
          name="lm_finetune_results_all_appendix",
          out=OUTPUT_PATH,
          index=False)

Summarize results and save summary

In [None]:
# Mean and standard deviation of train / test AUC per base method.
res = full_results_lm_finetune.groupby("base_method").agg({c : ["mean", "std"] for c in ["train auc", "test auc"]})

In [None]:
out = res.sort_values([("test auc", "mean")], axis=0, ascending=False)
out.index.name = None

In [None]:
index = pd.MultiIndex.from_product([["train auc", "test auc"], ["mean", "std"]])
out.columns = index

In [None]:
out

In [None]:
save_html(df=out,
          name="lm_finetune_results_summary",
          out=OUTPUT_PATH,
          index=True)