In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Main idea
pre-trained language model (wikitext 103) --> refined language model (trained on quora data) --> classifier

# Imports

In [None]:
# Must install specific version such that it's compatible with acc41 language model
# spaCy is removed and re-installed at the pinned version 2.2.4.
# fastai is upgraded WITHOUT a version pin, so the fastai release that ends up
# installed depends on when the notebook is run.
!pip uninstall spacy --yes
!pip install spacy==2.2.4
!pip install fastai --upgrade

In [None]:
########## Additional Import ##########
import fastai
from fastai.losses import *
from fastai.text.all import *
from sklearn.model_selection import train_test_split
from sklearn import metrics
import multiprocessing as mp
from math import floor
import pickle
import spacy
from matplotlib import pyplot
#######################################

In [None]:
# Load the competition training set and shuffle its rows in one step.
# frac=1 keeps every row (only the order changes); use a smaller fraction
# to work on a random subsample instead.
train_df = (
    pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
    .sample(frac=1)
)
print(train_df.shape)
train_df

In [None]:
# Number of rows per target class (sincere vs. insincere questions)
train_df["target"].value_counts()

In [None]:
## Create is_train column that defines which rows are in the training set
seed = 38
np.random.seed(seed)
train_proportion = 0.8
# Number of rows that go to the training side of the split.
proportion = floor(train_df.shape[0] * train_proportion)

# Draw the training rows from the index LABELS that are actually present in
# train_df. The membership test below is label-based (`index.isin`), so drawing
# from positions 0..n-1 is only correct while the labels happen to be a
# permutation of 0..n-1. After `sample(frac < 1)` the labels are a sparse subset
# of the original row ids and a positional draw would mark far fewer than
# `train_proportion` of the rows. Sorting first makes the draw independent of
# the current (shuffled) row order; for the full frame the candidate labels are
# still exactly 0..n-1.
idx = np.random.choice(np.sort(train_df.index.to_numpy()), proportion, replace=False)

# True for training rows, False for the held-out rows used for evaluation later.
train_df["is_train"] = train_df.index.isin(idx)
train_df

In [None]:
# Class counts inside the custom training split (rows flagged is_train)
train_df.loc[train_df["is_train"], "target"].value_counts()

In [None]:
# Class counts inside the held-out split (rows NOT flagged is_train)
train_df.loc[~train_df["is_train"], "target"].value_counts()

# 1. Language Model

When a `TextBlock` is passed to a `DataBlock`, the `DataBlock` tokenizes and numericalizes the text in parallel.

In [None]:
# Create the language-model DataBlock and its dataloaders.
# - The text is read from the 'question_text' column; is_lm=True builds language-model data.
# - RandomSplitter holds out dev_set_size (20%) of the rows as the dev set.
# - The dataloaders are built from ALL of train_df (both is_train and non-is_train rows);
#   no separate test set is created because that isn't the objective of this step.
dev_set_size = 0.2
quora_db = DataBlock(blocks=TextBlock.from_df('question_text', is_lm=True),
                       get_x=ColReader('text'), splitter=RandomSplitter(dev_set_size))
dls = quora_db.dataloaders(train_df, bs=128)
# Preview two batch items as a sanity check.
dls.show_batch(max_n=2)

* xxbos to indicate the beginning of a text
* xxmaj to indicate the next word was capitalized

In [None]:
"""
Creates a pre-trained langauge model (pretrained model: Wikitext 103) using AWD_LSTM architecture.
- Metrics: accuracy, perplexity (exponential of the loss)
- drop_mult is a parameter that controls the magnitude of all dropouts in that model 
(i.e. randomly drops input based on given probability for all layers, in this case we provide a value which scales the default probabilities)
- to_fp16 asserts that predictions are float16 values (helps speed up training on Nvidia GPUs with tensor cores)
"""

# Optional: load previously pickled language-model dataloaders instead of using the `dls` built above.
#dls = pickle.load(open("/kaggle/input/acc41-language-model/savelm.p", "rb"))
    
# Build the language-model learner on `dls` with drop_mult=0.3, tracking accuracy and perplexity.
learn = language_model_learner(
    dls, AWD_LSTM, drop_mult=0.3, 
    metrics=[accuracy, Perplexity()]).to_fp16()

In [None]:
# Search for an appropriate learning rate for the language model.
# The rate passed to fit_one_cycle in the next step is typed in by hand,
# so update it there after inspecting this output.
# Template: start_lr = ?, end_lr = ?
learn.lr_find()

By default, a pretrained Learner is in a frozen state, meaning that only the head of the model will train while the body stays frozen. Below, we fit one epoch and then save the model, so that it can be re-loaded and retrained later if we decide to fit more epochs.

In [None]:
# Trains only the last layer (the learner has not been unfrozen yet):
# one epoch at a learning rate of 0.03.
learn.fit_one_cycle(1, 0.03)
# Checkpoint the model under the name "1epoch" so this state can be re-loaded later.
learn.save("1epoch")

In [None]:
# Find a new learning rate now that one epoch has been fitted.
learn.lr_find()

Begin fine-tuning model, by un-freezing, then training for multiple epochs.

Note: the encoder is the entire model except the final layer, which converts activations to probabilities of picking each token in the vocabulary. Saving the encoder therefore saves everything but that final layer.

In [None]:
# Unfreeze all layers of the model so the whole network is trained.
learn.unfreeze()

# Begin Training
# epochs = 10

# SaveModelCallback saves a checkpoint named "languageModel" (the intent is to keep the model with the best val_loss).
# ReduceLROnPlateau monitors valid_loss and reduces the LR after 2 successive epochs
# with no decrease greater than 0.1.
# NOTE(review): fit_one_cycle drives the learning rate with its own schedule, so the
# reduction may have no lasting effect here - confirm against the fastai callback docs.
callbacks = [SaveModelCallback(fname="languageModel"), 
             ReduceLROnPlateau(monitor='valid_loss', min_delta=0.1, patience=2)]

# 10 epochs at a maximum learning rate of 1e-3.
learn.fit_one_cycle(10, 1e-3, cbs = callbacks)

# Save model except final layer (this is what's used for the classifier)
learn.save_encoder("finetuned")

# Plot loss
learn.recorder.plot_loss()

In [None]:
# Save dls so that we can load it for the classifier.
# The context manager closes (and flushes) the file even if pickling raises,
# instead of leaving the handle open for the rest of the kernel session.
with open("savelm.p", "wb") as lm_dls_file:
    pickle.dump(dls, lm_dls_file)

In [None]:
# Test the language model by generating a continuation for a user-supplied prompt.
# Loads the "languageModel" weights from the attached Kaggle dataset, which replaces
# whatever weights `learn` currently holds; use the commented alternative below to
# load the checkpoint written by this notebook instead.
learn.load("/kaggle/input/acc41-language-model/languageModel")
#learn.load("languageModel")
# input() waits for keyboard input, so this line needs an interactive session;
# switch to the hard-coded prompt below for unattended runs.
TEXT = input("Enter the start of a question: ")
#TEXT = "What do I do if"
# Number of words to generate after the prompt.
N_WORDS = 10
learn.predict(TEXT, N_WORDS, temperature=0.3) 

# 2. Language Classifier

In [None]:
# Create DataBlock for classifier using pretrained vocab from language model (to ensure that they use the same token to index mapping)
# Datablock will also handle any padding (for each batch pad by the size of the largest text [libraries cannot apply all batches with the same padding])
# Batches text with similar lengths

# NOTE: pickle.load can execute arbitrary code while loading; only load files
# produced by this notebook or by a dataset you trust.

# If available pre-load language model dataloader from binary:
# first try the copy in the working directory, then the attached Kaggle dataset.
try:
    with open("savelm.p", "rb") as lm_dls_file:
        dls = pickle.load(lm_dls_file)
except FileNotFoundError:
    with open("/kaggle/input/acc41-language-model/savelm.p", "rb") as lm_dls_file:
        dls = pickle.load(lm_dls_file)

# If available pre-load language classifier from binary.
# The cached dataloaders are only reused when seed == 42; for any other seed the
# exception below deliberately forces the dataloaders to be rebuilt.
try:
    if seed != 42: raise Exception("New random state, re-intialise data loader")
    with open("../input/9417-v1/dls_classifier.p", "rb") as clf_dls_file:
        dls_classifier = pickle.load(clf_dls_file)
    dls_classifier.show_batch(max_n=2)
except Exception:
    # `except Exception` (not a bare `except`) keeps the best-effort fallback
    # but lets KeyboardInterrupt / SystemExit stop the cell.
    # Create dataloader (purpose: preprocessing and batching)
    dev_set_size = 0.2
    dls_classifier = DataBlock(
        blocks=(TextBlock.from_df('question_text', vocab=dls.vocab), CategoryBlock),
        get_x=ColReader('text'),
        get_y=ColReader("target"),
        splitter=RandomSplitter(dev_set_size)
    ).dataloaders(train_df.loc[train_df.is_train], bs=512)  # only rows flagged is_train
    dls_classifier.show_batch(max_n=2)

    # Cache the freshly built dataloaders in the working directory.
    with open("dls_classifier.p", "wb") as clf_dls_file:
        pickle.dump(dls_classifier, clf_dls_file)

In [None]:
# Initialise testing data loader from the held-out rows (is_train == False).
# The question column is renamed to "text" because the classifier DataBlock reads
# its input with ColReader('text').
test_x = train_df.loc[~train_df.is_train].rename(columns={"question_text":"text"})
# with_labels=True so the test dataloader also carries the target labels.
test_x_dl = dls_classifier.test_dl(test_x, with_labels=True)

In [None]:
# Dump test data loader binaries for later use if necessary.
# The context manager guarantees the file is closed (and fully written)
# even if pickling fails part-way.
with open("test_x_dl.p", "wb") as test_dl_file:
    pickle.dump(test_x_dl, test_dl_file)

In [None]:
# Initialise RNN and load pre-trained language model
# Loss: focal loss. Metrics: F1-Score (the only metric configured here).
learn = text_classifier_learner(dls_classifier,
                                AWD_LSTM,
                                drop_mult=0.5,
                                loss_func=FocalLossFlat(),
                                metrics=[F1Score()]).to_fp16()
try:
    # load encoding if available in kaggle working directory
    learn = learn.load_encoder("finetuned")
except Exception:
    # Else load from kaggle data set.
    # `except Exception` keeps the best-effort fallback while no longer
    # swallowing KeyboardInterrupt / SystemExit like a bare `except` did.
    learn = learn.load_encoder("/kaggle/input/acc41-language-model/finetuned")

Fine-tune by gradually unfreezing layers

In [None]:
# Find lr on the un-refined classifier (encoder loaded, nothing fitted yet).
learn.lr_find()

In [None]:
# Fit one cycle (1 epoch) with the learning rate chosen from the finder output above;
# the value 0.001 is hard-coded, so update it if the finder suggests otherwise.
learn.fit_one_cycle(1, 0.001)

In [None]:
# Save first cycle as checkpoint "classifier_1"
learn.save("classifier_1")

In [None]:
# Find new lr before unfreezing further layers
learn.lr_find()

In [None]:
# Refine last 2 layers
# Use discriminative layer training: the learning rate is given as a slice
# from 1e-3 / 2.6**4 (lower bound) up to 1e-3 (upper bound).
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-3/(2.6**4),1e-3))

In [None]:
# Save 2nd cycle as checkpoint "classifier_2"
learn.save("classifier_2")

In [None]:
# Find new lr before unfreezing the next layer
learn.lr_find()

In [None]:
# Refine last 3 layers
# Use discriminative layer training: same learning-rate slice as the previous cycle
# (1e-3 / 2.6**4 up to 1e-3).
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(1e-3/(2.6**4), 1e-3))

In [None]:
# Save 3rd cycle as checkpoint "classifier_3"
learn.save("classifier_3")

In [None]:
# Find new lr before training the fully unfrozen model
learn.lr_find()

In [None]:
# Refine entire model: unfreeze everything and train for 4 epochs.
learn.unfreeze()
# SaveModelCallback writes the checkpoint "final_classifier";
# ReduceLROnPlateau monitors valid_loss with min_delta=0.1 and patience=2.
# NOTE(review): with only 4 epochs and a one-cycle schedule the plateau callback
# may rarely (or never) take effect - confirm it is still wanted here.
callbacks = [SaveModelCallback(fname="final_classifier"),
            ReduceLROnPlateau(monitor='valid_loss', min_delta=0.1, patience=2)]
# Same discriminative learning-rate slice as the earlier cycles (0.001 / 2.6**4 up to 0.001).
learn.fit_one_cycle(4, slice(0.001/(2.6**4),0.001), cbs = callbacks)

# Plot loss
learn.recorder.plot_loss()

# Predictions

In [None]:
# Reload model if necessary: rebuild the learner around the pickled classifier
# dataloaders and load the "final_classifier" weights from the attached dataset.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted dataset.
# The context manager closes the file handle once the dataloaders are read.
with open("/kaggle/input/d/juliangarratt/9417-v1/dls_classifier.p", "rb") as clf_dls_file:
    dls_classifier = pickle.load(clf_dls_file)
learn = text_classifier_learner(dls_classifier,
                                AWD_LSTM,
                                drop_mult=0.5,
                                loss_func=FocalLossFlat(),
                                metrics=[F1Score()],
                                model_dir="../input/d/juliangarratt/9417-v1/models").to_fp16()
# Weights are looked up under the model_dir given above.
learn.load("final_classifier")

In [None]:
# Test on validation set of dls_classifier (reports loss and f1-score, the configured metric)
learn.validate()

In [None]:
# Get Preds for the held-out rows (is_train == False).
test_x_df = train_df.loc[~train_df.is_train]
try:
    # Reuse the pickled test dataloader when it exists in the working directory.
    with open("./test_x_dl.p", "rb") as test_dl_file:
        test_dl = pickle.load(test_dl_file)
except Exception:
    # Rebuild the test dataloader if the pickle is not available locally.
    # The classifier DataBlock reads its input with ColReader('text'), so it must
    # be given a DataFrame with a "text" column (as done when the loader was first
    # built) rather than a plain list of strings, which cannot be indexed by
    # column name.
    test_dl = dls_classifier.test_dl(
        test_x_df.rename(columns={"question_text": "text"}), with_labels=True
    )
# preds[0] holds the per-class scores used in the cells below.
preds = learn.get_preds(dl=test_dl)

In [None]:
# Ground-truth labels of the held-out rows, plus hard class predictions taken
# as the index of the highest-scoring class in each row of preds[0].
y_true = test_x_df["target"].to_numpy()
class_scores = np.array(preds[0].tolist())
y_preds = class_scores.argmax(axis=1)

In [None]:
# F-Score on the held-out rows (out of sample)
out_of_sample_f1 = metrics.f1_score(y_true, y_preds)
print("Out of sample f-score:", out_of_sample_f1)

In [None]:
# Precision-Recall Curve
# Column 1 of preds[0] is used as the score of the positive class.
positive_scores = np.array(preds[0].tolist())[:, 1]
precision, recall, thresholds = metrics.precision_recall_curve(y_true, positive_scores)
# Baseline: share of positive labels in y_true.
no_skill = len(y_true[y_true==1]) / len(y_true)

fig, ax = pyplot.subplots(figsize=(8, 6), dpi=80)
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
ax.plot(recall, precision, marker='.', label='Model')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title("Precision-Recall Curve")
ax.legend()
pyplot.show()

In [None]:
# Calculate optimal threshold & plot curve from custom package
# (f_score_thresholding_utility is not part of this notebook; it must be available on the import path).
from f_score_thresholding_utility import *
# NOTE(review): this passes column 0 of preds[0], whereas the precision-recall
# curve uses column 1 as the positive-class score - confirm which column
# threshold_finder expects.
threshold_finder(np.array(preds[0].tolist())[:, 0].tolist(), y_true)

# Notes

## Observations
* Sincere qs
    * Often contain math
    * Often highly specialised e.g. scientific jargon
* Insincere qs
    * Highly political
    * Contain swear words

## Resources
* https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing
* https://stanford-cs221.github.io/autumn2019-extra/posters/162.pdf
* https://towardsdatascience.com/quora-insincere-questions-classification-d5a655370c47
* https://medium.com/@ph_singer/1st-place-in-kaggle-quora-insincere-questions-classification-competition-520616d39938

## Improvements
* Loss function --> focal loss (no improvements)
* Embeddings

## Previous results
* f-score 0.55 with dev_size=0.1 and sample=0.2
* f-score 0.63 with dev_size=0.2 and sample=0.8
* f-score 0.64 with dev_size=0.2 and sample=0.8
* f-score 0.64 (focal loss)