# Quora Insincere Question Classifier
* Taken from the framework found in FastAI lesson 4 IMDB classifier [notebook](https://nbviewer.jupyter.org/github/fastai/course-v3/blob/master/nbs/dl1/lesson3-imdb.ipynb)
* The language model does not use the embeddings provided in the competition; instead it uses the pre-trained model included with FastAI

In [None]:
import os, gc
from fastai.text import *
import pandas as pd
from fastai import *
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [None]:
# Load the competition train/test CSVs from the ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# Preview the first rows of the training frame.
train.head()

In [None]:
# Histogram (40 bins, raw counts) of the length of each question asked.
question_lengths = train.question_text.apply(len)
plt.hist(question_lengths, bins=40, density=False)

In [None]:
# Seed NumPy's global RNG before sampling so the draw is repeatable.
np.random.seed(42)
# Sample the rows directly. The previous `train.iloc[sample.index]` fed index
# *labels* to a *positional* indexer, which only works while `train` still has
# its pristine 0..n-1 index and raises IndexError if this cell is re-run.
train_small = train.sample(frac=0.99)
train = train_small  #Comment out this cell later, taking smaller dataset in order to run through process
# Class balance: number of rows per `target` value.
plt.bar(["False",'True'], train.groupby('target').count().qid)

### Hold out a 20% sample of the data to validate the model within the kernel

In [None]:
# (rows, columns) of the training frame after the subsampling above.
train.shape

In [None]:
# Fraction of rows held out for validation.
sample_size = 0.2
train_df = train.sample(frac=(1 - sample_size))
# Validation set = every row that was NOT drawn into train_df.
# The membership test must be against train_df.index: passing the DataFrame
# itself makes `isin` compare row labels with the *column names*, which never
# match, so the whole of `train` (training rows included) ended up in valid_df.
valid_df = train[~train.index.isin(train_df.index)]

In [None]:
%%time
data_lm = TextLMDataBunch.from_df(path = '.',
                            train_df = train_df,
                            valid_df = valid_df,
                            test_df = test,
                            text_cols = 'question_text',
                            label_cols = 'target',
                            max_vocab = 20000)
print(len(data_lm.vocab.itos))
data_lm.save()

In [None]:
# Display a sample batch from the language-model data bunch.
data_lm.show_batch()

In [None]:
# Peek at vocabulary entries 100-104 (itos: index -> token string).
data_lm.vocab.itos[100:105]

Create classification data-bunch. This sets the labels / targets to the actual labels of the data. 

In [None]:
%%time
data_class = TextClasDataBunch.from_df(path = '.',
                                       train_df = train_df,
                                       valid_df = valid_df,
                                       test_df = test,
                                       text_cols = 'question_text',
                                       label_cols = 'target',
                                       max_vocab = 20000,
                                       vocab=data_lm.vocab)

In [None]:
# Display a sample batch from the classification data bunch.
data_class.show_batch()

## Train Language Model, starting from wiki103 
* Note that the method followed here uses pretrained embeddings from the fastai library, NOT the embeddings from the Quora competition. This is probably fine for outside work but would disqualify a submission from the Quora competition

In [None]:
# Directory (one level up) that holds the downloaded pretrained files.
path = Path("../")
model_path = path/'models'
model_path.mkdir(exist_ok=True)
url = 'http://files.fast.ai/models/wt103_v1/'
# Fetch the wt103 weights file and its matching itos vocabulary file.
for fname in ('lstm_wt103.pth', 'itos_wt103.pkl'):
    download_url(f'{url}{fname}', model_path/fname)

In [None]:
# Language-model learner on data_lm, initialised from the downloaded wt103
# weights/vocab files found in model_path.
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    drop_mult=0.3,
    pretrained_fnames=['lstm_wt103', 'itos_wt103'],
    model_dir=model_path,
)

In [None]:
# Run the learning-rate finder on the language model and plot what the recorder captured.
learn.lr_find()
learn.recorder.plot()

In [None]:
# First fine-tuning pass: a single one-cycle run with learning rate 1e-1
# (before the learner is unfrozen in the next cell).
learn.fit_one_cycle(1, 1e-1)

In [None]:
# Unfreeze the learner and run one more one-cycle pass.
# NOTE(review): 1e-1 is the same learning rate as the pre-unfreeze pass; confirm
# against the lr_find plot that it is not too high for unfrozen training.
learn.unfreeze()
learn.fit_one_cycle(1, 1e-1)

In [None]:
# Save the fine-tuned encoder as 'ft_enc'; the classifier loads it later via load_encoder('ft_enc').
learn.save_encoder('ft_enc')

In [None]:
# Peek at vocabulary entries 10-19.
data_lm.vocab.itos[10:20]

In [None]:
# Sanity check: ask the fine-tuned language model to continue a prompt
# (second argument 10 is presumably the number of words to generate).
learn.predict('Why in the earth', 10)
#So this looks great, ha

## Train Classifier
The F1 metric used to score the Quora competition; this implementation was found in the fastai forum [Comment from wyquek](https://forums.fast.ai/t/f1-score-as-metric/30370/14)

In [None]:
def f1_score(y_pred, targets):
    """Return the F1 score of the positive class (label 1) for one batch.

    Args:
        y_pred: Tensor of per-class scores; the predicted label is the argmax
            over the last dimension.
        targets: Tensor of ground-truth 0/1 labels (int, float or bool) with
            the same shape as the argmax of ``y_pred``.

    Returns:
        A scalar float tensor. It is 0 when there are no true positives.

    NOTE(review): when passed as a fastai metric this is presumably evaluated
    per batch and averaged, which is not the same as F1 over the whole
    validation set -- confirm before comparing with leaderboard scores.
    """
    epsilon = 1e-07  # keeps the divisions finite when a denominator is 0

    # Cast both operands to float once so every count below has the same dtype
    # (and so bool targets, which do not support `1 - x`, are accepted).
    y_pred = y_pred.argmax(dim=-1).float()
    targets = targets.float()

    tp = (y_pred * targets).sum(dim=0)
    fp = ((1 - targets) * y_pred).sum(dim=0)
    fn = (targets * (1 - y_pred)).sum(dim=0)

    p = tp / (tp + fp + epsilon)
    r = tp / (tp + fn + epsilon)

    f1 = 2 * p * r / (p + r + epsilon)
    # NaN != NaN, so this replaces any NaN with 0.
    f1 = torch.where(f1 != f1, torch.zeros_like(f1), f1)
    return f1.mean()

In [None]:
%%time
learn_class = text_classifier_learner(data_class, drop_mult = 0.5, 
                                      arch = AWD_LSTM, model_dir=model_path, 
                                     metrics = [accuracy, f1_score])

learn_class.load_encoder('ft_enc')

In [None]:
# Drop the reference to the language-model learner and force a garbage
# collection pass, presumably to free memory before training the classifier.
learn = None
gc.collect()

Need to troubleshoot `lr_find`: why isn't the graph what we'd expect?

In [None]:
learn_class.lr_find()
# Plot loss against learning rate, as done for the language model.
# recorder.plot_lr() only shows the learning-rate schedule over iterations,
# which is not the curve needed to pick a learning rate.
learn_class.recorder.plot()

In [None]:
# First classifier training stage: 3 one-cycle epochs at learning rate 1e-3.
learn_class.fit_one_cycle(3, 1e-3)

In [None]:
# Second stage: freeze_to(-2) (presumably leaves only the last two layer groups
# trainable), then 4 more one-cycle epochs with learning rates given by
# slice(1e-3, 1e-1).
# NOTE(review): the upper bound 1e-1 is high for fine-tuning; confirm against the lr_find plot.
learn_class.freeze_to(-2)
learn_class.fit_one_cycle(4, slice(1e-3, 1e-1))

## Prediction threshold
* Currently the f1 score just takes the greater of the two predictions, but since there is some bias in the prediction it's useful to search potential thresholds to determine optimal threshold for F1 scoring
* Code taken from [fastai test](https://www.kaggle.com/mnpinto/quora-fastai-v1-0-baseline) by mnpinto

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
def threshold_search(y_true, y_proba, plot=False):
    """Find the probability threshold that maximises F1.

    Args:
        y_true: Array of ground-truth binary labels.
        y_proba: Array of predicted scores for the positive class.
        plot: When True, plot F1 against threshold and mark the best point.

    Returns:
        dict with keys ``'threshold'`` (best threshold) and ``'f1'`` (the F1
        score reached at that threshold).
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no threshold; pad so all three arrays line up index-for-index.
    thresholds = np.append(thresholds, 1.001)
    # That trailing point always has recall == 0, so 1/recall is a division by
    # zero. The result (inf -> F = 0) is what we want; only silence the warning.
    with np.errstate(divide='ignore'):
        F = 2 / (1/precision + 1/recall)
    best_idx = np.argmax(F)
    best_score = F[best_idx]
    best_th = thresholds[best_idx]
    if plot:
        plt.plot(thresholds, F, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    search_result = {'threshold': best_th, 'f1': best_score}
    return search_result

In [None]:
# Force a garbage-collection pass before running predictions.
gc.collect()

In [None]:
# Predictions on the validation set; preds[0] holds the per-class outputs and
# preds[1] the corresponding targets (as used below).
preds = learn_class.get_preds(DatasetType.Valid)
# Column 1 = score for class 1, converted to a NumPy array.
proba = to_np(preds[0][:,1])
ytrue = to_np(preds[1])

In [None]:
# Search for the F1-optimal threshold on the validation predictions (with plot),
# then display the resulting dict.
thr = threshold_search(ytrue, proba, plot=True)
thr

By updating the threshold for a true flag to 0.19 we can achieve an f1 score of >0.5

In [None]:
# ordered=True asks the text learner to return predictions in dataset order;
# without it they come back in the sampler's order and would not line up with
# test['qid'].
# NOTE(review): the `ordered` keyword depends on the installed fastai version -- confirm it is accepted.
probs, _ = learn_class.get_preds(DatasetType.Test, ordered=True)
# Apply the F1-optimal threshold found on the validation set instead of argmax
# (argmax is an implicit 0.5 cut and ignores the threshold search above).
preds = (to_np(probs[:, 1]) >= thr['threshold']).astype(int)

# One row per test question: its qid and the 0/1 prediction.
submission = pd.DataFrame(test['qid'])
submission['prediction'] = preds
submission.to_csv('submission.csv',index=False)
submission.head()