In [103]:
import pandas as pd
from fastai.text import * 
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc

# OBTAINING DATA

In [25]:
# Download the competition archives with the Kaggle CLI.
# NOTE(review): requires the `kaggle` command to be installed and authenticated; that setup is not shown in this notebook.
# As the recorded output shows, archives that already exist locally are skipped unless --force is passed.
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification

sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [26]:
# Unzipping data
!unzip test.csv.zip -d data/
!unzip train.csv.zip -d data/
!unzip sample_submission.csv.zip -d data/

Archive:  test.csv.zip
  inflating: data/test.csv           
Archive:  train.csv.zip
  inflating: data/train.csv          
Archive:  sample_submission.csv.zip
  inflating: data/sample_submission.csv  


In [2]:
# Directory that holds the unzipped CSV files.
# It is also passed as the `path` argument to the fastai data and learner calls further down,
# so saved databunches and exported models are read from and written to this location.
DATA_PATH = './data'

In [3]:
# Load train.csv and test.csv from DATA_PATH into two DataFrames (train first, then test)
df_train, df_test = (
    pd.read_csv(f'{DATA_PATH}/{split}.csv') for split in ('train', 'test')
)

In [8]:
# Work on a small sample for fast iteration: the first 100 training rows and the first 10 test rows
df_train, df_test = df_train.iloc[:100], df_test.iloc[:10]

In [9]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index (columns are not re-sorted).
# This combined frame feeds the language-model databunch, so the language model sees every comment.
df_all = pd.concat([df_train, df_test], ignore_index=True, sort=False)

# LANGUAGE MODEL

In [11]:
# Build the language-model databunch from every comment (train + test) and persist it.
# Step 1: wrap the comment_text column in a TextList rooted at DATA_PATH.
lm_texts = TextList.from_df(df=df_all, path=DATA_PATH, cols=['comment_text'])
# Step 2: hold out a random 10% for validation.
lm_split = lm_texts.split_by_rand_pct(0.1)
# Step 3: label for language modelling and assemble the databunch.
databunch_lm = lm_split.label_for_lm().databunch()

# Save the processed databunch so that it can be reloaded without re-tokenising.
databunch_lm.save('databunch_lm.pkl')

In [12]:
# Reload the language-model databunch saved as 'databunch_lm.pkl' under DATA_PATH.
# This rebinds `databunch_lm`, so later cells use the reloaded copy.
databunch_lm = load_data(DATA_PATH, file='databunch_lm.pkl')

In [13]:
# Create the Transformer-XL language-model learner.
# The fastai factory is imported under an alias because this cell binds its
# result to the name `language_model_learner`, which shadows the fastai function
# of the same name. Calling the bare name on a second run of this cell would
# try to call the learner object instead of the factory and fail.
from fastai.text.learner import language_model_learner as build_language_model_learner

language_model_learner = build_language_model_learner(databunch_lm, TransformerXL, drop_mult=0.05)

  warn("There are no pretrained weights for that architecture yet!")


In [None]:
# Find a good learning rate for the language model.
# All layers are unfrozen first so the learning-rate sweep covers the whole model,
# then the loss-vs-learning-rate curve is plotted with a suggested value marked.
language_model_learner.unfreeze()
language_model_learner.lr_find()
language_model_learner.recorder.plot(suggestion=True)

In [19]:
# Train the whole language model (all layers unfrozen) with the one-cycle policy:
# 5 epochs, peak learning rate 1e-2.
language_model_learner.unfreeze()
language_model_learner.fit_one_cycle(cyc_len=5, max_lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,7.48782,5.899519,0.318973,00:37
1,6.807964,5.459869,0.057589,00:37
2,6.298919,4.480237,0.318973,00:38
3,5.931885,4.477242,0.318973,00:38
4,5.718742,4.489619,0.318973,00:38


In [20]:
# Save only the encoder part of the language model under the name 'encoder_transformer_xl';
# the classifier section loads it again by that same name.
language_model_learner.save_encoder('encoder_transformer_xl')

# CLASSIFIER

In [22]:
# Build the classifier databunch, reusing the language-model vocabulary so that the
# token ids match the saved encoder, then persist it.
databunch_classifier = (
        TextList.from_df(df=df_train, path=DATA_PATH, cols=['comment_text'], vocab=databunch_lm.train_ds.vocab)
            # Fixed seed: the same rows land in the validation set on every run,
            # which keeps the validation metrics comparable between runs.
            .split_by_rand_pct(0.1, seed=42)
            # 'target' is the toxicity score to predict.
            .label_from_df(cols='target')
            # The text column must be named explicitly: without `cols`, from_df falls
            # back to the first column of df_test instead of the comment text.
            .add_test(TextList.from_df(df_test, path=DATA_PATH, cols=['comment_text']))
            .databunch())

databunch_classifier.save('databunch_classifier.pkl')


In [24]:
# Reload the classifier databunch saved as 'databunch_classifier.pkl' under DATA_PATH,
# this time with a batch size of 16. Rebinds `databunch_classifier` to the reloaded copy.
databunch_classifier = load_data(DATA_PATH, file='databunch_classifier.pkl', bs=16)

In [25]:
# Display a few processed examples (tokenised text alongside its target value) as a sanity check.
# Tokens outside the vocabulary appear as xxunk in the recorded output.
databunch_classifier.show_batch()

text,target
"xxbos xxmaj tuesday xxmaj jan. xxunk , xxunk of \n this xxunk ' xxunk ' xxunk to \n xxmaj john xxmaj day , xxmaj or to try to xxunk the xxunk \n there to come to xxmaj malheur and support them . xxmaj he xxunk to go \n without the xxunk of the local \n sheriff in xxmaj burns , xxmaj xxunk xxmaj ward .",0.5
"xxbos xxmaj tuesday xxmaj jan. xxunk , xxunk of \n this xxunk ' xxunk ' xxunk to \n xxmaj john xxmaj day , xxmaj or to try to xxunk the xxunk \n there to come to xxmaj malheur and support them . xxmaj he xxunk to go \n without the xxunk of the local \n sheriff in xxmaj burns , xxmaj xxunk xxmaj ward .",0.4
"xxbos xxmaj xxunk xxmaj david , we did xxunk all of these xxunk when xxunk our system ! xxmaj in xxunk , we assume people will try to xxunk it ; that 's why we have a lot going on xxunk the xxunk xxunk . xxmaj the xxunk - xxunk xxunk of the xxunk is very xxunk , but there are a lot of xxunk and xxunk on the backend",0.0
"xxbos xxmaj one would hope that the xxunk of xxunk this system is to xxunk more debate and discussion , not less . xxmaj it seems there are xxunk things that xxunk the xxunk of discussion : xxmaj xxunk comments that are xxunk xxunk or xxunk xxunk xxunk ; xxunk xxunk ( which has been a real problem on xxup ww 's comment section , in my xxunk ) ;",0.0
"xxbos xxmaj xxunk xxmaj bundy seems like a nice , xxunk xxunk being who has been xxunk a xxunk of xxunk by his xxunk xxunk xxunk , who he xxunk . xxmaj xxunk has a bunch of other xxunk xxunk xxunk following xxunk as well . xxmaj where ? xxmaj no one xxunk xxunk . xxmaj and , xxunk you xxunk in xxmaj burns no one xxunk . xxmaj xxunk",0.0


In [39]:
# Create the Transformer-XL text classifier learner on the classifier databunch,
# then load the encoder weights saved from the language model ('encoder_transformer_xl').
# The warning in the recorded output is fastai reporting that it has no pretrained weights for this architecture.
classifier_learner = text_classifier_learner(databunch_classifier, TransformerXL, drop_mult=0.05)
classifier_learner.load_encoder('encoder_transformer_xl')

  warn("There are no pretrained weights for that architecture yet!")


In [None]:
# Run the learning-rate finder on the classifier in its current freeze state
# and plot the loss curve with a suggested learning rate marked.
classifier_learner.lr_find()
classifier_learner.recorder.plot(suggestion=True)

In [40]:
# First training stage: one epoch with the one-cycle policy at a peak learning rate of 1e-3.
# NOTE(review): intended to train only the last layer group; this cell does not call freeze() itself,
# so it relies on the learner already being frozen at this point — confirm.
classifier_learner.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,time
0,0.073298,0.021866,00:15


In [None]:
# Find the best learning rate with all layers unfrozen:
# unfreeze the whole model, run the learning-rate finder, and plot the curve with a suggested value.
classifier_learner.unfreeze()
classifier_learner.lr_find()
classifier_learner.recorder.plot(suggestion=True)

In [41]:
# Second training stage: fine-tune every layer for 5 epochs with the one-cycle policy.
# The slice spreads the peak learning rate across layer groups, from 1e-6 up to 1e-4.
classifier_learner.unfreeze()
classifier_learner.fit_one_cycle(cyc_len=5, max_lr=slice(1e-6, 1e-4))

epoch,train_loss,valid_loss,time
0,0.058781,0.021065,00:33
1,0.057206,0.020406,00:35
2,0.059186,0.019833,00:33
3,0.060962,0.019289,00:33
4,0.058467,0.018798,00:38


In [42]:
# Save the trained classifier under the name 'classifier_model'
classifier_learner.save('classifier_model')

In [29]:
# Export the learner to 'classifier_transformer_xl.pkl' so it can be reloaded for inference with load_learner
classifier_learner.export('classifier_transformer_xl.pkl')

In [30]:
# Reload the exported learner from DATA_PATH; this rebinds `classifier_learner` to the reloaded object.
# NOTE(review): a learner restored by load_learner may not carry the original train/validation/test data.
# If so, the later get_preds cells on DatasetType.Valid / DatasetType.Test need the in-memory trained
# learner (or a `test=` argument here) when the notebook is run top to bottom — confirm.
classifier_learner = load_learner(DATA_PATH,'classifier_transformer_xl.pkl')

In [31]:
# Smoke test: score a single, clearly toxic comment with the classifier.
# The recorded output below shows a score of about 0.011 for this input.
classifier_learner.predict("This kid son of a bitch! you are a motherfucker!!! I will kill you!!!")

(FloatItem [0.011184], tensor([0.0112]), tensor([0.0112]))

# TESTING

In [89]:
# Predict on the held-out validation set ("out of fold").
# The result is indexed below as [0] = predictions and [1] = target labels.
oof_preds = classifier_learner.get_preds(ds_type=DatasetType.Valid)

In [70]:
# Move the validation predictions and their labels to the CPU and turn both into numpy arrays
oof_prediction, oof_labels = (
    part.cpu().data.numpy() for part in oof_preds[:2]
)

In [109]:
# Binarise at 0.5: a value counts as toxic when it is greater than or equal to 0.5.
# The same cut-off is applied to the true target scores and to the predicted scores.
oof_labels_booleans = oof_labels>=0.5
oof_prediction_booleans = oof_prediction>=0.5

In [110]:
# Accuracy of the thresholded predictions against the thresholded labels on the validation set.
# NOTE(review): the recorded 1.0 comes from a very small validation sample, so it says little about model quality.
accuracy_score(oof_labels_booleans, oof_prediction_booleans)

1.0

In [None]:
# ROC-AUC on the validation set.
# AUC is a ranking metric, so it is computed from the raw, non-thresholded scores.
# Passing the 0/1 decisions instead would reduce the curve to a single operating point.
# ravel() flattens the score array to 1-D so that it lines up with the label vector.
# Note: roc_auc_score raises ValueError when the validation labels contain only one class,
# which can happen with a very small sample.
roc_auc_score(oof_labels_booleans, oof_prediction.ravel())

# GENERATE SUBMISSION

In [None]:
# Calculate predictions for the test set.
# ordered=True requests the predictions in the original order of the test items, which the
# positional assignment into the submission frame depends on.
test_preds = classifier_learner.get_preds(ds_type=DatasetType.Test, ordered=True)

In [90]:
# Take the predictions (element 0 of the get_preds result), move them to the CPU and convert to a numpy array
test_prediction = test_preds[0].cpu().data.numpy()

In [98]:
# Read the sample submission, indexed by comment id.
submission = pd.read_csv(DATA_PATH + '/sample_submission.csv', index_col='id')
# Keep exactly as many rows as there are test predictions, instead of a hard-coded 10,
# so this cell stays correct when the test sample size changes or sampling is removed.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as test.csv — confirm.
submission = submission.head(len(test_prediction))

In [99]:
# Write the test predictions into the 'prediction' column, row by row in positional order
submission = submission.assign(prediction=test_prediction[:])

In [101]:
# Write the submission to 'submission.csv' in the current working directory (the id index is
# written as the first column), then display the first rows as a visual check.
submission.to_csv('submission.csv')
submission.head()

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
7000000,0.042214
7000001,0.042214
7000002,0.042214
7000003,0.042214
7000004,0.042214
