---
# Jigsaw Competition

---
---

In [None]:
# Notebook-wide run mode; the cells below branch on 'TRAIN' and 'INFER'.
TASK = 'TRAIN'

# Connectivity reminder for the selected mode (nothing is printed for any other value).
_reminder = {
    'TRAIN': '\n\n\t\tMAKE SURE INTERNET IS ON FOR TRAINING!\n\n',
    'INFER': '\n\n\t\tMAKE SURE INTERNET IS OFF!\n\n',
}.get(TASK)
if _reminder is not None:
    print(_reminder)

In [None]:
if TASK=='INFER':
    import sys
    path_to_flair = '../input/flair/offline_files/'
    sys.path.append(path_to_flair)
    
if TASK=='TRAIN':
    !pip install flair
    
import flair
from flair.data import Sentence
from flair.data import Corpus
from flair.embeddings import DocumentRNNEmbeddings, DocumentPoolEmbeddings
from flair.embeddings import FlairEmbeddings, WordEmbeddings
from flair.embeddings import BytePairEmbeddings
from flair.models.text_regression_model import TextRegressor
from flair.trainers import ModelTrainer
    
import gc
import time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(font_scale=1.4)
sns.set_style('darkgrid')
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-dark-palette')

from tqdm import tqdm
from joblib import parallel_backend, Parallel, delayed

import torch

from sklearn import metrics

# Saving
import pickle
import joblib

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Let pandas render up to 200 columns and up to 200 characters per cell.
for _option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_option, 200)

# THE DATA

In [None]:
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# Label columns that feed the severity target.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Show three random comments flagged with each label.
for col in label_cols:
    display(df.loc[df[col] == 1, ['comment_text', col]].sample(3))

# Sum the labels per comment with 'severe_toxic' counted twice, then scale the
# sum by its maximum so the target tops out at 1.
weighted_flags = df[label_cols].assign(severe_toxic=df['severe_toxic'] * 2)
severity = weighted_flags.sum(axis=1).astype(int)
df = pd.DataFrame({'text': df['comment_text'], 'y': severity / severity.max()})
display(df.sample(7, random_state=1), df.shape)

In [None]:
# Severity-rating competition files: validation data, the sample submission
# and the comments that have to be scored.
val_data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
ss = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
testset = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# Five random rows plus the shape of each frame.
for frame in (val_data, testset, ss):
    display(frame.sample(5), frame.shape)

In [None]:
# Distribution of the training target. `sns.distplot` is deprecated in seaborn
# (>= 0.11); `histplot` with a KDE overlay is its documented replacement.
ax = sns.histplot(df['y'], kde=True, stat='density')
ax.set(title='Distribution of the training target', xlabel='y');

# MODELLING

In [None]:
if TASK=='TRAIN':

    # Prepping the Dataset
    df_fst = df.copy()
    # The corpus files hold one document per line ('__label__<y>' then the text).
    # Collapse every run of whitespace (newlines, tabs, ...) into a single space so
    # a comment cannot spill over several lines of the file and lose its tail.
    df_fst['text'] = df_fst['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
    # Label column in the '__label__<value>' form.
    df_fst['labels'] = '__label__' + df_fst['y'].astype(str)
    df_fst = df_fst[['labels','text']]
    display(df_fst.head(3))

In [None]:
if TASK=='TRAIN':

    # Splitting the Dataset: first 90% train, next 5% test, last 5% dev.
    # Plain positional slices replace np.split, which on a DataFrame goes through
    # the deprecated DataFrame.swapaxes.
    n_rows = len(df_fst)
    train_end, test_end = int(.9*n_rows), int(.95*n_rows)
    train_fst = df_fst.iloc[:train_end]
    test_fst = df_fst.iloc[train_end:test_end]
    dev_fst = df_fst.iloc[test_end:]

    # The three parts must cover every row exactly once.
    assert len(train_fst) + len(test_fst) + len(dev_fst) == n_rows

    print('Original: ', df_fst.shape)
    print('Train: ', train_fst.shape)
    print('Test: ', test_fst.shape)
    print('Dev: ', dev_fst.shape)

In [None]:
if TASK=='TRAIN':
    
    # Create a folder
    !mkdir -p data_faster
    
    # Save to the folder
    train_fst.to_csv("data_faster/train.csv",sep='\t',index=False,header=False)  
    test_fst.to_csv("data_faster/test.csv",sep='\t',index=False,header=False)
    dev_fst.to_csv("data_faster/dev.csv",sep='\t',index=False,header=False)

In [None]:
if TASK=='TRAIN':
    
    # Create Corpus from Folder
    # The folder is the one the train/test/dev files were written to. Which file
    # becomes which split is left to flair; presumably matched on the file
    # names, TODO confirm against the installed flair version.
    data_folder_fast = "data_faster"
    corpus_fst: Corpus = flair.datasets.ClassificationCorpus(data_folder_fast)

In [None]:
if TASK=='TRAIN':
    
    # Instantiate  Embeddings
    # A document-level RNN embedding built on top of two stacked word embeddings.
    # NOTE(review): the trailing arguments are passed positionally; presumably they
    # map to hidden_size=128, rnn_layers=1, reproject_words=False,
    # reproject_words_dimension=64, bidirectional=False, dropout=False in flair's
    # DocumentRNNEmbeddings signature. Confirm against the installed version.
    document_embeddings = DocumentRNNEmbeddings(    
                                                    [
                                                        # standard FastText word embeddings for English
                                                        WordEmbeddings('en'),
                                                        # Byte pair embeddings for English
                                                        BytePairEmbeddings('en'),
                                                    ], 
                                                    128, 1, False, 64, False, False
                                               )

## Start Training

In [None]:
# Run a garbage-collection pass before the training cell; the returned count is shown as the cell output.
gc.collect()

In [None]:
%%time
if TASK=='TRAIN':

    # Regressor on top of the document embeddings, trained on the corpus
    model = TextRegressor(document_embeddings)
    trainer = ModelTrainer(model, corpus_fst)

    # Settings handed to the trainer unchanged
    train_settings = dict(
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=4,
        train_with_dev=True,
        monitor_test=True,
        max_epochs=120,
        embeddings_storage_mode='cpu',
    )

    # First argument: the path the model and training log will be saved in
    trainer.train('data_faster', **train_settings)

In [None]:
# Run a garbage-collection pass after the training cell; the returned count is shown as the cell output.
gc.collect()

## Validate

In [None]:
# Location of the saved model per run mode: the local training output folder
# for 'TRAIN', a path under the input directory for 'INFER'.
_model_path = {
    'TRAIN': './data_faster/final-model.pt',
    'INFER': '../input/jigsaw-flair-notebook/data_faster/final-model.pt',
}.get(TASK)

if _model_path is not None:
    # Load the Model
    flair_regressor = TextRegressor.load(_model_path)

In [None]:
%%time
def flair_prediction(texts, regressor, mini_batch_size=32):
    """Score raw texts with a trained flair regressor.

    Args:
        texts: iterable of strings, one document each.
        regressor: model whose ``predict(sentences, mini_batch_size=...)``
            attaches its prediction to every sentence as the first label.
        mini_batch_size: number of sentences passed through the model at once.

    Returns:
        A list of floats, one per input text, in input order.
    """
    sentences = [Sentence(text) for text in texts]   # Create Sentence Object
    regressor.predict(sentences, mini_batch_size=mini_batch_size)  # Predictions are tagged to the sentence object
    # Read the number from the label's value rather than parsing str(label),
    # whose layout is not the same in every flair release.
    return [float(sent.labels[0].value)
            for sent in tqdm(sentences)]

In [None]:
%%time
# Score both comments of every validation pair.
val_less_toxic = np.asarray(flair_prediction(val_data['less_toxic'].tolist(), flair_regressor))
val_more_toxic = np.asarray(flair_prediction(val_data['more_toxic'].tolist(), flair_regressor))

# Validation Accuracy: share of pairs where the 'more_toxic' comment gets the strictly higher score
np.mean(val_less_toxic < val_more_toxic)

# Predict on Testset

In [None]:
# Predict a score for every comment in the test set, in file order.
preds = flair_prediction(texts=testset['text'].to_list(), regressor=flair_regressor)

In [None]:
# `preds` is a plain list in file order, so it is assigned to the rows by position.
# `testset` therefore has to keep its original row order.
testset['score'] = preds
# Replace the raw predictions by their rank (ties broken by row position).
testset['score'] = testset['score'].rank(method='first')
# Sort only the submission frame. Re-assigning a sorted frame to `testset` would
# make a re-run of this cell attach `preds` to the wrong comments.
sub = testset[['comment_id', 'score']].sort_values('score')
display(sub)

In [None]:
# Show the test frame as it stands after the scoring cell.
testset

In [None]:
# Write the submission frame to disk without the index column.
sub.to_csv('submission.csv', index=False)