This is a simple/naive pipeline showing how to use fastai and ULMFiT to tackle this problem. For background, see the fastai text tutorial: https://docs.fast.ai/tutorial.text.html

Inference notebook can be found here: https://www.kaggle.com/alibaba19/jigsaw-ulmfit-fastai-inference/notebook

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import fastai
import pathlib 
from pathlib import Path 
import random 
import numpy as np

In [None]:
from fastai.text.all import *

In [None]:
# Both competition datasets live side by side under the relative input directory.
_input_root = Path('../input')
old_jigsaw_path = _input_root / 'jigsaw-toxic-comment-classification-challenge'
input_path = _input_root / 'jigsaw-toxic-severity-rating'

In [None]:
# Load the severity-rating validation data and peek at the first three rows.
validation_csv = '/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv'
df_2 = pd.read_csv(validation_csv)
df_2.head(n=3)

In [None]:
!unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip


In [None]:
# Read the training CSV from the working directory and report (rows, columns).
train_csv = '../working/train.csv'
df = pd.read_csv(train_csv)
df.shape

In [None]:
# Show the last three rows of the training frame.
df.tail(n=3)

In [None]:
# Show the first four rows whose `toxic` flag is set.
df.loc[df['toxic'] == 1].head(n=4)

In [None]:
# Sum the six label columns that follow `comment_text` into one severity score.
# The slice is bounded (2:8) rather than open-ended so that re-running this cell
# does not fold an already-present `total_tox` column back into the sum.
label_cols = df.columns[2:8]
df['total_tox'] = df[label_cols].sum(axis=1)

In [None]:
# The cell above sums the 6 label columns that come after comment_text into a new total_tox column.

In [None]:
# Count how many comments fall on each total_tox value.
df.total_tox.value_counts()

`value_counts()` then reports how many comments have each total (with 0 being not toxic at all and 6 being extremely toxic).

So most of them don't have any toxicity, which we can score as zero. Only 31 are maximally toxic on our arbitrary scale. Probably need to be more mindful about the data collection and parsing, but we'll use this for now to proceed

In [None]:
# Language-model data: the text block is built from `comment_text`, inputs are
# read from the `text` column, and 10% of the rows are held out at random.
lm_block = DataBlock(
    blocks=TextBlock.from_df('comment_text', is_lm=True),
    get_x=ColReader('text'),
    splitter=RandomSplitter(0.1),
)
db_lm = lm_block.dataloaders(df, bs=64, seq_len=80)

In [None]:
# Preview a single sample from the language-model DataLoaders.
db_lm.show_batch(max_n=1)

In [None]:
# Wrap the language-model vocabulary in a Series and write it to lm_vocab.csv
# (presumably for reuse by the inference notebook -- confirm).
vocab = pd.Series(db_lm.vocab)
vocab.to_csv('lm_vocab.csv')

In [None]:
# Display the vocabulary Series.
vocab


In [None]:
# Build an AWD_LSTM language-model learner on db_lm (dropout multiplier 0.5),
# reporting accuracy and perplexity; to_fp16() switches it to mixed precision.
learn = language_model_learner(
            db_lm, AWD_LSTM, drop_mult=0.5,
            metrics=[accuracy, Perplexity()]).to_fp16()

In [None]:
# First training pass of the language model: one one-cycle epoch at 2e-2,
# run before unfreeze() is called.
learn.fit_one_cycle(1,2e-2)

In [None]:
# Checkpoint the language-model learner under the name '1epoch'.
learn.save('1epoch')

In [None]:
# Restore the '1epoch' checkpoint into the language-model learner.
learn = learn.load('1epoch')

In [None]:
# Unfreeze the language model and train for ten more one-cycle epochs at 2e-2.
learn.unfreeze()
learn.fit_one_cycle(10,2e-2)

In [None]:
# Save only the encoder as 'lm_encoder'; the regression learner loads it
# later through load_encoder('lm_encoder').
learn.save_encoder('lm_encoder')

In [None]:
# Regression data: text inputs share the language model's vocabulary, and the
# target is the summed `total_tox` score.
toxic_text_block = TextBlock.from_df('comment_text', vocab=db_lm.vocab)
toxic_db = DataBlock(
    blocks=(toxic_text_block, RegressionBlock),
    get_x=ColReader('text'),
    get_y=ColReader('total_tox'),
    splitter=RandomSplitter(),
)

In [None]:
# Build the regression DataLoaders from the training frame and preview one
# sample (a single preview is enough; a repeated call only adds duplicate output).
dls = toxic_db.dataloaders(df, bs=64)
dls.show_batch(max_n=1)

In [None]:
# AWD_LSTM learner on the regression DataLoaders (dropout multiplier 0.5),
# evaluated with RMSE.
learner = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=rmse)

In [None]:
# Load the encoder weights that the language-model learner saved as 'lm_encoder'.
learner = learner.load_encoder('lm_encoder')

In [None]:
# Run the learning-rate finder on the regression learner.
learner.lr_find()

In [None]:
# Train the regression learner for one one-cycle epoch at 2e-2.
# This must be `learner` (the regressor built with text_classifier_learner);
# `learn` is the language model, whose encoder has already been exported.
learner.fit_one_cycle(1, 2e-2)

In [None]:
# Gradual unfreezing on the regression learner: unfreeze the last two layer
# groups and train one epoch with discriminative learning rates.
# `learner` is the regressor; `learn` is the language model and must not be
# trained here.
learner.freeze_to(-2)
learner.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [None]:
# One more one-cycle epoch at 2e-2 on the regression learner (`learner`),
# not on the language-model learner `learn`.
learner.fit_one_cycle(1, 2e-2)

In [None]:
# Unfreeze the last three layer groups of the regression learner and train one
# epoch with a lower discriminative learning-rate range.
# `learner` is the regressor; `learn` is the language model.
learner.freeze_to(-3)
learner.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [None]:
# Unfreeze the whole regression learner and train two final epochs at the
# lowest learning-rate range. `learner` is the regressor; `learn` is the
# language model.
learner.unfreeze()
learner.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

In [None]:
# Save the regression learner's encoder. save_encoder is an instance method:
# calling it on the TextLearner class would bind the filename string as `self`
# and raise instead of saving anything.
# The filename spelling is kept as-is because a later cell loads it by this name.
learner.save_encoder('tcl_enocder_1')

In [None]:
# Reload the severity-rating validation data.
# NOTE: this rebinds `df`, which referred to the training frame up to this point.
df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')

# Placeholder targets: a random integer in [0, 3] for the "less" rating and in
# [4, 6] for the "more" rating. The array length comes from the frame being
# labelled (not from a frame loaded in an earlier cell), and a seeded generator
# keeps the labels identical across kernel restarts.
n_rows = len(df)
label_rng = np.random.default_rng(42)
df['toxic_rating_less'] = label_rng.integers(0, 4, n_rows)
df['toxic_rating_more'] = label_rng.integers(4, 7, n_rows)

In [None]:
# Inspect the first rows of the validation frame, including the two rating columns.
df.head()

In [None]:
# Regression data over the `less_toxic` comments, reusing the language model's
# vocabulary; the target is `toxic_rating_less`.
less_text_block = TextBlock.from_df('less_toxic', vocab=db_lm.vocab)
less_valid_toxic_db = DataBlock(
    blocks=(less_text_block, RegressionBlock),
    get_x=ColReader('text'),
    get_y=ColReader('toxic_rating_less'),
    splitter=RandomSplitter(),
)


In [None]:
# Build DataLoaders for the `less_toxic` comments (batch size 64) and preview a batch.
less_valid_dls = less_valid_toxic_db.dataloaders(df, bs=64)
less_valid_dls.show_batch()

In [None]:
# Fresh regression learner on the `less_toxic` data, initialised from the
# encoder saved as 'tcl_enocder_1'.
# load_encoder must be called on the learner instance: calling it on the
# TextLearner class binds the filename as `self`, raises, and would also
# discard the learner created on the line above.
learner = text_classifier_learner(less_valid_dls, AWD_LSTM, drop_mult=0.5, metrics=rmse)
learner = learner.load_encoder("tcl_enocder_1")

In [None]:
# Fine-tune the current learner: 2 epochs at a base learning rate of 4e-3.
learner.fine_tune(2,4e-3)

In [None]:
# Save the current learner's encoder. save_encoder is an instance method, so it
# is called on `learner`; calling it on the TextLearner class raises instead of
# saving. The filename spelling matches the later load_encoder call.
learner.save_encoder('tcl_enocder_2')

In [None]:
# Regression data over the `more_toxic` comments, reusing the language model's
# vocabulary; the target is `toxic_rating_more`.
more_text_block = TextBlock.from_df('more_toxic', vocab=db_lm.vocab)
more_valid_toxic_db = DataBlock(
    blocks=(more_text_block, RegressionBlock),
    get_x=ColReader('text'),
    get_y=ColReader('toxic_rating_more'),
    splitter=RandomSplitter(),
)

# Materialise the DataLoaders (batch size 64) and preview a batch.
more_valid_dls = more_valid_toxic_db.dataloaders(df, bs=64)
more_valid_dls.show_batch()


In [None]:
# Fresh regression learner on the `more_toxic` data, initialised from the
# encoder saved as 'tcl_enocder_2'.
# load_encoder is an instance method: it is called on the learner just built,
# not on the TextLearner class (which raises and would discard that learner).
learner = text_classifier_learner(more_valid_dls, AWD_LSTM, drop_mult=0.5, metrics=rmse)
learner = learner.load_encoder("tcl_enocder_2")

In [None]:
# Fine-tune the current learner: 4 epochs at a base learning rate of 4e-3.
learner.fine_tune(4,4e-3)

In [None]:
# Display sample predictions from the current learner.
learner.show_results()

In [None]:
# Save the final learner under the name 'initial_model'.
learner.save('initial_model')

In [None]:
# Display the learner's underlying model.
learner.model