# Import libraries

In [None]:
import numpy as np
import pandas as pd

from fastai.text import *

import os
# Print the full path of every file found under the Kaggle input directory,
# one per line, in os.walk order.
for path in (os.path.join(root, name)
             for root, _subdirs, names in os.walk('/kaggle/input')
             for name in names):
    print(path)

# Import data

In [None]:
# Read the three CSV inputs into DataFrames: the training comments from the
# competition directory, and the validation and test sets from the separate
# "translated" dataset directory.
# NOTE(review): "miltilingual" in the two file names looks like a typo but
# presumably matches the files as published — confirm before changing it.
train_pure_data = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
validation_pure_data = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_valid_translated.csv")
test_pure_data = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv")

Let's take a look at the imported data

In [None]:
# Preview the first two rows of the raw training frame.
train_pure_data.head(n=2)

In [None]:
# Preview the first two rows of the raw validation frame.
validation_pure_data.head(n=2)

In [None]:
# Preview the first two rows of the raw test frame.
test_pure_data.head(n=2)

Replace the test text with its English translation, taken from the pre-translated dataset

In [None]:
# Overwrite 'content' with the values of the 'translated' column, then remove
# the now-redundant 'translated' column together with 'id'.
# Note: this mutates the frame in place, so the cell cannot be re-run without
# reloading the CSV ('translated' no longer exists afterwards).
test_pure_data['content'] = test_pure_data['translated']
test_pure_data.drop(columns=['translated', 'id'], inplace=True)

In [None]:
# Check the test frame after the text replacement.
test_pure_data.head(n=2)

Replace the validation text with its English translation, taken from the pre-translated dataset

In [None]:
# Overwrite 'comment_text' with the values of the 'translated' column, then
# remove the now-redundant 'translated' column together with 'id'.
# Note: this mutates the frame in place, so the cell cannot be re-run without
# reloading the CSV ('translated' no longer exists afterwards).
validation_pure_data['comment_text'] = validation_pure_data['translated']
validation_pure_data.drop(columns=['translated', 'id'], inplace=True)

In [None]:
# Check the validation frame after the text replacement.
validation_pure_data.head(n=2)

Drop 'lang' from validation and test data

In [None]:
# Remove the 'lang' column from both frames (in place).
validation_pure_data.drop(columns=['lang'], inplace=True)
test_pure_data.drop(columns=['lang'], inplace=True)

In [None]:
# Check the validation frame after dropping 'lang'.
validation_pure_data.head(n=2)

In [None]:
# Check the test frame after dropping 'lang'.
test_pure_data.head(n=2)

Rebuild the training data so that it contains a single label: `toxic` is set if any of the six original labels is set

In [None]:
# Column-wise totals for each of the six label columns.
train_pure_data[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum(axis='index')

In [None]:
# Collapse the six labels into one: 'toxic' becomes True for a row when the
# sum of its six label values is greater than zero.
train_pure_data['toxic'] = train_pure_data[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum(axis='columns').gt(0)

In [None]:
# The five fine-grained label columns and 'id' are no longer needed; remove
# them in place.
train_pure_data.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'id'],
    inplace=True,
)

In [None]:
# Cast 'toxic' to integer. The earlier label-collapsing cell assigns the
# result of a `> 0` comparison, so the column is boolean before this cast.
train_pure_data['toxic'] = train_pure_data['toxic'].astype(int)

In [None]:
# Check the training frame after reducing it to a single label column.
train_pure_data.head(n=2)

In [None]:
# Split the training rows by class: toxic == 1 and toxic == 0.
# NOTE(review): in the visible notebook these two frames are only referenced
# by a commented-out balanced-sampling line — confirm they are still needed.
train_positive_samples = train_pure_data.loc[train_pure_data['toxic'].eq(1)]
train_negative_samples = train_pure_data.loc[train_pure_data['toxic'].eq(0)]

In [None]:
# Disabled alternative: all positive rows plus a 24000-row sample of the
# negative rows.
#final_train = pd.concat([train_positive_samples, train_negative_samples.sample(24000, random_state=3543)])
# Active path: keep every training row, shuffled (frac=1) with a fixed seed
# so the row order is reproducible.
final_train = train_pure_data.sample(frac=1, random_state=3543)

In [None]:
# Preview the first five rows of the shuffled training frame.
final_train.head(n=5)

# Modelling

Load the training data into a fastai DataBunch

In [None]:
# Build the labelled text DataBunch used to train the classifier.
# NOTE: despite the `data_lm` name, this databunch carries 'toxic' labels (it
# feeds a classifier, not a language model); the name is kept because later
# cells refer to it.
# The text column is not named here, so TextList.from_df falls back to its
# default column selection — presumably the first column of `final_train`;
# confirm if the frame's column order ever changes.
data_lm = (TextList.from_df(final_train)
                   # Seed the random train/validation split so the held-out
                   # set, and every metric computed on it, is the same on each
                   # run. Same seed as the shuffle of the training frame.
                   .split_by_rand_pct(seed=3543)
                   .label_from_df(cols='toxic')
                   .databunch())

# Persist the processed databunch using fastai's default file name/location.
data_lm.save()

Take a look at one batch

In [None]:
# Display a sample batch from the databunch as a visual sanity check.
data_lm.show_batch()

Create learner

In [None]:
# Create a text classifier learner on the databunch with the AWD_LSTM
# architecture, then unfreeze it so that training is not restricted to the
# final layers.
learn = text_classifier_learner(data_lm, AWD_LSTM)
learn.unfreeze()

In [None]:
# Train for 10 epochs with the one-cycle policy, passing a learning-rate
# slice from 1e-7 to 1e-1.
# NOTE(review): an upper learning rate of 1e-1 on a fully unfrozen model looks
# aggressive — consider checking it with the LR finder.
learn.fit_one_cycle(10, slice(1e-7, 1e-1))
# Disabled: saving the trained weights under the name 'mini_train_clas'.
#learn.save('mini_train_clas')

In [None]:
# Get predictions, targets and per-item losses, then plot a confusion matrix.
# get_preds is called without ds_type, so fastai's default dataset is used
# (presumably the validation split — confirm).
preds,y,losses = learn.get_preds(with_loss=True)
interp = ClassificationInterpretation(learn, preds, y, losses)
interp.plot_confusion_matrix()

In [None]:
# Attach the prepared test frame as the databunch's test set and predict on it.
# NOTE(review): a whole DataFrame is passed as the items; this presumably
# relies on the frame having been reduced to its single text column by the
# earlier drop cells — confirm.
# `preds` and `y` overwrite the values returned by the confusion-matrix cell.
learn.data.add_test(test_pure_data)
preds,y = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
# Load the competition's sample submission file to use as the output template.
submission = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv")

In [None]:
# Preview the first five rows of the submission template.
submission.head(n=5)

In [None]:
# Use the second prediction column (index 1) as the submitted 'toxic' score.
# Slicing the column once and converting with tolist() yields the same Python
# floats as calling .item() on each row, without a Python-level loop over
# every prediction.
# Rows are matched to the template by position, so this assumes the test
# frame and the sample submission are in the same order — TODO confirm.
submission['toxic'] = preds[:, 1].tolist()

In [None]:
# Preview the first five rows of the filled-in submission.
submission.head(n=5)

In [None]:
# Write the submission to 'submission.csv' in the current working directory,
# omitting the DataFrame index.
submission.to_csv('submission.csv', index=False)