

This notebook can be run without a GPU or TPU; a CPU is enough. (I have already used up my entire GPU quota, but there is a way out.)

### Ensemble [TFIDF+BERT]
Pay attention to the input libraries (databases):
- ../input/jigsaw-toxic-severity-rating  - current competition data
- ../input/jigsaw-toxic-comment-classification-challenge - 2017 competition data "The problem of classification of toxic comments"
- ../input/roberta-base - model data roberta base
- ../input/ruddit-jigsaw-dataset - Norms of Offensiveness for English Reddit Comments is a dataset of English language Reddit comments
- ../input/0-824-jigsaw-inference - output of the corresponding notebook

### Very important!!
This notebook uses data from my other two notebooks, I will leave links. You can, on the basis of their variants and already modified data, get your own results, perhaps even better than mine.

- ../input/fork-of-pytorch-w-b-my-jigsaw-starter - output of the corresponding notebook https://www.kaggle.com/andrej0marinchenko/my-jigsaw-starter-for-beginners

# Imports modules

In [None]:
import pandas as pd  # data analysis library
import numpy as np  # library linear algebra, Fourier transform and random numbers

# sklearn - a set of Python modules for machine learning and data mining
from sklearn.ensemble import RandomForestRegressor  # using the Random Forest Regressor
from sklearn.feature_extraction.text import TfidfVectorizer  # for convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.linear_model import Ridge, LinearRegression  # Ridge - Linear least squares with l2 regularization, Linear Regression - ordinary least squares
from sklearn.pipeline import Pipeline, FeatureUnion  # module implements utilities to build a composite estimator, as a chain of transforms and estimators
from sklearn.base import TransformerMixin, BaseEstimator # TransformerMixin - Mixin class for all transformers in scikit-learn.


import re  # module for working with regular expressions
import scipy  # library is built to work with NumPy arrays, and provides efficient numerical routines such as routines for numerical integration and optimization
from scipy import sparse  # SciPy 2-D sparse matrix package for numeric data
import gc # Garbage Collector - module provides the ability to disable the collector, tune the collection frequency, and set debugging options
from IPython.display import display, HTML  # Jupyter kernel to work with Python code in Jupyter notebooks and other interactive frontends
from pprint import pprint  # module provides a capability to “pretty-print” arbitrary Python data structures in a form which can be used as input to the interpreter
import warnings  # Warning messages are typically issued in situations where it is useful to alert the user of some condition in a program

warnings.filterwarnings("ignore")  # This is the base class of all warning category classes. It is a subclass of Exception. 
# The warnings filter controls whether warnings are ignored, displayed, or turned into errors (raising an exception) 
# "ignore" - never print matching warnings

pd.options.display.max_colwidth=300  # The maximum width in characters of a column in the repr of a pandas data structure. 
# When the column overflows, a “…” placeholder is embedded in the output

# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
# Exploratory cell: it only shows what the training data looks like.
# Source: the 2017 "Toxic Comment Classification Challenge" training set.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)  # (rows, columns) of the loaded table

# The six binary toxicity labels used throughout this notebook.
toxicity_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for col in toxicity_labels:
    print(f'****** {col} *******')  # header naming the label being shown
    # Ten random comments for which the current label equals 1.
    flagged = df.loc[df[col] == 1, ['comment_text', col]]
    display(flagged.sample(10))

In [None]:

# Build one regression target 'y' from the six binary labels.
# 'severe_toxic' is counted twice so that it weighs more than the other labels.
df['severe_toxic'] *= 2

label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
weighted_sum = df[label_columns].sum(axis=1).astype(int)
# With the doubled column, a comment carrying every label sums to
# 1 + 2 + 1 + 1 + 1 + 1 = 7.

# Divide by the largest observed sum so the target lies in [0, 1]:
# 0 means no label is set, 1 is the highest sum present in the data.
df['y'] = weighted_sum / weighted_sum.max()

# Keep only the text and the target; 'comment_text' becomes 'text'.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)  # show 5 random rows

In [None]:
df['y'].value_counts()  # number of comments at each distinct value of the normalised target 'y'

## Create 7 versions of the data

In [None]:
# Write n_folds random sub-samples of the (uncleaned) frame, one csv file each.
# These are independent draws with different seeds, not a disjoint partition,
# so the same comment may appear in several files.
n_folds = 7  # number of sub-samples ("folds")

frac_1 = 0.7  # share of the toxic rows (y > 0) drawn into every fold
frac_1_factor = 1.5  # non-toxic rows drawn = 1.5 x the number of toxic rows drawn

for fld in range(n_folds):
    print(f'Fold: {fld}')
    seed = 10 * (fld + 1)  # one seed per fold, reused for every draw below

    # Number of non-toxic rows (y == 0) to add to this fold.
    n_clean = int(len(df[df.y > 0]) * frac_1 * frac_1_factor)
    tmp_df = pd.concat(
        [
            df[df.y > 0].sample(frac=frac_1, random_state=seed),
            df[df.y == 0].sample(n=n_clean, random_state=seed),
        ],
        axis=0,
    )
    # Shuffle so toxic and non-toxic rows are interleaved.
    tmp_df = tmp_df.sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/df_fld{fld}.csv', index=False)
    print(tmp_df.shape)  # size of this fold
    print(tmp_df['y'].value_counts())  # distribution of the target inside this fold

# Create 7 versions of __clean__ data

In [None]:

def clean(data, col):
    """Normalise punctuation and repeated characters in a text column.

    The column ``data[col]`` is overwritten in place and the same frame is
    returned, so both ``clean(df, 'text')`` and ``df = clean(df, 'text')`` work.

    Every pattern is passed as a compiled regular expression. This matters
    because recent pandas versions treat a plain string pattern in
    ``Series.str.replace`` as a literal unless ``regex=True`` is given, which
    would silently turn all of the substitutions below into no-ops.

    Args:
        data: DataFrame that contains the column to clean.
        col: name of the string column.

    Returns:
        ``data`` itself, with ``data[col]`` replaced by the cleaned text.
    """
    rules = (
        # Put spaces around a / ! ? . that glues two words together.
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Runs of 4 or more identical * ! ? ' are cut down to 3.
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Surround every run of * ! ? ' with spaces.
        (r'([*!?\']+)', r' \1 '),
        # A letter repeated 3+ times at the end of a word is reduced to 2.
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # A letter repeated 4+ times inside a word is reduced to 3.
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse runs of spaces (newlines are left alone).
        (r'[ ]{2,}', ' '),
    )

    # Literal replacement: isolate newlines with spaces.
    text = data[col].str.replace('\n', ' \n ', regex=False)
    for pattern, replacement in rules:
        text = text.str.replace(re.compile(pattern), replacement, regex=True)

    data[col] = text.str.strip()
    return data

In [None]:
# Quick visual check of clean() on a few hand-written strings.
test_clean_df = pd.DataFrame(
    {
        "text": [
            "heyy\n\nkkdsfj",
            "hi   how/are/you ???",
            "hey?????",
            "noooo!!!!!!!!!   comeone !! ",
            "cooooooooool     brooooooooooo  coool brooo",
            "naaaahhhhhhh",
        ]
    }
)
display(test_clean_df)  # the raw strings, before cleaning
clean(test_clean_df,'text')  # the returned frame is shown as the cell output

In [None]:
df = clean(df,'text')  # clean the 'text' column of the whole training frame

In [None]:
# Same sampling scheme as for the raw text, applied to the cleaned frame:
# n_folds random sub-samples, each written to its own csv file.
# The seeds match the raw-text folds; the draws are independent per fold,
# so the files may overlap.
n_folds = 7  # number of sub-samples ("folds")

frac_1 = 0.7  # share of the toxic rows (y > 0) drawn into every fold
frac_1_factor = 1.5  # non-toxic rows drawn = 1.5 x the number of toxic rows drawn

for fld in range(n_folds):
    print(f'Fold: {fld}')
    seed = 10 * (fld + 1)  # one seed per fold, reused for every draw below

    # Number of non-toxic rows (y == 0) to add to this fold.
    n_clean = int(len(df[df.y > 0]) * frac_1 * frac_1_factor)
    tmp_df = pd.concat(
        [
            df[df.y > 0].sample(frac=frac_1, random_state=seed),
            df[df.y == 0].sample(n=n_clean, random_state=seed),
        ],
        axis=0,
    )
    # Shuffle so toxic and non-toxic rows are interleaved.
    tmp_df = tmp_df.sample(frac=1, random_state=seed)

    tmp_df.to_csv(f'/kaggle/working/df_clean_fld{fld}.csv', index=False)
    print(tmp_df.shape)  # size of this fold
    print(tmp_df['y'].value_counts())  # distribution of the target inside this fold

In [None]:
del df,tmp_df  # drop the references to the training frames; the folds are already on disk
gc.collect()  # run a full garbage collection right away

## Ruddit data

Ruddit: Norms of Offensiveness for English Reddit Comments is a dataset of English language Reddit comments that has fine-grained, real-valued scores between -1 (maximally supportive) and 1 (maximally offensive). Data sampling procedure, annotation, and analysis have been discussed in detail in the accompanying paper. Authors have provided the comment IDs, post IDs and not the bodies, in accordance to the GDPR regulations. They have suggested that the comments and post bodies can be extracted from any Reddit API using the IDs provided.

The original paper can be found here: Ruddit: Norms of Offensiveness for English Reddit Comments

The source github repo can be found here: https://github.com/hadarishav/Ruddit

In [None]:
# Load the Ruddit comments together with their offensiveness scores.
df_ = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")

print(df_.shape)  # size of the raw table

# Keep the two columns we need under the names used by the other datasets.
column_map = {'txt': 'text', 'offensiveness_score': 'y'}
df_ = df_[list(column_map)].rename(columns=column_map)

# Min-max scale the score so that it runs from 0 to 1 like the other target.
lowest, highest = df_.y.min(), df_.y.max()
df_['y'] = (df_['y'] - lowest) / (highest - lowest)
df_.y.hist()  # histogram of the rescaled scores

# Create 7 versions of data

In [None]:
# Write n_folds random 70% sub-samples of the Ruddit frame, one csv file each.
# Each fold is drawn independently, so the files overlap.
n_folds = 7  # number of sub-samples ("folds")

frac_1 = 0.7  # share of all rows drawn into every fold

for fld in range(n_folds):
    print(f'Fold: {fld}')
    seed = 10 * (fld + 1)  # a different seed for every fold
    tmp_df = df_.sample(frac=frac_1, random_state=seed)
    tmp_df.to_csv(f'/kaggle/working/df2_fld{fld}.csv', index=False)
    print(tmp_df.shape)  # size of this fold
    print(tmp_df['y'].value_counts())  # distribution of the target inside this fold


In [None]:
del tmp_df, df_;  # drop the references to the Ruddit frames; the folds are already on disk
gc.collect()  # run a full garbage collection right away

## Load Validation and Test data  
now we read the data file of the current competition and transfer it to the dataframe

In [None]:

# Validation pairs of the current competition.
# Later cells read the 'less_toxic' and 'more_toxic' text columns of this frame.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Shape first, then the first five rows.
print(df_val.shape, df_val.head(), sep='\n')

In [None]:
# Comments that have to be scored for the submission.
# Later cells read the 'text' and 'comment_id' columns of this frame.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Shape first, then the first five rows.
print(df_sub.shape, df_sub.head(), sep='\n')


# Create Sklearn Pipeline with 
## TFIDF - Take 'char_wb' as analyzer to capture subwords well
## Ridge - Ridge is a simple regression algorithm that will reduce overfitting 

### model Ridge - Linear least squares with l2 regularization.

Minimizes the objective function:

||y - Xw||^2_2 + alpha * ||w||^2_2

This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm. Also known as Ridge Regression or Tikhonov regularization. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape (n_samples, n_targets)).

### Linear Regression - ordinary least squares .

Linear Regression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

In [None]:
# NOT USED 
# class LengthTransformer(BaseEstimator, TransformerMixin):

#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return sparse.csr_matrix([[(len(x)-360)/550] for x in X])
#     def get_feature_names(self):
#         return ["lngth"]

class LengthUpperTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one feature: the share of upper-case
    characters in each text."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a sparse (n_samples, 1) matrix of upper-case ratios.

        Args:
            X: iterable of strings.

        Returns:
            scipy CSR matrix with one row per text. An empty string yields
            0.0 instead of a division by zero.
        """
        ratios = [
            sum(ch.isupper() for ch in text) / len(text) if len(text) else 0.0
            for text in X
        ]
        # reshape keeps the (n, 1) layout even when X is empty
        return sparse.csr_matrix(np.asarray(ratios, dtype=float).reshape(-1, 1))

    def get_feature_names(self):
        """Feature name, in the form older scikit-learn versions ask for."""
        return ["lngth_uppercase"]

    def get_feature_names_out(self, input_features=None):
        """Feature name, in the form newer scikit-learn versions ask for."""
        return np.asarray(self.get_feature_names(), dtype=object)

### Does % of uppercase characters have effect on toxicity


In [None]:

# Upper-case ratio of both comments in every validation pair:
# 'upper_1' belongs to the less toxic text, 'upper_2' to the more toxic one.
for target_col, text_col in (('upper_1', 'less_toxic'), ('upper_2', 'more_toxic')):
    ratios = LengthUpperTransformer().transform(df_val[text_col]).todense()
    df_val[target_col] = np.array(ratios).reshape(-1,1)

# Mean and standard deviation of the ratio for each side of the pair.
for target_col in ('upper_1', 'upper_2'):
    print(df_val[target_col].mean(), df_val[target_col].std())

# Both histograms are drawn in the same cell for a visual comparison.
df_val['upper_1'].hist(bins=100)
df_val['upper_2'].hist(bins=100)

## Train pipeline

- Load folds data
- train pipeline
- Predict on validation data
- Predict on test data

pipeline module implements utilities to build a composite estimator, as a chain of transforms and estimators.

### Toxic data

In [None]:
# Train one TF-IDF + Ridge pipeline per raw-text fold.
# Every fold model fills one column of the arrays below:
#   val_preds_arr1 - scores for df_val['less_toxic']
#   val_preds_arr2 - scores for df_val['more_toxic']
#   test_preds_arr - scores for df_sub['text']
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    df = pd.read_csv(f'/kaggle/working/df_fld{fld}.csv')
    print(df.shape)

    features = FeatureUnion([
        #('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),
        #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),

    ])
    pipeline = Pipeline(
        [
            ("features", features),
            #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
            ("clf", Ridge()),
            #("clf",LinearRegression())
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # Newer scikit-learn only offers get_feature_names_out(); older releases only
    # get_feature_names(). Use whichever exists, and look the names up once.
    feature_step = pipeline['features']
    if hasattr(feature_step, 'get_feature_names_out'):
        feature_names = feature_step.get_feature_names_out()
    else:
        feature_names = feature_step.get_feature_names()
    # Plain Python values keep the printed list readable.
    feature_names = np.asarray(feature_names).tolist()
    coefficients = np.round(pipeline['clf'].coef_, 2).tolist()

    print('\nTotal number of features:', len(feature_names) )

    # (n-gram, weight) pairs, largest Ridge coefficient first.
    feature_wts = sorted(zip(feature_names, coefficients),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict validation data ")
    val_preds_arr1[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2[:,fld] = pipeline.predict(df_val['more_toxic'])

    print("\npredict test data ")
    test_preds_arr[:,fld] = pipeline.predict(df_sub['text'])

# Toxic __clean__ data

In [None]:
# Train one TF-IDF + Ridge pipeline per cleaned-text fold.
# Every fold model fills one column of the arrays below:
#   val_preds_arr1c - scores for df_val['less_toxic']
#   val_preds_arr2c - scores for df_val['more_toxic']
#   test_preds_arrc - scores for df_sub['text']
val_preds_arr1c = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2c = np.zeros((df_val.shape[0], n_folds))
test_preds_arrc = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    df = pd.read_csv(f'/kaggle/working/df_clean_fld{fld}.csv')
    print(df.shape)

    features = FeatureUnion([
        #('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),
        #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),

    ])
    pipeline = Pipeline(
        [
            ("features", features),
            #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
            ("clf", Ridge()),
            #("clf",LinearRegression())
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # Newer scikit-learn only offers get_feature_names_out(); older releases only
    # get_feature_names(). Use whichever exists, and look the names up once.
    feature_step = pipeline['features']
    if hasattr(feature_step, 'get_feature_names_out'):
        feature_names = feature_step.get_feature_names_out()
    else:
        feature_names = feature_step.get_feature_names()
    # Plain Python values keep the printed list readable.
    feature_names = np.asarray(feature_names).tolist()
    coefficients = np.round(pipeline['clf'].coef_, 2).tolist()

    print('\nTotal number of features:', len(feature_names) )

    # (n-gram, weight) pairs, largest Ridge coefficient first.
    feature_wts = sorted(zip(feature_names, coefficients),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict validation data ")
    val_preds_arr1c[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2c[:,fld] = pipeline.predict(df_val['more_toxic'])

    print("\npredict test data ")
    test_preds_arrc[:,fld] = pipeline.predict(df_sub['text'])

## Ruddit data pipeline

In [None]:
# Train one TF-IDF + Ridge pipeline per Ruddit fold.
# Every fold model fills one column of the arrays below:
#   val_preds_arr1_ - scores for df_val['less_toxic']
#   val_preds_arr2_ - scores for df_val['more_toxic']
#   test_preds_arr_ - scores for df_sub['text']
val_preds_arr1_ = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2_ = np.zeros((df_val.shape[0], n_folds))
test_preds_arr_ = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    df = pd.read_csv(f'/kaggle/working/df2_fld{fld}.csv')
    print(df.shape)

    features = FeatureUnion([
        #('vect1', LengthTransformer()),
        #('vect2', LengthUpperTransformer()),
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),
        #("vect4", TfidfVectorizer(min_df= 5, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{8,}\b')),

    ])
    pipeline = Pipeline(
        [
            ("features", features),
            #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
            ("clf", Ridge()),
            #("clf",LinearRegression())
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # Newer scikit-learn only offers get_feature_names_out(); older releases only
    # get_feature_names(). Use whichever exists, and look the names up once.
    feature_step = pipeline['features']
    if hasattr(feature_step, 'get_feature_names_out'):
        feature_names = feature_step.get_feature_names_out()
    else:
        feature_names = feature_step.get_feature_names()
    # Plain Python values keep the printed list readable.
    feature_names = np.asarray(feature_names).tolist()
    coefficients = np.round(pipeline['clf'].coef_, 2).tolist()

    print('\nTotal number of features:', len(feature_names) )

    # (n-gram, weight) pairs, largest Ridge coefficient first.
    feature_wts = sorted(zip(feature_names, coefficients),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict validation data ")
    val_preds_arr1_[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2_[:,fld] = pipeline.predict(df_val['more_toxic'])

    print("\npredict test data ")
    test_preds_arr_[:,fld] = pipeline.predict(df_sub['text'])

In [None]:
# The per-fold predictions are already stored in the *_preds_arr* arrays,
# so the last fold's frame, fitted pipeline and feature list can be released.
del df, pipeline, feature_wts
gc.collect()

# Validate the pipeline 

In [None]:
# Average the fold predictions per model family. Odd-numbered vectors score the
# 'less_toxic' side of each validation pair, even-numbered ones the 'more_toxic' side.
p1, p2 = val_preds_arr1.mean(axis=1), val_preds_arr2.mean(axis=1)    # raw toxic data
p3, p4 = val_preds_arr1_.mean(axis=1), val_preds_arr2_.mean(axis=1)  # Ruddit data
p5, p6 = val_preds_arr1c.mean(axis=1), val_preds_arr2c.mean(axis=1)  # cleaned toxic data

# "Accuracy" = share of pairs (in percent) in which the less toxic comment
# received the strictly lower score.
print(" Toxic data ")
print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')


In [None]:
print("Find right weight")

# Grid search over blend weights that sum to 1:
#   w1 (raw toxic model)     from 0.30 to 0.69 in steps of 0.01
#   w3 (cleaned toxic model) from 0.00 to 0.19 in steps of 0.01 (it equals j/100 up to rounding)
#   w2 (Ruddit model)        takes whatever remains
# Each entry of wts_acc is (w1, w2, w3, pair accuracy in percent).
wts_acc = []
for i in range(30, 70):
    w1 = i/100
    for j in range(20):
        w2 = (100 - i - j)/100
        w3 = (1 - w1 - w2 )
        p1_wt = w1*p1 + w2*p3 + w3*p5  # blended score of the less toxic comments
        p2_wt = w1*p2 + w2*p4 + w3*p6  # blended score of the more toxic comments
        pair_accuracy = np.round((p1_wt < p2_wt).mean() * 100,2)
        wts_acc.append((w1, w2, w3, pair_accuracy))

# Five best weight combinations, highest accuracy first.
sorted(wts_acc, key=lambda x:x[3], reverse=True)[:5]

In [None]:
# Take the weight triple with the highest validation accuracy.
# Entries of wts_acc are (w1, w2, w3, accuracy), so the ranking key must be
# index 3; ranking on index 2 would simply pick the largest w3.
# On ties max() returns the first entry, the same one a stable descending sort puts first.
w1, w2, w3, _ = max(wts_acc, key=lambda x: x[3])
#print(best_wts)

# Blended validation scores under the selected weights.
p1_wt = w1*p1 + w2*p3 + w3*p5
p2_wt = w1*p2 + w2*p4 + w3*p6


## Analyze bad predictions 
### Incorrect predictions with similar scores
### Incorrect predictions with different scores

In [None]:
# Attach the blended scores to the validation pairs for error analysis.
df_val['p1'], df_val['p2'] = p1_wt, p2_wt  # less toxic / more toxic score
df_val['diff'] = np.abs(p2_wt - p1_wt)  # absolute gap between the two scores

# 1 when the less toxic comment got the strictly lower score, otherwise 0.
ranked_correctly = p1_wt < p2_wt
df_val['correct'] = ranked_correctly.astype(int)


In [None]:

### Incorrect predictions with similar scores
# The 20 wrongly ordered pairs whose two scores are closest to each other.

df_val[df_val.correct == 0].sort_values('diff', ascending=True).head(20)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dis-similar scores
# The 20 wrongly ordered pairs whose two scores are furthest apart.


df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)

# Predict on test data 

In [None]:
# Predict using pipeline
# Average each model family over its folds, then blend the three averages with
# the weights w1 (raw toxic), w2 (Ruddit) and w3 (cleaned toxic) selected on the validation pairs.

df_sub['score'] = w1*test_preds_arr.mean(axis=1) + w2*test_preds_arr_.mean(axis=1) + w3*test_preds_arrc.mean(axis=1)

In [None]:
#test_preds_arr

## Correct the rank ordering

In [None]:
# Cases with duplicates scores
# Number of non-missing scores minus the number of distinct scores.

df_sub['score'].count() - df_sub['score'].nunique()

In [None]:
# First 10 rows of the score frequency table: one row per distinct score value with its count.
# NOTE(review): the name of the column holding the score values after reset_index()
# depends on the pandas version - confirm before selecting it by name.
same_score = df_sub['score'].value_counts().reset_index()[:10]
same_score

In [None]:
# Show the comments whose score is one of the values listed in same_score.
# The score values sit in the first column of same_score; that column is
# called 'index' in older pandas and 'score' in newer pandas, so it is
# selected by position rather than by name.
df_sub[df_sub['score'].isin(same_score.iloc[:, 0].tolist())]

In [None]:
# Same comments have same score - which is ok 

In [None]:
# # Rank the predictions 

# df_sub['score']  = scipy.stats.rankdata(df_sub['score'], method='ordinal')

# print(df_sub['score'].rank().nunique())

# Bert Ensemble

In [None]:
%%time
# connect libraries only for this task
import os  # operating system library
import gc  # Garbage Collector - module provides the ability to disable the collector, tune the collection frequency, and set debugging options
import cv2  # open source computer vision and machine learning library
import copy  # The assignment operation does not copy the object, it only creates a reference to the object. 
# For mutable collections, or for collections containing mutable items, a copy is often needed so that it can be modified without changing the original. 
# This module provides general (shallow and deep) copy operations. 

import time  # time library
import random  # library for working with random values

# For data manipulation
import pandas as pd  # data analysis library
import numpy as np  # library linear algebra, Fourier transform and random numbers

# Pytorch Imports
import torch  #  a Tensor library like NumPy, with strong GPU support
import torch.nn as nn  # a neural networks library deeply integrated with autograd designed for maximum flexibility
from torch.utils.data import Dataset, DataLoader  # DataLoader and other utility functions for convenience


# For Transformer Models
from transformers import AutoTokenizer, AutoModel  # In many cases, the architecture you want to use can be guessed from the name or the path of the 
# pretrained model you are supplying to the from_pretrained() method. AutoClasses are here to do this job for you so that you automatically retrieve the 
# relevant model given the name/path to the pretrained weights/config/vocabulary.
#Instantiating one of AutoConfig, AutoModel, and AutoTokenizer will directly create a class of the relevant architecture.

# Utils
from tqdm import tqdm  # tqdm derives from the Arabic word taqaddum  which can mean "progress," and is an abbreviation for "I love you so much" in Spanish 
# (te quiero demasiado).  this library show a smart progress meter - just wrap any iterable with tqdm(iterable)

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Settings shared by the transformer inference code in this cell.
CONFIG = {
    'seed': 42,                              # passed to set_seed()
    'model_name': '../input/roberta-base',   # local directory of the pretrained backbone
    'test_batch_size': 64,                   # batch size of the test DataLoader
    'max_length': 128,                       # texts are truncated / padded to this many tokens
    'num_classes': 1,                        # width of the model's final linear layer
    # first CUDA device when available, otherwise the CPU
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])



# Checkpoints of the ten fold models (Loss-Fold-0.bin ... Loss-Fold-9.bin)
# stored in the attached notebook-output dataset.
MODEL_PATHS = [
    f'../input/fork-of-pytorch-w-b-my-jigsaw-starter/Loss-Fold-{fold}.bin'
    for fold in range(10)
]



def set_seed(seed = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), switches
    cuDNN to deterministic mode with benchmarking off, and stores the seed
    in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: integer seed shared by all generators.
    """
    # Recorded as a string because environment variables only hold text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN needs both switches for repeatable results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
    
class JigsawDataset(Dataset):
    """Dataset that tokenizes the 'text' column of a frame one row at a time.

    Args:
        df: frame with a 'text' column.
        tokenizer: object providing ``encode_plus``.
        max_length: token length every text is truncated or padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Texts are kept as an array so that rows are addressed by position.
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'ids', 'mask'}`` as long tensors for the text at ``index``."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        # output key -> key in the tokenizer result
        wanted = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[tok_key], dtype=torch.long)
            for out_key, tok_key in wanted
        }

    
class JigsawModel(nn.Module):
    """Pretrained transformer backbone followed by dropout and a linear head.

    Args:
        model_name: name or path handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # 768 input features; presumably the hidden size of the backbone - TODO confirm
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for a batch of token ids and attention masks."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head
        # (presumably the pooled sequence representation - TODO confirm).
        pooled = backbone_out[1]
        return self.fc(self.drop(pooled))
    
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return all predictions.

    Gradients are disabled by the decorator and the model is put in eval mode.

    Args:
        model: callable as ``model(ids, mask)``.
        dataloader: yields dicts with 'ids' and 'mask' tensors.
        device: device the batches are moved to before the forward pass.

    Returns:
        1-D NumPy array with the flattened outputs of all batches, in
        dataloader order.
    """
    model.eval()

    batch_preds = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        # flatten, then move to host memory as a NumPy array
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    gc.collect()
    return np.concatenate(batch_preds)


def inference(model_paths, dataloader, device):
    """Average the predictions of several saved fold models.

    Args:
        model_paths: checkpoint files, one state dict per fold model.
        dataloader: batches to score; passed to ``valid_fn`` unchanged.
        device: device the models and batches are placed on.

    Returns:
        NumPy array with the element-wise mean of the per-model predictions.
    """
    fold_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        # Read the checkpoint into CPU memory first, so that weights saved on
        # a GPU also load on a CPU-only machine, then move the model to the
        # device the caller asked for.
        model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
        model.to(device)

        print(f"Getting predictions for model {i+1}")
        fold_preds.append(valid_fn(model, dataloader, device))

        # Release this fold's model before the next one is created.
        del model
        gc.collect()

    return np.mean(np.array(fold_preds), axis=0)


set_seed(CONFIG['seed'])

# Comments to score, read again into a separate frame for the transformer models.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])

# shuffle=False keeps the predictions in the row order of the csv file.
loader_options = dict(
    batch_size=CONFIG['test_batch_size'],
    num_workers=2,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(test_dataset, **loader_options)

# Mean prediction over all checkpoints in MODEL_PATHS.
preds1 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# Min-max scale the transformer predictions so that they run from 0 to 1.
preds = (preds1-preds1.min())/(preds1.max()-preds1.min())

In [None]:
# Final blend: 85% TF-IDF/Ridge score, 15% scaled transformer score.
df_sub['score'] = df_sub['score']*0.85+preds*0.15  # coefficients were chosen empirically by the author (tried 0.82->0.85->0.90 and 0.17->0.15->0.10)

In [None]:
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)  # write the competition submission: one row per comment id with its final score