In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
import re 
from scipy import sparse
import time
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.linear_model import Ridge


In [None]:
import os
import random
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [None]:
# Validation data: load the annotated comment pairs and collect every distinct comment text.

df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)

# One single-column ('text') frame per side of the pair, each on a fresh 0..n-1 index.
df_more, df_less = (
    df_val[side].rename('text').to_frame().reset_index(drop=True)
    for side in ('more_toxic', 'less_toxic')
)

# Stack both sides (more_toxic first) and keep only the first occurrence of each text.
# The index is not reset afterwards, so surviving rows keep their per-side positions.
df_val_unique = (
    pd.concat([df_more, df_less])
    .drop_duplicates(subset='text', keep='first')
)

print(df_val_unique.shape)
df_val_unique.head()

In [None]:
# Preview the first five rows of the raw validation pairs.
df_val.head(n=5)

In [None]:
# Load the pre-processed jigsaw-1 comments; the file name suggests rows overlapping
# the jigsaw-4 data were already removed upstream (not verifiable from this notebook).
df = pd.read_csv(
    filepath_or_buffer='../input/jigsaw4-previous-data-preprocessing/jig1_no_jig4_dup_df.csv'
)

print(df.shape)
df.head()

In [None]:
# Severity score `y`: the sum of the six label columns, with `severe_toxic` counted twice.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'identity_hate', 'insult', 'threat']

# The column is still doubled in place (so anything reading `df['severe_toxic']` later sees
# the same weighted values as before), but only while it still looks unweighted. Without
# this guard, re-running the cell doubles it again and inflates `y` beyond the 0..7 range
# that the following cell enumerates, silently dropping those rows.
# NOTE(review): the guard assumes the raw labels are 0/1 flags - confirm against the CSV.
if df['severe_toxic'].max() <= 1:
    df['severe_toxic'] = df['severe_toxic'] * 2

df['y'] = df[label_cols].sum(axis=1).astype(int)

df['y'].value_counts()

In [None]:
# dfs[k] holds the `text` Series of every comment whose severity score `y` equals k (k = 0..7).
dfs = [df.loc[df['y'] == level, 'text'] for level in range(0, 8)]

In [None]:
# Build synthetic (more_toxic, less_toxic) pairs from the severity buckets in `dfs`.
#
# For every "more" level, the whole bucket is repeated once per listed "less" level and
# matched with an equally sized sample (without replacement, seed 42) drawn from that
# lower bucket. The table reproduces the hand-written pairing exactly, including the
# fact that level 3 is paired only with levels 1 and 0 (level 2 is not used there).
pairing_plan = {
    7: [6, 5, 4, 3, 2, 1, 0],
    6: [5, 4, 3, 2, 1, 0],
    5: [4, 3, 2, 1, 0],
    4: [3, 2, 1, 0],
    3: [1, 0],
    2: [1, 0],
    1: [0],
}

more_parts = []
less_parts = []
for more_level, less_levels in pairing_plan.items():
    more_bucket = dfs[more_level]
    for less_level in less_levels:
        more_parts.append(more_bucket)
        less_parts.append(dfs[less_level].sample(len(more_bucket), random_state=42))

# Both sides are re-indexed 0..n-1 so that rows line up positionally.
pair_df = pd.DataFrame({
    'more_toxic': pd.concat(more_parts).reset_index(drop=True),
    'less_toxic': pd.concat(less_parts).reset_index(drop=True),
})

# Constant placeholder worker id for the generated pairs.
pair_df['worker'] = 9999

print(pair_df.shape)
pair_df.head()

In [None]:
# Persist the generated pairs (without the row index) to the working directory.
pair_df.to_csv(path_or_buf='jigsaw4_additional_pairs_from_jigsaw1.csv', index=False)

In [None]:
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# Re-read the validation pairs as an independent frame that will carry the fold labels.
train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# 5 shuffled folds, stratified on the `worker` column.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
fold_splits = Fold.split(train, train['worker'])

# Tag each row with the id of the fold in which it is held out.
for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    train.loc[holdout_rows, 'kfold'] = fold_id

# The column starts out as float (it is filled piecewise), so cast it once complete.
train['kfold'] = train['kfold'].astype(int)

In [None]:
# Work on a copy so the fold labels do not end up on `pair_df` itself.
external = pair_df.copy()

# Plain shuffled 5-fold split; KFold does not use the second argument for stratification.
Fold = KFold(n_splits=5, shuffle=True, random_state=2021)
external_splits = Fold.split(external, external['worker'])

# Tag each generated pair with the id of the fold in which it is held out.
for fold_id, (_, holdout_rows) in enumerate(external_splits):
    external.loc[holdout_rows, 'kfold'] = fold_id

# The column starts out as float (it is filled piecewise), so cast it once complete.
external['kfold'] = external['kfold'].astype(int)

In [None]:
# For every fold: split `train` into training/validation parts, then extend the training
# part with the external pairs of the other folds, minus any external pair that shares a
# comment with the validation part (same-column match, as originally intended).

for fold in range(5):
    print(f' *** {fold} ***')
    trn_df = train[train.kfold != fold].reset_index(drop=True)
    val_df = train[train.kfold == fold].reset_index(drop=True)

    # join external pair
    trn_ex = external[external.kfold != fold].reset_index(drop=True)

    # Keeps the row id as an explicit `index` column (it also ends up in the merged frame).
    trn_ex = trn_ex.reset_index()
    print(len(trn_df), len(val_df), len(trn_ex))

    # Flag the external rows themselves. The previous version took `.index` of the merge
    # result, which is a fresh 0..k-1 range unrelated to `trn_ex['index']`, so it dropped
    # the first k external rows and left the real duplicates in the training data.
    # NOTE(review): only same-column overlap is checked (less vs less, more vs more); a
    # validation `more_toxic` text appearing as an external `less_toxic` is not removed.
    leaks_less = trn_ex['less_toxic'].isin(val_df['less_toxic'])
    leaks_more = trn_ex['more_toxic'].isin(val_df['more_toxic'])
    duplicated_index = list(trn_ex.loc[leaks_less | leaks_more, 'index'])
    print(f'num dup : ', len(duplicated_index))

    trn_ex = trn_ex[~trn_ex['index'].isin(duplicated_index)]

    trn_df = pd.concat([trn_df, trn_ex]).reset_index(drop=True)
    print(f'merged : ', len(trn_df))