In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
import re 
from scipy import sparse
import time
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Widen pandas' display limits so long comments and wide frames stay readable.
pd.set_option("display.max_colwidth", 300)
pd.set_option("display.max_columns", 100)

from sklearn.linear_model import Ridge


In [None]:
import os
import random
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [None]:
# Patterns are compiled once here rather than on every call to text_cleaning.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes HTML tags (keeps only the visible text)
    2. Removes embedded URL links
    3. Removes emojis
    4. Replaces special characters like &, #, etc. with a space
    5. Collapses runs of spaces and trims both ends

    text - Text piece to be cleaned. None or a float NaN (what pandas yields
           for an empty cell) is treated as missing.

    Returns the cleaned string; '' when the input is missing.
    '''
    # Missing values would otherwise raise TypeError inside the regex/parser.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Strip markup BEFORE removing URLs: applied to raw HTML, the URL pattern's
    # `\S+` runs through the closing quote of an href and into the anchor text
    # and end tag, corrupting the markup and discarding visible text.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _URL_PATTERN.sub(r'', text)          # remove website links
    text = _EMOJI_PATTERN.sub(r'', text)        # remove emojis
    text = _NON_ALNUM_PATTERN.sub(" ", text)    # remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # remove extra spaces

    # Remove spaces at the beginning and at the end of the string.
    return text.strip()

In [None]:
# jigsaw1 (Toxic Comment Classification Challenge)

In [None]:
# Directory holding the "Toxic Comment Classification Challenge" files.
toxic_comment_path = '../input/jigsaw-toxic-comment-classification-challenge/'

# Test comments.
df_test = pd.read_csv(toxic_comment_path + 'test.csv.zip')

# Test labels; every -1 in the file is rewritten to 0.
# NOTE(review): presumably -1 marks rows without a usable label - confirm
# against the dataset description before relying on these zeros.
df_test_label = pd.read_csv(toxic_comment_path + 'test_labels.csv.zip')
df_test_label = df_test_label.replace(-1, 0)

# Attach the labels to the test comments by `id`, keeping every test row.
df_test = df_test.merge(df_test_label, how="left", on="id")
print(df_test.shape)

# Training comments (labels are already in the same file).
df_train = pd.read_csv(toxic_comment_path + 'train.csv.zip')

# Stack train + test into a single frame whose comment column is `text`.
df = pd.concat([df_train, df_test])
df = df.rename(columns={'comment_text': 'text'})
df = df.reset_index(drop=True)
print(df.shape)

# The intermediate frames are no longer needed; release their memory.
del df_train, df_test, df_test_label
gc.collect()

df.head()

In [None]:
# Validation data 

# Load the validation pairs (columns `more_toxic` / `less_toxic`).
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)

# Extract each side of the pair as a one-column frame named `text`.
df_more, df_less = (
    df_val[[side]].rename(columns={side: 'text'}).reset_index(drop=True)
    for side in ('more_toxic', 'less_toxic')
)

# Stack both sides and keep the first occurrence of every distinct comment.
df_val_unique = pd.concat([df_more, df_less])
df_val_unique = df_val_unique.drop_duplicates(subset='text', keep='first')

print(df_val_unique.shape)
df_val_unique.head()

In [None]:
# Inner-join on `text` to find the rows of `df` whose comment also appears in
# the validation set, and keep only their ids.
duplicate_index = df.merge(df_val_unique, how='inner', on='text')['id']
print(duplicate_index.shape[0])

In [None]:
# Keep the rows whose id is NOT listed in `duplicate_index`, then renumber.
jig1_no_jig4_dup_df = (
    df.loc[~df['id'].isin(duplicate_index)]
    .reset_index(drop=True)
)

print(jig1_no_jig4_dup_df.shape)
jig1_no_jig4_dup_df.head()

In [None]:
# Persist the de-duplicated jigsaw1 frame (row index is not written).
jig1_no_jig4_dup_df.to_csv(
    path_or_buf='jig1_no_jig4_dup_df.csv',
    index=False,
)

In [None]:
# jigsaw2

In [None]:
# Load only the listed columns of the "Unintended Bias" data and rename the
# comment column to `text`.
df = pd.read_csv(
    '../input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv',
    usecols=[
        # identifier and raw comment
        'id', 'comment_text',
        # toxicity-related columns (grouped by name)
        'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
        'identity_attack', 'insult', 'threat',
        # identity-related columns (grouped by name)
        'male', 'female', 'transgender', 'other_gender',
        'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
        'other_sexual_orientation',
        'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist',
        'other_religion',
        'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
        'physical_disability', 'intellectual_or_learning_disability',
        'psychiatric_or_mental_illness', 'other_disability',
    ],
)
df = df.rename(columns={'comment_text': 'text'})

# Ids of rows whose comment text also appears in the validation set.
duplicate_index = df.merge(df_val_unique, how='inner', on='text')['id']
print(duplicate_index.shape[0])

# Drop those rows and renumber the index.
jig2_no_jig4_dup_df = (
    df.loc[~df['id'].isin(duplicate_index)]
    .reset_index(drop=True)
)

print(jig2_no_jig4_dup_df.shape)
jig2_no_jig4_dup_df.head()

In [None]:
# Peek at the last rows that have a value in `homosexual_gay_or_lesbian`.
jig2_no_jig4_dup_df.loc[
    jig2_no_jig4_dup_df['homosexual_gay_or_lesbian'].notna()
].tail()

In [None]:
# Persist the de-duplicated jigsaw2 frame (row index is not written).
jig2_no_jig4_dup_df.to_csv(
    path_or_buf='jig2_no_jig4_dup_df.csv',
    index=False,
)

In [None]:
# ruddit

In [None]:
# Load the Ruddit data and map its columns onto the shared names used here:
# comment_id -> id, txt -> text, offensiveness_score -> y.
df = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv')
df = df.rename(
    columns={'comment_id': 'id', 'txt': 'text', 'offensiveness_score': 'y'}
)
print(df.shape)

# Ids of rows whose comment text also appears in the validation set.
duplicate_index = df.merge(df_val_unique, how='inner', on='text')['id']
print(duplicate_index.shape[0])

# Drop those rows and renumber the index.
ruddit_no_jig4_dup_df = (
    df.loc[~df['id'].isin(duplicate_index)]
    .reset_index(drop=True)
)

print(ruddit_no_jig4_dup_df.shape)
ruddit_no_jig4_dup_df.head()

In [None]:
# Persist the de-duplicated Ruddit frame (row index is not written).
ruddit_no_jig4_dup_df.to_csv(
    path_or_buf='ruddit_no_jig4_dup_df.csv',
    index=False,
)

In [None]:
# "good score" training dataset (jigsaw-rate-severity-good-score-train-dataset)

In [None]:
# Load the "good score" training data; `Unnamed: 0.1` is used as the row id.
df = pd.read_csv('../input/jigsaw-rate-severity-good-score-train-dataset/train_data.csv')
df = df.rename(columns={'Unnamed: 0.1': 'id'})

# Discard the other unnamed column.
df = df.drop(columns=['Unnamed: 0'])
print(df.shape)

# Ids of rows whose comment text also appears in the validation set.
duplicate_index = df.merge(df_val_unique, how='inner', on='text')['id']
print(duplicate_index.shape[0])

# Drop those rows and renumber the index.
good_no_jig4_dup_df = (
    df.loc[~df['id'].isin(duplicate_index)]
    .reset_index(drop=True)
)

print(good_no_jig4_dup_df.shape)
good_no_jig4_dup_df.head()

In [None]:
# Persist the de-duplicated "good score" frame (row index is not written).
good_no_jig4_dup_df.to_csv(
    path_or_buf='good_no_jig4_dup_df.csv',
    index=False,
)