In [None]:
import pandas as pd
import numpy as np

import re 
import scipy
from scipy import sparse
import gc 

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import warnings
# Silence every warning for the rest of the session. NOTE: this also hides
# pandas FutureWarning / deprecation messages raised by later cells.
warnings.filterwarnings("ignore")
# Show up to 300 characters per cell so long comments are readable in previews.
pd.options.display.max_colwidth=300
# Show up to 100 columns before pandas truncates the frame display.
pd.options.display.max_columns = 100

In [None]:
# Number of resampled training files create_folds writes per dataset.
n_folds = 7

# Fraction of rows (or of the y > 0 rows, in the balanced branch) drawn for each fold.
frac_1 = 0.7
# Rows with y == 0 drawn per fold = (number of y > 0 rows) * frac_1 * frac_1_factor.
frac_1_factor = 1.3

In [None]:
def create_folds(train_path_name, data=None, out_dir='/kaggle/working'):
    """Write ``n_folds`` resampled training CSVs built from a text/score frame.

    Every fold is an independent random draw seeded with ``10 * (fold + 1)``;
    the folds are not a partition, so a row may appear in several files.

    * ``train_path_name == 'dfr_fld'``: a plain ``frac_1`` sample of all rows.
    * any other name: a ``frac_1`` sample of the rows with ``y > 0`` plus
      ``int(len(positives) * frac_1 * frac_1_factor)`` rows with ``y == 0``,
      shuffled together. Rows with ``y < 0`` (if any) are never selected.

    Parameters
    ----------
    train_path_name : str
        File-name prefix; fold ``k`` is saved as
        ``{out_dir}/{train_path_name}{k}.csv``.
    data : pandas.DataFrame, optional
        Frame containing a ``y`` column. When omitted, the notebook-global
        ``df`` is used (the original behaviour).
    out_dir : str, optional
        Directory the CSV files are written to.

    Returns
    -------
    None
        The function writes files and prints the shape and ``y`` value counts
        of every fold.
    """
    frame = df if data is None else data
    balanced = train_path_name != 'dfr_fld'

    if balanced:
        # The split and the negative sample size do not depend on the fold,
        # so compute them once instead of once per iteration.
        positives = frame[frame.y > 0]
        negatives = frame[frame.y == 0]
        n_negatives = int(len(positives) * frac_1 * frac_1_factor)
        if n_negatives > len(negatives):
            # sample(n=...) without replacement raises ValueError when asked
            # for more rows than exist; fall back to using every negative row.
            print(f'Requested {n_negatives} rows with y == 0 but only '
                  f'{len(negatives)} exist; using all of them.')
            n_negatives = len(negatives)

    for fld in range(n_folds):
        print(f'Fold: {fld}')
        seed = 10 * (fld + 1)

        if balanced:
            tmp_df = pd.concat([positives.sample(frac=frac_1, random_state=seed),
                                negatives.sample(n=n_negatives, random_state=seed)],
                               axis=0).sample(frac=1, random_state=seed)
        else:
            tmp_df = frame.sample(frac=frac_1, random_state=seed)

        tmp_df.to_csv(f'{out_dir}/{train_path_name}{fld}.csv', index=False)
        print(tmp_df.shape)
        print(tmp_df['y'].value_counts())

In [None]:
def clean(data, col):
    """Normalise the text in ``data[col]`` and return ``data``.

    The column is overwritten on the frame that was passed in, so the caller's
    object is modified; the same frame is returned for convenience.

    All pattern-based substitutions pass ``regex=True`` explicitly. Without it,
    pandas >= 2.0 treats the patterns as literal strings and the cleaning
    silently does nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``data[col]`` replaced by the cleaned text.
    """
    text = data[col]

    # Clean some punctuation: pad newlines with spaces (literal replacement).
    text = text.str.replace('\n', ' \n ', regex=False)
    # Remove IP-address-like runs of dot-separated digit groups.
    text = text.str.replace(r'(([0-9]+\.){2,}[0-9]+)', ' ', regex=True)
    # Split "word<punct>word" (e.g. "a/b", "end.Start") into "word <punct> word".
    text = text.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', regex=True)
    # Collapse 4+ repeats of * ! ? ' down to exactly 3.
    text = text.str.replace(r'([*!?\'])\1\1{2,}', r'\1\1\1', regex=True)
    # Collapse 3+ repeats of a letter at the end of a word down to 2 ...
    text = text.str.replace(r'([a-zA-Z])\1{2,}\b', r'\1\1', regex=True)
    # ... and 4+ repeats of a letter inside a word down to 3.
    text = text.str.replace(r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1', regex=True)
    # Squeeze runs of spaces and trim both ends.
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()
    # Add a space on each side of every run of * ! ? ' characters.
    text = text.str.replace(r'([*!?\']+)', r' \1 ', regex=True)

    data[col] = text
    return data

# Toxic comment classification

In [None]:
# Load the Jigsaw toxic-comment training set and report its shape.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

# Preview five random comments carrying each of the six labels.
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(f'****** {col} *******')
    is_flagged = df[col] == 1
    display(df.loc[is_flagged, ['comment_text', col]].sample(5))

In [None]:
# Give more weight to severe toxic 
df['severe_toxic'] = df.severe_toxic * 2
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
df['y'] = df['y']/df['y'].max()

df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(5)

In [None]:
# Show how many rows fall on each distinct value of the scaled score y.
df['y'].value_counts()

In [None]:
# Write the resampled fold CSVs for the uncleaned text; files are prefixed 'df_fld'.
create_folds('df_fld')

# Toxic comment classification (cleaned text)

In [None]:
# Normalise the 'text' column; clean() overwrites the column on df and returns df.
df = clean(df,'text')

In [None]:
# Write the fold CSVs again, now from the cleaned text; files are prefixed 'df_clean_fld'.
create_folds('df_clean_fld')

# Toxic tweet

In [None]:
# Load the toxic-tweets dataset (replaces the previous df) and report its shape.
df = pd.read_csv('../input/toxic-tweets-dataset/FinalBalancedDataset.csv')
print(df.shape)

In [None]:
# Keep only the label and the tweet, renamed to the y/text names the helpers expect.
df = df.loc[:, ['Toxicity', 'tweet']].rename(columns={'Toxicity': 'y', 'tweet': 'text'})
label_counts = df['y'].value_counts()
print(label_counts)
df.sample(5)

In [None]:
# Normalise the tweet text; clean() overwrites the column on df and returns df.
df = clean(df,'text')

In [None]:
# Write the fold CSVs for the cleaned tweets; files are prefixed 'df_tweets_fld'.
create_folds('df_tweets_fld')

# Jigsaw multilingual data (cleaned text)

In [None]:
# Load the training file from the Jigsaw multilingual competition and report its shape.
df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
print(df.shape)

# Preview five random comments carrying each of the six labels.
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(f'****** {col} *******')
    is_flagged = df[col] == 1
    display(df.loc[is_flagged, ['comment_text', col]].sample(5))

In [None]:
# Give more weight to severe toxic 
df['severe_toxic'] = df.severe_toxic * 2
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
df['y'] = df['y']/df['y'].max()

df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.y.value_counts()

In [None]:
# Normalise the comment text; clean() overwrites the column on df and returns df.
df = clean(df,'text')

In [None]:
# Write the fold CSVs for the cleaned multilingual-competition data; files are prefixed 'dfm_fld'.
create_folds('dfm_fld')

# Ruddit data

In [None]:
# Load the Ruddit dataset and report its shape.
df = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(df.shape)

# Keep the comment text and its offensiveness score under the shared text/y names.
df = df[['txt', 'offensiveness_score']].rename(
    columns={'txt': 'text', 'offensiveness_score': 'y'}
)

# Min-max scale the score so it spans [0, 1].
y_min, y_max = df.y.min(), df.y.max()
df['y'] = (df['y'] - y_min) / (y_max - y_min)
df.y.hist()

In [None]:
# 'dfr_fld' selects create_folds' plain random-sample branch (no y > 0 / y == 0 balancing).
create_folds('dfr_fld')