In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import StratifiedKFold

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Set Seed for Reproducibility</h1></span>

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed`` and
            written (as a string) to ``PYTHONHASHSEED``. Defaults to 42.

    Note:
        Python reads ``PYTHONHASHSEED`` at interpreter start-up, so setting it
        here does not change string hashing in the already-running kernel.
    """
    import os
    import random
    import numpy as np

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seed()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Loading Datasets</h1></span>

In [None]:
# Locations of the two training CSVs inside the Kaggle input directory.
toxic_csv = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv"
unintended_bias_csv = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv"

# Load each file into its own DataFrame.
jigsaw_toxic = pd.read_csv(toxic_csv)
jigsaw_unintended_bias = pd.read_csv(unintended_bias_csv)

In [None]:
# Keep only comments with more than 5 toxicity annotators. The explicit .copy()
# makes the result an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
jigsaw_unintended_bias = jigsaw_unintended_bias[jigsaw_unintended_bias.toxicity_annotator_count>5].copy()

# Tag every row with the name of its source dataset; a scalar is broadcast to
# all rows, so no per-row list has to be built.
jigsaw_toxic['dataset'] = 'jigsaw_toxic'
jigsaw_unintended_bias['dataset'] = 'jigsaw_unintended_bias'

In [None]:
# Report (rows, columns) for each frame, one tuple per line.
for frame in (jigsaw_toxic, jigsaw_unintended_bias):
    print(frame.shape)

In [None]:
# Preview the first rows of the toxic-comment frame (including the new 'dataset' column).
jigsaw_toxic.head()

In [None]:
# Preview the first rows of the filtered unintended-bias frame.
jigsaw_unintended_bias.head()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Selecting Features</h1></span>

In [None]:
# Reduce both frames to the same set of columns. Each result is an independent
# copy (explicit .copy() / the new frame returned by .rename), because later
# cells assign into these frames and would otherwise raise SettingWithCopyWarning.
jigsaw_toxic = jigsaw_toxic[['id', 'dataset' ,'comment_text', 'toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate']].copy()

# The unintended-bias data names two labels differently; rename them by key so
# the mapping does not depend on column order.
jigsaw_unintended_bias = (
    jigsaw_unintended_bias[['id', 'dataset' ,'comment_text', 'toxic', 'severe_toxicity', 'obscene','threat','insult','identity_attack']]
    .rename(columns={'severe_toxicity': 'severe_toxic',
                     'identity_attack': 'identity_hate'})
)

In [None]:
# Report (rows, columns) for each frame after column selection.
for frame in (jigsaw_toxic, jigsaw_unintended_bias):
    print(frame.shape)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Multiplication factors for categories</h1></span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;"> Reference from @ekaterinadranitsyna <a href="https://www.kaggle.com/ekaterinadranitsyna/regression-ensemble-lb-0-78">notebook</a></span>

In [None]:
# Per-category weight applied to each label column before the columns are
# summed into a score (insertion order is the order the columns are processed).
cat_mtpl = dict(
    toxic=1,
    severe_toxic=1.75,
    obscene=0.95,
    threat=2,
    insult=1.6,
    identity_hate=1.95,
)

# Names given to the two outcomes of a boolean test when building the target column.
target_buckets = {True: 'toxic', False: 'non_toxic'}

In [None]:
# Label columns that have a multiplier and are present in jigsaw_toxic,
# in the order they appear in cat_mtpl.
features = [name for name in cat_mtpl if name in jigsaw_toxic.columns]

# Scale each label column of jigsaw_toxic in place by its multiplier.
# NOTE(review): this mutates jigsaw_toxic, so re-running the cell applies the
# factors again; run it exactly once per kernel session.
# NOTE(review): only jigsaw_toxic is scaled here; jigsaw_unintended_bias is not —
# confirm that is intended.
for name in features:
    weight = cat_mtpl[name]
    print(name, weight)
    jigsaw_toxic[name] *= weight

In [None]:
# Score = row-wise total of the label columns listed in `features`.
label_values = jigsaw_toxic[features]
jigsaw_toxic['score'] = label_values.sum(axis='columns')

In [None]:
# Row-wise total of the label columns listed in `features`.
label_sum = jigsaw_unintended_bias[features].sum(axis='columns')

# Rows whose 'toxic' value is below 0.5.
jigsaw_unintended_bias_mask_less_toxic = jigsaw_unintended_bias['toxic'].lt(0.5)

# Those rows take their 'toxic' value as the score; all other rows keep the sum.
jigsaw_unintended_bias['score'] = label_sum.mask(
    jigsaw_unintended_bias_mask_less_toxic,
    jigsaw_unintended_bias['toxic'],
)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Merging Datasets</h1></span>

In [None]:
# Stack the two frames row-wise (each keeps its own row labels) and show the combined shape.
frames = [jigsaw_toxic, jigsaw_unintended_bias]
jigsaw_all_data = pd.concat(frames, axis=0)
jigsaw_all_data.shape

In [None]:
# A score strictly greater than 1 is labelled 'toxic'; everything else 'non_toxic'.
score_above_one = jigsaw_all_data['score'].gt(1)
jigsaw_all_data['target'] = score_above_one.map(target_buckets)

In [None]:
# Preview the merged frame with its 'score' and 'target' columns.
jigsaw_all_data.head()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Dropping duplicates</h1></span>

In [None]:
# Number of rows whose comment_text repeats an earlier row. Series.sum() counts
# the True values in vectorised form instead of iterating the Series element by
# element with the builtin sum().
jigsaw_all_data.duplicated(subset=['comment_text']).sum()

In [None]:
# Keep the first row for each distinct comment_text, then renumber rows 0..n-1.
deduplicated = jigsaw_all_data.drop_duplicates(subset=['comment_text'], keep='first')
jigsaw_all_data = deduplicated.reset_index(drop=True)

In [None]:
# Shape of the merged frame after duplicates were removed.
jigsaw_all_data.shape

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Making KFolds using Score</h1></span>

In [None]:
# Number of equal-width score bins: floor(1 + log2(n)) (Sturges' formula).
num_bins = int(np.floor(1 + np.log2(len(jigsaw_all_data))))

# Integer bin id per row (labels=False returns bin indices, not intervals).
jigsaw_all_data['bins'] = pd.cut(jigsaw_all_data['score'],
                                 bins=num_bins,
                                 labels=False)

bins = jigsaw_all_data.bins.to_numpy()

# -1 marks rows that have not been assigned to a fold yet.
jigsaw_all_data['kfold_regression'] = -1
kfold = StratifiedKFold(n_splits=5,
                        shuffle=True,
                        random_state=42)

# split() yields *positional* row indices, so write through .iloc. Label-based
# .loc would only be correct while the index is a fresh 0..n-1 RangeIndex.
fold_col = jigsaw_all_data.columns.get_loc('kfold_regression')
for k, (train_idx, valid_idx) in enumerate(kfold.split(X=jigsaw_all_data, y=bins)):
    jigsaw_all_data.iloc[valid_idx, fold_col] = k

In [None]:
# Preview the frame with the new 'bins' and 'kfold_regression' columns.
jigsaw_all_data.head()

In [None]:
# Number of rows assigned to each regression fold.
jigsaw_all_data.kfold_regression.value_counts()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Making KFolds using Target</h1></span>

In [None]:
# -1 marks rows that have not been assigned to a fold yet.
jigsaw_all_data['kfold_classification'] = -1
kfold = StratifiedKFold(n_splits=5,
                        shuffle=True,
                        random_state=42)

# Stratify on the 'target' label. split() yields *positional* row indices, so
# write through .iloc; label-based .loc would only be correct while the index
# is a fresh 0..n-1 RangeIndex.
fold_col = jigsaw_all_data.columns.get_loc('kfold_classification')
for k, (train_idx, valid_idx) in enumerate(kfold.split(X=jigsaw_all_data, y=jigsaw_all_data.target)):
    jigsaw_all_data.iloc[valid_idx, fold_col] = k

In [None]:
# Number of rows assigned to each classification fold.
jigsaw_all_data.kfold_classification.value_counts()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">How the Score Is Distributed Across Folds</h1></span>

In [None]:
# Stacked KDE of the score per regression fold, restricted to rows that came
# from the jigsaw_toxic dataset.
toxic_rows = jigsaw_all_data[jigsaw_all_data['dataset'] == 'jigsaw_toxic']
sns.displot(
    data=toxic_rows,
    x='score',
    hue='kfold_regression',
    kind='kde',
    multiple='stack',
    alpha=.7,
    linewidth=0,
    aspect=20/7,
);
plt.title('Jigsaw_toxic Dataset');

# sns.set(rc={'figure.figsize':(15,7)})

In [None]:
# Stacked KDE of the score per regression fold, restricted to rows that came
# from the jigsaw_unintended_bias dataset.
unintended_bias_rows = jigsaw_all_data[jigsaw_all_data['dataset'] == 'jigsaw_unintended_bias']
sns.displot(
    data=unintended_bias_rows,
    x='score',
    hue='kfold_regression',
    kind='kde',
    multiple='stack',
    alpha=.7,
    linewidth=0,
    aspect=20/7,
);
plt.title('jigsaw_unintended_bias Dataset');


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">How the Score Is Distributed Across Bins</h1></span>

In [None]:
# Stacked KDE of the score, one layer per score bin. warn_singular=False
# silences the warning seaborn emits for groups whose density cannot be estimated.
score_by_bin_kwargs = dict(
    x='score',
    hue='bins',
    kind='kde',
    multiple='stack',
    palette="viridis",
    alpha=0.5,
    linewidth=0,
    aspect=20/7,
    warn_singular=False,
)
sns.displot(data=jigsaw_all_data, **score_by_bin_kwargs);
# sns.set(rc={'figure.figsize':(15,7)})

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Saving the Dataset</h1></span>

In [None]:
# Columns written to the output CSV (the 'bins' helper and the raw label columns are left out).
export_columns = ['id', 'dataset', 'comment_text','score', 'target','kfold_regression', 'kfold_classification']

# NOTE(review): with the default index=True the row index is also written as an
# unnamed first column; pass index=False if no downstream reader relies on it.
jigsaw_all_data[export_columns].to_csv('jigsaw_training_data.csv')

In [None]:
# List the current working directory (the CSV above is written there).
!ls