In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Imports**

In [None]:
import os
import shutil
import pandas as pd
import numpy as np
# import keras
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optmizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
# import spacy

import warnings
# Silence Python warnings and restrict the TensorFlow logger to ERROR-level
# messages so cell output stays readable.
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# Relax pandas' display limits so wide frames and long text cells are shown
# without truncation.
for option_name, option_value in (
    ("display.max_columns", 150),
    ('display.max_rows', 999),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# **Load data**

In [None]:
!unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip

## **Train data**

In [None]:
# Read the training CSV from the working directory and preview its first rows.
train = pd.read_csv("/kaggle/working/train.csv")
train.head()

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Count rows that are exact duplicates of an earlier row. Series.sum() is
# vectorised, whereas the builtin sum() iterates the boolean Series one
# element at a time in Python.
print('Number of duplicated rows:', train.duplicated().sum())

In [None]:
# Display the rows at positions 58 and 59 (all columns).
# NOTE(review): the reason for picking these two rows is not stated — confirm.
train.iloc[58:60]

## **Distributions of classes**

<font size="5">Clearly, all classes are imbalanced</font>

In [None]:
# Number of rows for each distinct value of the `toxic` column.
train['toxic'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `toxic` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'toxic': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'toxic' became 'toxic'/'count'), so on newer pandas it renamed the
# wrong column and `groupby('Target')` raised KeyError.
target = (
    train['toxic']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Toxic']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=5)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [toxic]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

In [None]:
# Number of rows for each distinct value of the `severe_toxic` column.
train['severe_toxic'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `severe_toxic` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'severe_toxic': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'severe_toxic' became 'severe_toxic'/'count'), so on newer pandas it
# renamed the wrong column and `groupby('Target')` raised KeyError.
target = (
    train['severe_toxic']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Severe_toxic']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=1)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [severe_toxic]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

In [None]:
# Number of rows for each distinct value of the `obscene` column.
train['obscene'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `obscene` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'obscene': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'obscene' became 'obscene'/'count'), so on newer pandas it renamed
# the wrong column and `groupby('Target')` raised KeyError.
target = (
    train['obscene']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Obscene']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=5)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [obscene]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

In [None]:
# Number of rows for each distinct value of the `threat` column.
train['threat'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `threat` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'threat': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'threat' became 'threat'/'count'), so on newer pandas it renamed
# the wrong column and `groupby('Target')` raised KeyError.
target = (
    train['threat']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Threat']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=1)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [threat]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

In [None]:
# Number of rows for each distinct value of the `insult` column.
train['insult'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `insult` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'insult': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'insult' became 'insult'/'count'), so on newer pandas it renamed
# the wrong column and `groupby('Target')` raised KeyError.
target = (
    train['insult']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Insult']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=5)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [insult]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

In [None]:
# Number of rows for each distinct value of the `identity_hate` column.
train['identity_hate'].value_counts()

In [None]:
# Build a (Target, Count) summary of the `identity_hate` column and draw it as a pie.
#
# `rename_axis` + `reset_index(name=...)` name both columns explicitly. The
# previous `.to_frame().reset_index().rename(columns={'index': ..., 'identity_hate': ...})`
# relied on the default column names, which changed in pandas 2.0
# ('index'/'identity_hate' became 'identity_hate'/'count'), so on newer pandas
# it renamed the wrong column and `groupby('Target')` raised KeyError.
target = (
    train['identity_hate']
    .value_counts()
    .rename_axis('Target')
    .reset_index(name='Count')
    # Relabel only the key column; a frame-wide replace would also rewrite any
    # count that happened to equal 0 or 1.
    .assign(Target=lambda counts: counts['Target'].replace([0, 1], ['Normal', 'Identity_hate']))
    # groupby sorts by label, which fixes the row order passed to the chart.
    .groupby('Target')
    .sum()
    .reset_index()
)

fig = go.Figure(data=[go.Pie(labels=target['Target'],
                             values=target['Count'])])

# Show the label on each slice; percentage and raw count appear on hover.
fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=['#8cb074', '#5a7c47'], line=dict(color='white', width=1)))

fig.update_layout(showlegend=False,
                  title_text="Target Distribution [identity_hate]",
                  title_x=0.5,
                  font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=20, color='#000000'))


fig.show()

# **Exploring Test data**

In [None]:
# Read the test-set labels from the working directory and show the first 100 rows.
test_labels = pd.read_csv("/kaggle/working/test_labels.csv")
test_labels.head(100)

In [None]:
# Unique ids of test rows whose `toxic` value is not -1.
# NOTE(review): presumably -1 marks rows without a usable label — confirm
# against the dataset description.
test_labels_unique = test_labels.loc[test_labels['toxic'].ne(-1), 'id'].unique()

In [None]:
# How many distinct test ids have a `toxic` value other than -1.
len(test_labels_unique)

In [None]:
# Read the test CSV from the working directory and preview its last rows.
test = pd.read_csv("/kaggle/working/test.csv")
test.tail()

In [None]:
# Count rows that are exact duplicates of an earlier row. Series.sum() is
# vectorised, whereas the builtin sum() iterates the boolean Series one
# element at a time in Python.
print('Number of duplicated rows:', test.duplicated().sum())

In [None]:
# Summarise the test frame: column names, non-null counts and dtypes.
test.info()