In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory
# and print the paths one per line, in os.walk order.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

!pip install texthero
!pip install tweet-preprocessor


In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt 

In [None]:
# Load the train and test splits straight from the zipped CSV files.
train_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

# Split the column index by position: position 1 is used as the text feature,
# everything from position 2 onwards as the targets.
# NOTE(review): assumes the CSV layout is [id, text, labels...] — confirm against the printout below.
feature_col = train_df.columns[1:2]
target_col = train_df.columns[2:]

print(train_df.columns, train_df.shape)
train_df.head()

In [None]:
# Display which columns were selected as targets and as the text feature.
target_col , feature_col

In [None]:
# Preview the first rows of the test split.
test_df.head()

## Data Preprocessing 

In [None]:
import re 
import nltk
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob,Word
from nltk.corpus import words
nltk.download('words')
import texthero as hero
import re
from texthero import stopwords

from nltk.corpus import wordnet

import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import tensorflow as tf


def lemma_per_pos(sent):
    '''Tokenize `sent` with TweetTokenizer, lemmatize every token and re-join them.

    NOTE(review): despite the name, no part-of-speech tag is passed to
    `lemmatize`, so every token goes through the lemmatizer's default
    setting — confirm whether POS-aware lemmatization was intended.

    sent: text to lemmatize.
    RETURN: the lemmatized tokens joined by single spaces.
    '''
    tokens = TweetTokenizer().tokenize(sent)
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    return " ".join(map(wordnet_lemmatizer.lemmatize, tokens))

def df_preprocessing(df,feature_col):
    '''
    Clean the text column `feature_col` of `df` with texthero.

    Steps, in order: lowercase, remove HTML tags, remove URLs, remove digits,
    remove punctuation. HTML tags are removed *before* punctuation because tag
    detection needs the `<` and `>` characters; once punctuation has been
    stripped there is nothing left for the tag pattern to match.

    df: pandas DataFrame containing the text column.
    feature_col: name of the text column to clean.
    RETURN: a new DataFrame whose `feature_col` holds the cleaned text; the
    frame passed in is left unmodified, so the call can be re-run safely.
    '''
    cleaned_text = (df[feature_col]
                    .pipe(hero.lowercase)
                    .pipe(hero.remove_html_tags)
                    .pipe(hero.remove_urls)
                    .pipe(hero.remove_digits)
                    .pipe(hero.remove_punctuation))
    # Optional steps, currently disabled:
    #   lemmatization:     cleaned_text = cleaned_text.map(lemma_per_pos)
    #   stop-word removal: cleaned_text = hero.remove_stopwords(cleaned_text)

    # Work on a copy so the caller's raw frame stays available.
    processed = df.copy()
    processed[feature_col] = cleaned_text
    return processed

In [None]:
# Clean the training text. This is pure pandas/texthero work, so it is not
# wrapped in a TensorFlow device scope (no TensorFlow op runs here).
proc_train_df = df_preprocessing(train_df, feature_col[0])

In [None]:
# Apply the same cleaning function to the test split that was used for the training split.
proc_test_df = df_preprocessing(test_df,feature_col[0])

## Length Statistics

In [None]:
# Number of whitespace-separated tokens in each cleaned training text.
proc_train_df['len'] = proc_train_df[feature_col[0]].str.split().map(len)

token_counts = proc_train_df['len']
print('Max length: {}, Min length: {}, Average Length :{}'.format(
    max(token_counts), min(token_counts), int(token_counts.mean())))

# Histogram of the token counts.
token_counts.hist()
plt.show()

In [None]:
# NOTE: the length filter below is disabled, so these statistics describe the
# full, unfiltered training frame.
# proc_train_df= proc_train_df[(proc_train_df.len<=512) & (proc_train_df.len>=10)].reset_index(drop=True)
comment_lengths = proc_train_df['len']
length_summary = 'Max length: {}, Min length: {}, Average Length :{}'.format(
    max(comment_lengths), min(comment_lengths), int(comment_lengths.mean()))
print(length_summary)

# Frame shape and length distribution.
print(proc_train_df.shape)
comment_lengths.hist()
plt.show()

## Checking the distribution of the labels

In [None]:
# Grouped histogram of the values of every label column.
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
plt.figure(figsize=(10,6))
plt.hist(x=[proc_train_df[name] for name in label_names], label=label_names)
# A legend and axis labels are needed to tell the six overlaid series apart.
plt.xlabel('Label value')
plt.ylabel('Number of rows')
plt.title('Distribution of each label')
plt.legend()
plt.show()

# BERT model declaration

In [None]:
from transformers import AutoTokenizer,TFDistilBertModel, DistilBertConfig
from transformers import TFAutoModel
import tensorflow as tf 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from transformers import AdamW, get_linear_schedule_with_warmup
import tensorflow_addons as tfa


#Creating tokenizer
def create_tokenizer(pretrained_weights='distilbert-base-uncased'):
  '''Load and return the pretrained tokenizer for `pretrained_weights`.

  pretrained_weights: model name or path handed to AutoTokenizer.from_pretrained.
  RETURN: the tokenizer instance.
  '''
  return AutoTokenizer.from_pretrained(pretrained_weights)

#Tokenization of the data
def data_tokenization(dataset,feature_col,max_len,tokenizer):
    '''Tokenize one text column of a dataframe into fixed-length model inputs.

    dataset: pandas DataFrame holding the text.
    feature_col: name of the text column to tokenize.
    max_len: every sequence is truncated/padded to exactly this many tokens.
    tokenizer: callable tokenizer (e.g. the one returned by `create_tokenizer`)
        that accepts a list of strings plus the `return_tensors`, `truncation`,
        `padding`, `max_length` and `add_special_tokens` keyword arguments and
        returns a mapping with 'input_ids' and 'attention_mask'.
    RETURN: [input_ids, attention_mask], two int32 numpy arrays of shape
        (n_rows, max_len).
    '''
    # Missing values become empty strings so a stray NaN cannot crash the tokenizer.
    texts = dataset[feature_col].fillna('').astype(str).tolist()
    if not texts:
        empty = np.zeros((0, max_len), dtype=np.int32)
        return [empty, empty.copy()]

    # Tokenize in batches instead of one call per row. Each batch already comes
    # back as a 2-D array, so no np.squeeze is needed (squeeze also removed the
    # batch axis when the frame had a single row). Chunking bounds the size of
    # the intermediate token lists for large frames.
    chunk_size = 10000
    id_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), chunk_size):
        encoded = tokenizer(texts[start:start + chunk_size],
                            return_tensors='np',
                            truncation=True,
                            padding='max_length',
                            max_length=max_len,
                            add_special_tokens=True)
        id_chunks.append(np.asarray(encoded['input_ids'], dtype=np.int32))
        mask_chunks.append(np.asarray(encoded['attention_mask'], dtype=np.int32))

    input_ids = np.concatenate(id_chunks, axis=0)
    attention_mask = np.concatenate(mask_chunks, axis=0)
    return [input_ids, attention_mask]

def bert_model(pretrained_weights,max_len,learning_rate):
  '''Build and compile a multi-label classifier on top of a pretrained language model.

  INPUT:
  pretrained_weights: name or path of the pretrained language model to load
  max_len: length of the input_ids / attention_mask sequences
  learning_rate: learning rate for the Adam optimizer
  RETURN: compiled Keras model that takes [input_ids, attention_mask] and emits
  six sigmoid outputs'''
  print('Model selected:', pretrained_weights)
  encoder = TFAutoModel.from_pretrained(pretrained_weights)

  # Mark every layer of the language model as trainable so it is fine-tuned
  # together with the classification head.
  for layer in encoder.layers:
    layer.trainable = True

  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999,
                                       epsilon=1e-07, amsgrad=False, name='Adam')

  # The encoder takes token ids plus an attention mask as input.
  input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
  attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

  encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
  # First element of the encoder outputs, sliced at sequence position 0
  # (presumably the [CLS] token's hidden state — confirm for the chosen model).
  x = encoder_outputs[0][:, 0, :]
  x = tf.keras.layers.Dropout(0.1)(x)
  # NOTE(review): the three Dense layers below have no activation, so together
  # they act as a single linear projection — consider adding a non-linearity.
  x = tf.keras.layers.Dense(128)(x)
  x = tf.keras.layers.Dense(64)(x)
  x = tf.keras.layers.Dense(32)(x)

  output = tf.keras.layers.Dense(6, activation='sigmoid')(x)

  model = Model(inputs=[input_ids, attention_mask], outputs=[output])
  # The output layer already applies a sigmoid, so the loss receives
  # probabilities and must use from_logits=False; from_logits=True would apply
  # a second sigmoid inside the loss.
  model.compile(optimizer=optimizer,
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False, reduction=tf.keras.losses.Reduction.NONE, name='binary_crossentropy'),
                metrics=['accuracy'])
  return model

In [None]:
# Run configuration
pretrained_weights = 'bert-base-uncased'  # checkpoint name passed to both the tokenizer and the model builder
max_len = 256         # token length every text is padded/truncated to
batch_size = 4        # batch size passed to fit
epochs = 3            # number of epochs passed to fit
learning_rate = 2e-5  # learning rate passed to the Adam optimizer

In [None]:
# Load the tokenizer for the checkpoint named in `pretrained_weights`.
tokenizer= create_tokenizer(pretrained_weights)

In [None]:
# Encode the cleaned training text as [input_ids, attention_mask], padded/truncated to max_len.
x_train= data_tokenization(proc_train_df,feature_col[0],max_len,tokenizer)

In [None]:
# Label matrix: one column per entry of target_col, one row per row of proc_train_df.
y_train= proc_train_df[target_col].values
y_train

In [None]:
# Build and compile the classifier, then print its layer summary.
# Note: `bert` is the full compiled Keras model returned by bert_model, not just the language model.
bert=bert_model(pretrained_weights,max_len,learning_rate)
bert.summary()

In [None]:
# Train the model on the tokenized training data.
# NOTE(review): the device string is hard-coded to the first GPU — confirm a GPU is attached before running.
with tf.device('/GPU:0'):
    bert.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,verbose=1)

In [None]:
# Keep the test ids for the submission file and encode the cleaned test text
# with the same tokenizer and max_len as the training text.
test_ids= proc_test_df['id']
x_test= data_tokenization(proc_test_df,feature_col[0],max_len,tokenizer)
x_test

In [None]:
# Predict the label probabilities for every test row.
preds = bert.predict(x_test)
submiss_df = pd.DataFrame(preds, columns=target_col)
# Attach the ids by position. Assigning the Series itself aligns on index
# labels, which would silently give NaN or misplaced ids if the test frame's
# index were ever not 0..n-1 (e.g. after filtering rows).
submiss_df['id'] = test_ids.values
submiss_df

In [None]:
# Put `id` first, followed by the label columns.
submission_columns = ['id', *target_col]
submiss_df = submiss_df[submission_columns]
submiss_df

In [None]:
# Write the predictions to the working directory under the correctly spelled file name.
submiss_df.to_csv('submission.csv', index=False, header=True)