In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, BatchNormalization, Embedding
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils, pad_sequences
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go 
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
# Disabled TPU bootstrap, kept as a bare string literal so none of it executes;
# the cell only echoes the text. It cannot simply be re-enabled as written: the
# trailing `else:` has no indented body and that branch never assigns `strategy`.
'''
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print('The TPU Cluster Resolver is '+ tpu.master())
except ValueError:
  print("The Value Error has occurred")
  tpu = None

if tpu:
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
print(f'The number of replicas are {strategy.num_replicas_in_sync}')
'''

'\ntry:\n  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n  print(\'The TPU Cluster Resolver is \'+ tpu.master())\nexcept ValueError:\n  print("The Value Error has occurred")\n  tpu = None\n\nif tpu:\n  tf.config.experimental_connect_to_cluster(tpu)\n  tf.tpu.experimental.initialize_tpu_system(tpu)\n  strategy = tf.distribute.experimental.TPUStrategy(tpu)\nelse:\nprint(f\'The number of replicas are {strategy.num_replicas_in_sync}\')\n'

In [4]:
# Read the three Jigsaw CSVs from the mounted Google Drive, in the order
# train, test, validation.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/jigsaw-toxic-comment-train.csv',
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/test.csv',
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/validation.csv',
    )
)

In [5]:
# Preview the raw training frame (the rendered output is truncated to the first and last rows).
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [6]:
# Keep only the binary `toxic` target. Rebinding instead of mutating in place,
# together with errors='ignore', makes the cell safe to re-run: a second run
# no longer raises KeyError because the columns are already gone.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [7]:
# (rows, columns) of the training frame at this point.
train.shape

(223549, 3)

In [8]:
# Restrict to a subset of the data. `.loc` slices by label and includes the end
# label, so this keeps index labels 0 through 12000.
train = train.loc[:12000]
train.shape

(12001, 3)

In [9]:
# Length of the longest comment, counted in whitespace-separated words.
train['comment_text'].apply(lambda x: len(str(x).split())).max()

1403

In [10]:
def roc_auc(prediction, targets):
  """Return the area under the ROC curve for a set of binary predictions.

  Args:
    prediction: predicted scores for the positive class (passed to
      `metrics.roc_curve` as the score argument).
    targets: true binary labels.

  Returns:
    The AUC computed by `metrics.auc` over the (fpr, tpr) points.
  """
  # roc_curve returns (fpr, tpr, thresholds) in that order; thresholds are unused.
  fpr, tpr, _ = metrics.roc_curve(targets, prediction)
  return metrics.auc(fpr, tpr)

In [11]:
# Shuffled 80/20 split, stratified on the toxic label, with a fixed seed.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    stratify=train['toxic'].values,
    random_state=42,
)

Recurrent Neural Networks

In [12]:
# Padding width for every sequence (the longest comment measured above has 1403 words).
max_len = 1500

# Fit the vocabulary on the training and validation comments together.
token = text.Tokenizer(num_words=None)
token.fit_on_texts([*xtrain, *xvalid])

# Comments -> lists of word ids -> fixed-width integer matrices.
xtrain_seq, xvalid_seq = token.texts_to_sequences(xtrain), token.texts_to_sequences(xvalid)
xtrain_pad, xvalid_pad = pad_sequences(xtrain_seq, maxlen=max_len), pad_sequences(xvalid_seq, maxlen=max_len)

# word -> id mapping, used to size the embedding layers.
word_index = token.word_index

In [13]:
# Use whichever distribution strategy is currently active (the TPU setup cell above is disabled).
strategy = tf.distribute.get_strategy()

In [14]:
%%time
with strategy.scope():
  model = Sequential()
  model.add(Embedding(len(word_index)+1, 300, input_length=max_len))
  model.add(SimpleRNN(100))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: user 450 ms, sys: 109 ms, total: 560 ms
Wall time: 692 ms


In [16]:
# Train the SimpleRNN; the batch size is 64 per replica of the active strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
  1/150 [..............................] - ETA: 5:22 - loss: 0.4839 - accuracy: 0.8906

KeyboardInterrupt: ignored

In [None]:
# Score the held-out split and report its ROC AUC.
scores = model.predict(xvalid_pad)
print(f'The roc scores are {roc_auc(scores, yvalid): .2f}')

In [None]:
# Running list of per-model validation AUCs, starting with the SimpleRNN result.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)}]

Word Embeddings

In [None]:
#from google.colab import files

#uploaded = files.upload()

#for fn in uploaded.keys():
#  print('User uploaded file "{name}" with length {length} bytes'.format(
#      name=fn, length=len(uploaded[fn])))

In [17]:
!wget http://www-nlp.stanford.edu/data/glove.840B.300d.zip

--2023-06-12 08:13:47--  http://www-nlp.stanford.edu/data/glove.840B.300d.zip
Resolving www-nlp.stanford.edu (www-nlp.stanford.edu)... 171.64.67.140
Connecting to www-nlp.stanford.edu (www-nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2023-06-12 08:13:47--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2023-06-12 08:13:48--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTT

In [None]:
!unzip glove.840B.300d.zip

Archive:  glove.840B.300d.zip
replace glove.840B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Sanity check: list the working directory and print its path.
!ls
!pwd

In [None]:
# Parse the GloVe text file into {token: vector}.
embeddings_index = {}
glove_dim = 300  # number of coefficients per line in glove.840B.300d
# `with` closes the file even if parsing a line raises.
with open('/content/glove.840B.300d.txt', 'r', encoding='utf-8') as f:
  for line in tqdm(f):
    # Split from the right: the last `glove_dim` fields are the coefficients and
    # whatever precedes them is the token. The 840B file is known to contain a
    # few tokens with embedded spaces, which break a plain left-to-right split.
    values = line.rstrip('\n').rsplit(' ', glove_dim)
    key = values[0]
    # float32 halves memory versus float64 and matches what the Embedding layer stores.
    coeff = np.asarray(values[1:], dtype='float32')
    embeddings_index[key] = coeff

In [None]:
# Number of tokens loaded from the GloVe file.
print(f'Found {len(embeddings_index)} word vectors')

Long Short Term Memory

In [None]:
# One 300-d row per tokenizer id (row 0 is never assigned); words with no
# GloVe entry keep their all-zero row.
embedding_matrix = np.zeros((len(word_index)+1, 300))
for word, row in tqdm(word_index.items()):
  glove_vector = embeddings_index.get(word)
  if glove_vector is None:
    continue
  embedding_matrix[row] = glove_vector

In [None]:
%%time
with strategy.scope():
  model = Sequential()
  model.add(Embedding(len(word_index)+1, 300, weights=[embedding_matrix], input_length=max_len, trainable=False ))
  model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the LSTM model; the batch size is 64 per replica of the active strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

In [None]:
# Predicted probabilities for the held-out split (overwrites the previous model's scores).
scores = model.predict(xvalid_pad)

In [None]:
# Report the LSTM's validation ROC AUC.
print(f'The roc_score is {(roc_auc(scores, yvalid)): .2f}')

In [None]:
# Add the LSTM result to the comparison list (re-running this cell appends a duplicate entry).
scores_model.append({'Model':'LSTM', 'AUC_Score': roc_auc(scores, yvalid)})

In [None]:
# Show the results collected so far.
scores_model

Gated Recurrent Unit

In [None]:
%%time
with strategy.scope():
  model = Sequential()
  model.add(Embedding(
      len(word_index)+1,
      300,
      weights=[embedding_matrix],
      input_length=max_len,
      trainable=False
  ))
  model.add(SpatialDropout1D(0.3))
  model.add(GRU(300))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the GRU model; the batch size is 64 per replica of the active strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

In [None]:
# Predicted probabilities for the held-out split (overwrites the previous model's scores).
scores = model.predict(xvalid_pad)

In [None]:
# Report the GRU's validation ROC AUC.
print(f'The roc_auc score is {roc_auc(scores, yvalid)}')

In [None]:
# Add the GRU result to the comparison list (re-running this cell appends a duplicate entry).
scores_model.append({'Model':'GRU', 'AUC_Score':roc_auc(scores, yvalid)})

In [None]:
# Show the results collected so far.
scores_model

Bidirectional RNN

In [None]:
%%time
with strategy.scope():
  model = Sequential()
  model.add(Embedding(len(word_index)+1, 300, weights=[embedding_matrix], input_length=max_len, trainable=False))
  model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the bidirectional model; the batch size is 64 per replica of the active strategy.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

In [None]:
# Predicted probabilities for the held-out split (overwrites the previous model's scores).
scores = model.predict(xvalid_pad)

In [None]:
# Report the bidirectional model's validation ROC AUC.
print(f'The auc score is {roc_auc(scores, yvalid)}')

In [None]:
# The label names the layer actually trained in this section: Bidirectional(LSTM).
scores_model.append({'Model':'Bidirectional LSTM', 'AUC_Score': roc_auc(scores, yvalid)})

In [None]:
# Tabulate the collected scores, highest AUC first.
results = pd.DataFrame(scores_model).sort_values(by='AUC_Score', ascending=False)

In [None]:
# Render the table with a blue colour gradient applied to the cell backgrounds.
results.style.background_gradient(cmap='Blues')

In [None]:
# Bare reference to the Figure class: nothing is constructed, the cell only echoes the class.
go.Figure

In [None]:
# Funnel-area chart with one segment per model, sized by its AUC score.
fig = go.Figure(
    data=go.Funnelarea(
        values=results.AUC_Score,
        text=results.Model,
        title='Funnel-Chart of Sentiment Distribution by Model',
    )
)
fig.show()

Seq2Seq Model Architecture:
This part will be covered using other tutorials.

Transformers

BERT

In [None]:
#Loading Dependencies
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Installs the package behind the `kaggledatasets` import in a later cell.
!pip install KaggleDatasets

In [None]:
# Hugging Face transformers supplies the DistilBERT tokenizer and model used below.
!pip install transformers

In [None]:
import kaggledatasets
import transformers

In [None]:
from tokenizers import BertWordPieceTokenizer

In [None]:
# Load the full training set, the validation and test splits, and the sample
# submission from Google Drive (in that order).
train1, valid, test, sub = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/jigsaw-toxic-comment-train.csv',
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/validation.csv',
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/test.csv',
        '/content/drive/MyDrive/Datasets/Zero To Transformers Series/sample_submission.csv',
    )
]

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
  '''
    Convert texts into fixed-length rows of token ids for BERT input.

    The tokenizer is configured to truncate and pad every encoding to
    `maxlen`, then fed `chunk_size` texts at a time via `encode_batch`.
    `texts` must support slicing and `.tolist()` on the slice.
    Returns a NumPy array built from the collected id lists.
  '''
  tokenizer.enable_truncation(max_length=maxlen)
  tokenizer.enable_padding(length=maxlen)

  encoded_rows = []
  total = len(texts)
  for start in tqdm(range(0, total, chunk_size)):
    batch = texts[start:start+chunk_size].tolist()
    encoded_rows += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

  return np.array(encoded_rows)

In [None]:
# Training configuration for the transformer section
EPOCHS = 3
MAX_LEN = 192
BATCH_SIZE = 16*strategy.num_replicas_in_sync  # 16 examples per replica

# Handle passed to tf.data `prefetch` so the buffer size is tuned automatically
AUTO = tf.data.experimental.AUTOTUNE

Tokenization

In [None]:
# Load the pretrained multilingual (cased) DistilBERT tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save it into the working directory; the next statement expects a 'vocab.txt' there
tokenizer.save_pretrained('.')
# Rebuild it with the `tokenizers` library (lowercase=False keeps casing); this is the object fast_encode drives
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Echo the tokenizer as the cell output
fast_tokenizer

In [None]:
x_train = fast_encode( train1.comment_text.astype(str), fast_tokenizer, maxlen=max_len)

In [None]:
x_valid = fast_encode( valid.comment_text.astype(str), fast_tokenizer, maxlen=max_len)

In [None]:
x_test = fast_encode( test.comment_text.astype(str), fast_tokenizer, maxlen=max_len)

In [None]:
# Binary toxic labels for the training and validation splits.
y_train, y_valid = train1['toxic'].values, valid['toxic'].values

In [None]:
# Training pipeline: repeats indefinitely, shuffles with a 2048-element buffer,
# then batches and prefetches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batched, cached, prefetched (no shuffling).
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
def build_model(transformer, max_len=512):
  """
    Build and compile a binary classifier on top of a transformer encoder.

    Args:
      transformer: callable encoder layer/model; element 0 of its output is
        used as the per-token sequence output.
      max_len: number of token ids in each input example.

    Returns:
      A compiled Keras Model mapping token ids to one sigmoid probability.
  """
  # tf.int32 is the integer dtype; `tf.int_32` is not a TensorFlow attribute.
  input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
  sequence_output = transformer(input_word_ids)[0]
  # Take the hidden state at position 0 of every sequence as its summary vector.
  cls_token = sequence_output[:, 0, :]
  out = Dense(1, activation='sigmoid')(cls_token)
  model = Model(inputs=input_word_ids, outputs=out)
  model.compile(optimizer=Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])
  return model


Start Training

In [None]:
%%time
with strategy.scope():
  transformer_layer = (
      transformers.TFDistilBertModel
      .from_pretrained('distilbert-base-multilingual-cased')
  )
  model = build_model(transformer_layer, max_len )

In [None]:
# steps_per_epoch counts batches, not samples. train_dataset repeats forever,
# so one pass over the data is (number of examples) // BATCH_SIZE steps.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch = n_steps,
    validation_data = valid_dataset,
    epochs = EPOCHS
)

In [None]:
# Continue training on the validation split. The dataset is repeated, so
# steps_per_epoch must be the number of batches in one pass, not the sample count.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch = n_steps,
    epochs = EPOCHS * 2
)

In [None]:
# Predict on the test batches and store the result in the submission's
# `toxic` column, then write the file without the index.
test_probs = model.predict(test_dataset, verbose=1)
sub['toxic'] = test_probs
sub.to_csv('submission.csv', index=False)