### Import Packages

In [None]:
#@title Imports

!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
# !pip install tensorflow-datasets --quiet
# !pip install -U tensorflow-text==2.8.2 --quiet
!pip install transformers --quiet
!pip install pydot --quiet
!pip install tensorflow_addons

[K     |████████████████████████████████| 24.2 MB 58.0 MB/s 
[K     |████████████████████████████████| 4.4 MB 12.1 MB/s 
[K     |████████████████████████████████| 101 kB 11.2 MB/s 
[K     |████████████████████████████████| 596 kB 57.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 19.4 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 22.6 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.1


In [None]:
# Import packages
import pandas as pd
import numpy as np

from tensorflow import keras
import tensorflow as tf
from keras.layers import Embedding, Input, Dense, Lambda
from keras.models import Model
import keras.backend as K
import tensorflow_addons as tfa
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

from google.colab import drive

from collections import Counter
import matplotlib.pyplot as plt
from nltk.util import ngrams

from transformers import BertTokenizer, TFBertModel, logging as hf_logging, TFDistilBertModel, DistilBertConfig, DistilBertTokenizerFast
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import random
import itertools

import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import re
import gensim
import absl.logging
from datetime import datetime
from pytz import timezone

# Notebook-wide pandas display settings, one option per call.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns
pd.set_option("display.max_colwidth", None)  # expand column width
pd.set_option("display.html.use_mathjax", False)  # disable Latex style mathjax rendering

In [None]:
# Mount Google Drive at /drive so the project files are reachable from this runtime.
drive.mount('/drive') 
# Make the notebook's folder the working directory: the data and checkpoint
# paths used by later cells are relative and are resolved against it.
%cd /drive/MyDrive/W266 Project/Colab Notebooks/Exploration
# Echo the working directory as a sanity check.
!pwd

Mounted at /drive
/drive/MyDrive/W266 Project/Colab Notebooks/Exploration
/drive/MyDrive/W266 Project/Colab Notebooks/Exploration


# Model - TFBERT + DISTILBERT

In [None]:
# Load the train / validation / test splits. Every frame is fully resampled
# (frac=1) with a fixed seed, i.e. its rows are shuffled reproducibly.
train, val, test = (
    pd.read_csv(csv_path).sample(frac=1, random_state=2)
    for csv_path in (
        '../../data/transformed/final/full_data/train_final.csv',
        '../../data/transformed/final/full_data/val_final.csv',
        '../../data/transformed/final/full_data/test_final.csv',
    )
)

In [None]:
# Sanity check: confirm that both classes (0 and 1) of `violent` show up in
# every split with the expected representation.
display(
    train['violent'].value_counts(),
    val['violent'].value_counts(),
)
test['violent'].value_counts()

0    143346
1     16225
Name: violent, dtype: int64

0    28920
1     3069
Name: violent, dtype: int64

0    28815
1     3174
Name: violent, dtype: int64

In [None]:
# Earlier alternative, kept for reference: cased BERT tokenizer.
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

# Load the pretrained fast tokenizer for 'distilbert-base-uncased'.
# NOTE: despite the `bert_` prefix, this is a DistilBERT tokenizer.
bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Number of tokens per encoded comment: longer inputs are truncated and shorter
# ones are padded, so every encoded row has exactly this length.
max_length = 128

# Tokenize the training comments. Values are cast to `str` first, exactly as the
# validation and test cells do, so a non-string entry (e.g. a NaN read from an
# empty CSV field) cannot break tokenization for this split only.
x_train = bert_tokenizer(list(train['comment_text_transf'].apply(str)),
                         max_length = max_length,
                         truncation = True,
                         padding = 'max_length',
                         return_tensors = 'tf')
# `violent` labels, in the same row order as x_train.
y_train = list(train['violent'])

In [None]:
# Encode the validation comments: each one is converted with str(), then
# padded/truncated to `max_length` tokens and returned as TensorFlow tensors.
x_val = bert_tokenizer(
    [str(comment) for comment in val['comment_text_transf']],
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)
# `violent` labels, in the same row order as x_val.
y_val = [label for label in val['violent']]

In [None]:
# Encode the test comments: each one is converted with str(), then
# padded/truncated to `max_length` tokens and returned as TensorFlow tensors.
x_test = bert_tokenizer(
    [str(comment) for comment in test['comment_text_transf']],
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='tf',
)
# `violent` labels, in the same row order as x_test.
y_test = [label for label in test['violent']]

In [None]:
# Reduce log noise: set the Hugging Face transformers, TensorFlow and absl
# loggers to error level.
hf_logging.set_verbosity_error()
tf.get_logger().setLevel('ERROR')
absl.logging.set_verbosity(absl.logging.ERROR)

def create_bert_cls_model(hidden_size=100, dropout=0.3, learning_rate=0.00005, weight_decay=0,
                          bert_trainable=True):
    """Build and compile a binary classifier on top of pretrained DistilBERT.

    The last-layer hidden state of the first ([CLS]) token is passed through
    Dense(hidden_size, relu) -> Dropout(dropout) -> Dense(100, relu)
    -> Dense(1, sigmoid).

    Args:
        hidden_size: Units in the first dense layer. The second dense layer is
            fixed at 100 units.
        dropout: Dropout rate applied between the two dense layers.
        learning_rate: Learning rate passed to the AdamW optimizer.
        weight_decay: Weight decay passed to the AdamW optimizer.
        bert_trainable: If True (the default, and the previous behaviour) the
            DistilBERT layers are fine-tuned together with the head; if False
            they are frozen and only the classification head is trained.

    Returns:
        A compiled `tf.keras.Model` that takes `[input_ids, attention_mask]`
        and outputs one sigmoid probability per input row.

    Note:
        The input length is read from the notebook-level `max_length`
        variable, which must be defined before this function is called.
    """
    # bert_model = TFBertModel.from_pretrained('bert-base-cased')

    # NOTE(review): output_hidden_states=True is requested, but only the last
    # hidden state (bert_out[0]) is read below.
    config = DistilBertConfig(dropout=0.1,
                              attention_dropout=0.1,
                              output_hidden_states=True)
    bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

    # Fine-tune (True) or freeze (False) every DistilBERT layer.
    for layer in bert_model.layers:
        layer.trainable = bert_trainable

    # Build the input layers
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    # token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    # Dictionary of inputs, keyed by the argument names the transformer expects
    bert_inputs = {'input_ids': input_ids,
                   # 'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # model output
    bert_out = bert_model(bert_inputs)

    # Use the hidden state of the first ('cls') token of the last layer as the
    # representation of the whole sequence.
    cls_token = bert_out[0][:, 0, :]

    # Hidden layers
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    hidden = tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_2')(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    # instantiate model
    classification_model = tf.keras.Model(inputs=[input_ids,
                                                  # token_type_ids,
                                                  attention_mask], outputs=[classification])

    # compile model; the output is already a probability, hence from_logits=False
    classification_model.compile(optimizer=tfa.optimizers.AdamW(learning_rate=learning_rate,
                                                                weight_decay=weight_decay),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=['Precision', 'Recall',
                                          tfa.metrics.F1Score(num_classes=1, name='f1_score',
                                                              threshold=0.5, average='weighted'),
                                          'TrueNegatives', 'TruePositives',
                                          'FalsePositives', 'FalseNegatives',
                                          'accuracy', 'AUC'
                                          ])

    return classification_model

In [None]:
# Set to True to print the model summary and architecture plot for each trial.
plot_show = False

# Hyperparameter search space: one list of candidate values per hyperparameter.
hidden_size_list = [50]
dropout_list = [0.1]
learning_rate_list = [0.00001]
weight_decay_list = [0]
batch_size_list = [32]
epochs_list = [100]

# Every combination, as tuples ordered
# (hidden_size, dropout, learning_rate, weight_decay, batch_size, epochs).
grid_list = [hidden_size_list, dropout_list, learning_rate_list,
             weight_decay_list, batch_size_list, epochs_list]
grid = list(itertools.product(*grid_list))

# Shuffle with a dedicated seeded generator so the trial order is reproducible
# across kernel restarts without touching the global `random` state.
grid_seed = 2
random.Random(grid_seed).shuffle(grid)

# Number of (shuffled) configurations to train in this run.
n_trials = 1

for i, (hidden_size, dropout, learning_rate,
        weight_decay, batch_size, epochs) in enumerate(grid[:n_trials]):
  # The timestamp makes the checkpoint folder of every run unique.
  start_time = datetime.now(timezone('America/Chicago')).strftime('%Y%m%d-%H%M%S')
  bert_cls_model = create_bert_cls_model(hidden_size=hidden_size,
                                         dropout=dropout,
                                         learning_rate=learning_rate,
                                         weight_decay=weight_decay
                                         )
  chkpt_path = f'checkpoint/model_{i+1}_{start_time}/cp.ckpt'

  print(f'''
  {i+1}. hidden_size={hidden_size} 
     dropout={dropout} 
     learning_rate={learning_rate} 
     weight_decay={weight_decay}
     batch_size={batch_size}
     epochs={epochs}
     checkpoint_path={chkpt_path} 
        ''') 

  if plot_show:
    bert_cls_model.summary()
    display(keras.utils.plot_model(bert_cls_model, show_shapes=True, dpi=90))

  # Stop once validation F1 has not improved by at least 0.01 for 5 epochs,
  # and keep only the weights of the epoch with the best validation F1.
  es = EarlyStopping(monitor='val_f1_score', mode='max', verbose=1, patience=5, min_delta=0.01)
  mc = ModelCheckpoint(chkpt_path, monitor='val_f1_score', mode='max',
                       save_best_only=True, save_weights_only=True)

  bert_cls_model_history = bert_cls_model.fit(
                                            [x_train.input_ids,
                                            #  x_train.token_type_ids,
                                             x_train.attention_mask],
                                             np.array(y_train),
                                            validation_data=(
                                            [x_val.input_ids,
                                            #  x_val.token_type_ids,
                                             x_val.attention_mask],
                                             np.array(y_val)),
                                            batch_size=batch_size,
                                            epochs=epochs,
                                            callbacks=[es, mc])

  # bert_cls_model.save(f'checkpoint/final_model{i}_{start_time}')
  # bert_cls_model.save_weights(f'checkpoint/final_weights{i}_{start_time}')

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]


  1. hidden_size=50 
     dropout=0.1 
     learning_rate=1e-05 
     weight_decay=0
     batch_size=32
     epochs=100
     checkpoint_path=checkpoint/model_1_20220721-120428/cp.ckpt 
        
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping


hidden_size=50 
     dropout=0.1 
     learning_rate=1e-05 
     weight_decay=0
     batch_size=32
     epochs=100
     checkpoint_path=checkpoint/model_3_20220720-180757/cp.ckpt

In [None]:
# Rebuild the network with the hyperparameters of the training run logged
# above, then restore the weights stored at that run's checkpoint path.
chkpt_path = 'checkpoint/model_1_20220721-120428/cp.ckpt'
bert_cls_model = create_bert_cls_model(
    hidden_size=50,
    dropout=0.1,
    learning_rate=1e-5,
    weight_decay=0,
)
bert_cls_model.load_weights(chkpt_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f3024079e10>

In [None]:
# Loss and compiled metrics of the restored model on the validation split.
score = bert_cls_model.evaluate(
    x=[x_val.input_ids, x_val.attention_mask],
    y=np.array(y_val),
)



In [None]:
# Loss and compiled metrics of the restored model on the held-out test split.
score = bert_cls_model.evaluate(
    x=[x_test.input_ids, x_test.attention_mask],
    y=np.array(y_test),
)



In [None]:
# Sigmoid outputs of the classifier for every test comment.
predictions = bert_cls_model.predict(
    x=[x_test.input_ids, x_test.attention_mask],
)

In [None]:
# Round the model outputs to hard 0/1 labels. np.asarray(...).ravel() accepts
# both the raw array returned by predict() and the already-rounded Series, so
# re-running this cell does not fail.
predictions = pd.Series(np.rint(np.asarray(predictions)).ravel(), dtype = int)

# NOTE(review): `final_test` is not defined in the cells shown here. Rows are
# paired with predictions purely by position, so it must hold exactly the rows
# that were tokenized into x_test, in the same (shuffled) order -- confirm.
assert len(final_test) == len(predictions), 'final_test and predictions differ in length'

# Original row identifiers of the test rows (the index of `final_test`).
test_ids = pd.Series(final_test.index)

# One row per test comment: the original columns with `violent` renamed to
# `actual`, plus the model's `prediction`, indexed by the original row id.
test_check = (
    final_test
    .rename(columns = {'violent': 'actual'})
    .assign(prediction = predictions.to_numpy())  # positional, not index-aligned
    .rename_axis('id')
)

In [None]:
# Keep the misclassified test rows and split them by the predicted label:
#   fp -- wrongly predicted 1 (false positives)
#   fn -- wrongly predicted 0 (false negatives)
wrong_predictions = test_check.loc[test_check['actual'].ne(test_check['prediction'])]
fp = wrong_predictions.loc[wrong_predictions['prediction'].eq(1)]
fn = wrong_predictions.loc[wrong_predictions['prediction'].eq(0)]

In [None]:
# Number of false positives on the test set.
len(fp)

106

In [None]:
# Preview the first few false positives for manual error analysis.
fp.head()

Unnamed: 0_level_0,actual,comment_text_transf,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47885,0,"Is eBaum's World Sucks still up? Everytime I type in its old and new URL, it can't find the website. Is this website still up, temporarily down, or closed?",1
15250,0,"fjf jfdjj dhnd djjndhf xdjhnde djujn djj rdejb dfuufbb fug hidbbmk duuhbh djj djudhhdfiivf fuhfvhfvnvhjfv v uivf fvbhes ddwppsdfjiodkzh er gjhhng hcvhn ifhh l; . vbhhfyjffgnhfdu mf bfku fujfv vjgf vgjghfgjh gj jkf 8rfiyuuerhkjsdhfuhfviuhfckjsnjghxvjxhf s fvhue s nvcjkdhdf jhfjfnxcv jk hdk h fjfhfvfj jjf ihgs,zhnfghjf i nfyufgohuhbvhjb vfihbfr bfgbg gfujnb ffcu hfrug f h re v bfvjjf hr g fuhhc jfhvnfc vjbv gtb vhjjv hbbvcjhjsdhugbgbc n cjg vhdfbcg vfhjbcfhgf",1
46651,0,"Edit War!!! You don't belong to His Holiness Gohar Shahi, you dirty soul belongs to younas harami & Let me tell you that you are not working for the cause of His Holiness, but you are a group of opportunists. Don't mess up with us, otherwise we would open your reality to all.",1
32715,0,copy and save to your comp for future vand11sm to atlantic records. YOU SUCK! copy and save to your comp for future vand11sm to atlantic records.,1
61113,0,"If I am unblocked, I will continue editing as usual? A lack of remorse is not going to get you an unblock. How about admitting that your repeated use of the word nigger was unneeded and unacceptable in a community where many people find it very offensive. I've seen people chastised for a lot less, and to be frank, you should have seen it coming repeatedly using the word like you did. An unblock request should show that you are not going to be repeating the same behavior that got you blocked in the first place, so maybe you should clearly state that, not that you are going to be continue on without modifying your behavior. Regards, —",1
