### Import Packages

In [1]:
#@title Imports

!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
# !pip install tensorflow-datasets --quiet
# !pip install -U tensorflow-text==2.8.2 --quiet
!pip install transformers --quiet
!pip install pydot --quiet
!pip install tensorflow_addons

[K     |████████████████████████████████| 24.2 MB 3.2 MB/s 
[K     |████████████████████████████████| 4.7 MB 5.4 MB/s 
[K     |████████████████████████████████| 101 kB 9.3 MB/s 
[K     |████████████████████████████████| 596 kB 54.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 38.5 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.0 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.1


In [2]:
# Import packages
import pandas as pd
import numpy as np

from tensorflow import keras
import tensorflow as tf
from keras.layers import Embedding, Input, Dense, Lambda
from keras.models import Model
import keras.backend as K
import tensorflow_addons as tfa
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

from google.colab import drive

from collections import Counter
import matplotlib.pyplot as plt
from nltk.util import ngrams

from transformers import BertTokenizer, TFBertModel, logging as hf_logging, TFDistilBertModel, DistilBertConfig, DistilBertTokenizerFast
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import random
import itertools

import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import re
import gensim
import absl.logging
from datetime import datetime
from pytz import timezone

pd.set_option("display.max_rows", None, # display all rows
              "display.max_columns", None, # display all columns
              "display.max_colwidth", None, # expand column width
              "display.html.use_mathjax", False) # disable Latex style mathjax rendering

In [3]:
drive.mount('/drive') 
%cd /drive/MyDrive/W266 Project/Colab Notebooks/Exploration
!pwd

Mounted at /drive
/drive/.shortcut-targets-by-id/1p1bDkEjmNKPzX456WZWBr8qtvPr6Pt5m/W266 Project/Colab Notebooks/Exploration
/drive/.shortcut-targets-by-id/1p1bDkEjmNKPzX456WZWBr8qtvPr6Pt5m/W266 Project/Colab Notebooks/Exploration


# Model - TFBERT + DISTILBERT

In [4]:
final_train = pd.read_csv('../../data/transformed_censor_profanity/train_profanity_censor_hashtag.csv').sample(frac = 1) # shuffle rows
final_test = pd.read_csv('../../data/transformed_censor_profanity/test_profanity_censor_hashtag.csv').sample(frac = 1) # shuffle rows

In [5]:
final_test.columns

Index(['comment_text', 'violent', 'comment_text_with_hashtag',
       'comment_text_wo_double_quotes'],
      dtype='object')

In [6]:
# Reduce size of data for quick testing 
small_train = final_train.sample(n = 8000, random_state = 2)
small_test = final_test.sample(n = 2000, random_state = 2)

# Simulate Undersampling
# small_train_1 = final_train[final_train['violent'] == 1].sample(n = 5000, random_state = 2)
# small_train_0 = final_train[final_train['violent'] == 0].sample(n = 3000, random_state = 2)
# small_train = pd.concat([small_train_1, small_train_0], ignore_index=True, axis=0)

# repoint to avoid having to update code below
final_train = small_train
final_test = small_test

In [7]:
# quick check to make sure we see expected representation of 0 and 1
display(final_train['violent'].value_counts())
final_test['violent'].value_counts()

0    7125
1     875
Name: violent, dtype: int64

0    1808
1     192
Name: violent, dtype: int64

In [8]:
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
final_test.head()

Unnamed: 0,comment_text,violent,comment_text_with_hashtag,comment_text_wo_double_quotes
2323,****** DRM no one can stand the ******** especially Ubisoft put into their software.,1,# DRM no one can stand the # especially Ubisoft put into their software.,# DRM no one can stand the # especially Ubisoft put into their software.
4317,"For the record, you can eat **** too. I can't understand for the life of me why that ****** couldn't just leave me alone.",1,"For the record, you can eat # too. I can't understand for the life of me why that # couldn't just leave me alone.","For the record, you can eat # too. I can't understand for the life of me why that # couldn't just leave me alone."
2565,Lakshmi the prostitute,1,Lakshmi the prostitute,Lakshmi the prostitute
55079,"This is your last warning. You will be blocked from editing the next time you vandalize a page, as you did with this edit to Platypus. –",0,"This is your last warning. You will be blocked from editing the next time you vandalize a page, as you did with this edit to Platypus. –","This is your last warning. You will be blocked from editing the next time you vandalize a page, as you did with this edit to Platypus. –"
50292,"ROFL, Ariel, you watchin my talk page? P I love it! ~",0,"ROFL, Ariel, you watchin my talk page? P I love it! ~","ROFL, Ariel, you watchin my talk page? P I love it! ~"


In [10]:
max_length = 128

x_train = bert_tokenizer(list(final_train['comment_text_wo_double_quotes']),
                         max_length = max_length,
                         truncation = True,
                         padding = 'max_length',
                         return_tensors = 'tf')
y_train = list(final_train['violent'])

In [11]:
x_test = bert_tokenizer(list(final_test['comment_text_wo_double_quotes'].apply(str)),
                         max_length = max_length,
                         truncation = True,
                         padding = 'max_length',
                         return_tensors = 'tf')
y_test = list(final_test['violent'])

In [35]:
hf_logging.set_verbosity_error()
tf.get_logger().setLevel('ERROR')
absl.logging.set_verbosity(absl.logging.ERROR)

def create_bert_cls_model(hidden_size=100, dropout=0.3, learning_rate=0.00005, weight_decay=0):
    # bert_model = TFBertModel.from_pretrained('bert-base-cased')

    config = DistilBertConfig(dropout=0.1, 
                          attention_dropout=0.1, 
                          output_hidden_states=True)
    bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
    
    # Make DistilBERT layers untrainable
    for layer in bert_model.layers:
      layer.trainable = True

    # Build the input layers
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    # token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    # Dictionary of inputs
    bert_inputs = {'input_ids': input_ids,
                  #  'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask} 


    # model output
    bert_out = bert_model(bert_inputs) 

    # Instead of pooled token, using the 'cls' token 
    # pooled_token = bert_out[1]
    # cls_token = bert_out[0][:,0] # First layer, 0th column of each token set
    pooled_token = bert_out[0][:, 0, :]

    # print(pooled_token)
    # Hidden layers
    hidden = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64),input_shape=(0,768))(pooled_token) 
    hidden = tf.keras.layers.Dense(64, activation='relu') (hidden)
    classification = tf.keras.layers.Dense(1)(hidden)

    # # # Hidden layers
    # # hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooled_token)
    # # hidden = tf.keras.layers.Dropout(dropout)(hidden) 
    # # hidden = tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_2')(hidden)
 
    # # classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    # # instantiate model
    # classification_model = tf.keras.Model(inputs=[input_ids, 
    #                                               # token_type_ids, 
    #                                               attention_mask], outputs=[classification])

    # # compile model
    # classification_model.compile(optimizer=tfa.optimizers.AdamW(learning_rate = learning_rate,
    #                                                             weight_decay = weight_decay),
    #                         loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
    #                         metrics=['Precision', 'Recall',
    #                                  tfa.metrics.F1Score(num_classes=1, name='f1_score',
    #                                  threshold=0.5, average='weighted'),
    #                                  'TrueNegatives', 'TruePositives', 
    #                                  'FalsePositives', 'FalseNegatives',
    #                                  'accuracy', 'AUC'
    #                                  ])
    
   
    ### END YOUR CODE
    return pooled_token
    # return classification_model

In [36]:
model = create_bert_cls_model()
# model.summary()

ValueError: ignored

In [33]:
model.shape

TensorShape([None, 768])

In [None]:
plot_show = False
# hidden_size_list = [150, 100]
# dropout_list = [0.1]
# learning_rate_list = [0.00002]
# weight_decay_list = [0]
# batch_size_list = [32]
# epochs_list = [10, 15, 50]

hidden_size_list = [50]
dropout_list = [0.1]
learning_rate_list = [0.00001]
weight_decay_list = [0]
batch_size_list = [32]
epochs_list = [100]

grid_list = [hidden_size_list, dropout_list, learning_rate_list, 
             weight_decay_list, batch_size_list, epochs_list]
grid = list(itertools.product(*grid_list))
random.shuffle(grid)

# for i in range(3):
for i in range(1):
  start_time = datetime.now(timezone('America/Chicago')).strftime('%Y%m%d-%H%M%S')
  bert_cls_model = create_bert_cls_model(hidden_size=grid[i][0],
                                         dropout=grid[i][1],
                                         learning_rate=grid[i][2],
                                         weight_decay=grid[i][3] 
                                         )
  chkpt_path = f'checkpoint/model_{i+1}_{start_time}/cp.ckpt'
  chkpt_dir = os.path.dirname(chkpt_path)
  
  print(f'''
  {i+1}. hidden_size={grid[i][0]} 
     dropout={grid[i][1]} 
     learning_rate={grid[i][2]} 
     weight_decay={grid[i][3]}
     batch_size={grid[i][4]}
     epochs={grid[i][5]}
     checkpoint_path={chkpt_path} 
        ''') 
  
  if plot_show == True:
    bert_cls_model.summary()
    display(keras.utils.plot_model(bert_cls_model, show_shapes=True, dpi=90))
  
  es = EarlyStopping(monitor='val_f1_score', mode='max', verbose=1, patience=5, min_delta=0.02)
  mc = ModelCheckpoint(chkpt_path, monitor='val_f1_score', mode='max', 
                       save_best_only=True, save_weights_only=True)
  
  bert_cls_model_history = bert_cls_model.fit(
                                            [x_train.input_ids, 
                                            #  x_train.token_type_ids, 
                                             x_train.attention_mask], 
                                             np.array(y_train),   
                                            validation_data=(
                                            [x_test.input_ids, 
                                            #  x_test.token_type_ids, 
                                             x_test.attention_mask], 
                                             np.array(y_test)),    
                                            batch_size=grid[i][4], 
                                            epochs=grid[i][5],
                                            callbacks=[es, mc])

  # bert_cls_model.save(f'checkpoint/final_model{i}_{start_time}')
  # bert_cls_model.save_weights(f'checkpoint/final_weights{i}_{start_time}')

ValueError: ignored

In [None]:
# chkpt_path = 'checkpoint/model1_20220719-213219/cp.ckpt'
bert_cls_model = create_bert_cls_model(hidden_size=hidden_size_list[0], #450,
                                         dropout=dropout_list[0],
                                         learning_rate=learning_rate_list[0],
                                         weight_decay=weight_decay_list[0]
                                         )

bert_cls_model.load_weights(chkpt_path) 

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff2999bf7d0>

In [None]:
score = bert_cls_model.evaluate([x_test.input_ids, 
                                #  x_test.token_type_ids, 
                                 x_test.attention_mask], 
                                 np.array(y_test))



In [None]:
predictions = bert_cls_model.predict([x_test.input_ids, 
                                      # x_test.token_type_ids, 
                                      x_test.attention_mask])

In [None]:
predictions = pd.Series(np.rint(predictions).flatten(), dtype = int)
test_ids = pd.Series(final_test.index)
test_check = final_test.copy()
test_check = test_check.reset_index(drop = True)
test_check = pd.concat([test_check, predictions, test_ids], axis = 1)
test_check.rename(columns = {'violent': 'actual', 0: 'prediction', 1: 'id'}, inplace = True)
test_check.set_index('id', inplace = True)

In [None]:
wrong_predictions = test_check[test_check['actual'] != test_check['prediction']]
fp = wrong_predictions[wrong_predictions['prediction'] == 1]
fn = wrong_predictions[wrong_predictions['prediction'] == 0]

In [None]:
len(fp)

93

In [None]:
fp.head()

Unnamed: 0_level_0,comment_text,actual,comment_text_with_hashtag,comment_text_wo_double_quotes,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37772,"This type of censorship makes me sick Vote to keep, show these hypocrites what's what, tolerance? ha, only when it's good for them",0,"This type of censorship makes me sick Vote to keep, show these hypocrites what's what, tolerance? ha, only when it's good for them","This type of censorship makes me sick Vote to keep, show these hypocrites what's what, tolerance? ha, only when it's good for them",1
54382,******** its still off,0,# its still off,# its still off,1
59233,WPTROLL. Crawl back into your hole.,0,WPTROLL. Crawl back into your hole.,WPTROLL. Crawl back into your hole.,1
21803,"BYE , I'M NEVER COMING BACK!!! AND YOU NINNYS CAN SQUIM AND ***** ABOUT ANYTHING YOU WANT!!",0,"BYE , I'M NEVER COMING BACK!!! AND YOU NINNYS CAN SQUIM AND # ABOUT ANYTHING YOU WANT!!","BYE , I'M NEVER COMING BACK!!! AND YOU NINNYS CAN SQUIM AND # ABOUT ANYTHING YOU WANT!!",1
15384,"Or we could just grow the **** up and go nude around electronics. Seriously, this 'clothing' thing has been trouble from day 1.",0,"Or we could just grow the # up and go nude around electronics. Seriously, this 'clothing' thing has been trouble from day 1.","Or we could just grow the # up and go nude around electronics. Seriously, this 'clothing' thing has been trouble from day 1.",1
