### Import Packages

In [None]:
#@title Imports

!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
# !pip install tensorflow-datasets --quiet
# !pip install -U tensorflow-text==2.8.2 --quiet
!pip install transformers --quiet
!pip install pydot --quiet
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import packages
import pandas as pd
import numpy as np

from tensorflow import keras
import tensorflow as tf
from keras.layers import Embedding, Input, Dense, Lambda
from keras.models import Model
import keras.backend as K
import tensorflow_addons as tfa
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

from google.colab import drive

from collections import Counter
import matplotlib.pyplot as plt
from nltk.util import ngrams

from transformers import BertTokenizer, TFBertModel, logging as hf_logging, TFDistilBertModel, DistilBertConfig, DistilBertTokenizerFast
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import random
import itertools

import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import re
import gensim
import absl.logging
from datetime import datetime
from pytz import timezone

# Notebook-wide pandas display settings, applied one option per call.
pd.set_option("display.max_rows", None)           # display all rows
pd.set_option("display.max_columns", None)        # display all columns
pd.set_option("display.max_colwidth", None)       # expand column width
pd.set_option("display.html.use_mathjax", False)  # disable LaTeX-style MathJax rendering

In [None]:
# Mount Google Drive at /drive so the project folder is reachable from Colab.
drive.mount('/drive') 
# Move into the notebook folder; the data is later read through paths
# relative to this directory (e.g. '../../data/...').
%cd /drive/MyDrive/W266 Project/Colab Notebooks/Exploration
# Print the working directory as a sanity check.
!pwd

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
/drive/.shortcut-targets-by-id/1p1bDkEjmNKPzX456WZWBr8qtvPr6Pt5m/W266 Project/Colab Notebooks/Exploration
/drive/.shortcut-targets-by-id/1p1bDkEjmNKPzX456WZWBr8qtvPr6Pt5m/W266 Project/Colab Notebooks/Exploration


# Model - TFBERT + DISTILBERT + BERT W/ CNN

In [None]:
# Load the final train/test splits and shuffle their rows.
# The shuffle is seeded so that a fresh "Restart & Run All" sees the rows in
# the same order; otherwise any later seeded subsampling of these frames
# would still pick different rows on every run.
final_train = pd.read_csv('../../data/transformed/final/train.csv').sample(frac = 1, random_state = 2) # shuffle rows
final_test = pd.read_csv('../../data/transformed/final/test.csv').sample(frac = 1, random_state = 2) # shuffle rows

In [None]:
# Work on a fixed-size random subset of each split so experiments run quickly.
small_train, small_test = (
    final_train.sample(n=8000, random_state=2),
    final_test.sample(n=2000, random_state=2),
)

# Alternative kept for reference: simulate undersampling of the majority class.
# small_train_1 = final_train[final_train['violent'] == 1].sample(n = 5000, random_state = 2)
# small_train_0 = final_train[final_train['violent'] == 0].sample(n = 3000, random_state = 2)
# small_train = pd.concat([small_train_1, small_train_0], ignore_index=True, axis=0)

# Rebind the original names so the cells below run unchanged on the subsets.
final_train, final_test = small_train, small_test

In [None]:
# Class-balance check: count the 0/1 values of `violent` in each split.
train_label_counts = final_train['violent'].value_counts()
test_label_counts = final_test['violent'].value_counts()

display(train_label_counts)
test_label_counts

0    7181
1     819
Name: violent, dtype: int64

0    1794
1     206
Name: violent, dtype: int64

In [None]:
# Tokenizer for the cased BERT checkpoint; lower-casing is explicitly off.
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

# DistilBERT alternative, kept for reference:
# bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Every comment is truncated/padded to this many tokens.
max_length = 128

# Tokenize the training comments into fixed-length TensorFlow tensors.
# `.apply(str)` mirrors the test-set tokenization: it coerces any non-string
# entry (e.g. a NaN from an empty CSV field, which becomes 'nan') to text so
# the tokenizer does not reject the whole batch.
x_train = bert_tokenizer(list(final_train['comment_text_transf'].apply(str)),
                         max_length = max_length,
                         truncation = True,
                         padding = 'max_length',
                         return_tensors = 'tf')
# Binary target labels, in the same row order as x_train.
y_train = list(final_train['violent'])

In [None]:
# Tokenize the test comments with the same settings as the training set.
# Each entry is passed through str() first so non-string values are accepted.
test_texts = list(final_test['comment_text_transf'].apply(str))
x_test = bert_tokenizer(
    test_texts,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

# Binary target labels, in the same row order as x_test.
y_test = list(final_test['violent'])

In [None]:
# Limit absl, TensorFlow and Hugging Face logging to errors only.
absl.logging.set_verbosity(absl.logging.ERROR)

tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

hf_logging.set_verbosity_error()

def create_bert_cnn_model(hidden_size = 50,
                          learning_rate = 0.00001,
                          num_filters = [100, 100, 50, 25],
                          kernel_sizes = [3, 5, 10, 20],
                          dropout = 0.1):
    """Build and compile a BERT encoder topped with a multi-kernel CNN classifier.

    The model takes three integer inputs of length ``max_length`` (read from
    the notebook's global scope): input ids, token type ids and attention
    mask. All three are passed to BERT, so padded positions are masked out of
    self-attention. One Conv1D + global-max-pool branch is applied per
    ``(kernel_size, filters)`` pair; the pooled branches are concatenated,
    passed through dropout and a ReLU hidden layer, and reduced to a single
    sigmoid unit.

    Args:
        hidden_size: Units in the dense layer after the pooled CNN features.
        learning_rate: Learning rate for the AdamW optimizer.
        num_filters: Filters for each convolution branch. Not mutated.
        kernel_sizes: Kernel width for each convolution branch; must have the
            same length as ``num_filters``. Not mutated.
        dropout: Dropout rate applied to the concatenated CNN features.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are
        ``[input_ids, token_type_ids, attention_mask]`` and whose output is
        one sigmoid probability per example.

    Raises:
        ValueError: If ``num_filters`` and ``kernel_sizes`` differ in length
            (``zip`` would otherwise silently drop the extra branches).
    """
    if len(num_filters) != len(kernel_sizes):
        raise ValueError(
            'num_filters and kernel_sizes must have the same length, got '
            f'{len(num_filters)} and {len(kernel_sizes)}')

    bert_model = TFBertModel.from_pretrained('bert-base-cased')

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    # Dictionary of inputs, keyed by the argument names BERT expects.
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Feed all three tensors. Passing only `input_ids` would leave the
    # attention mask unused and let BERT attend to padding tokens.
    bert_out = bert_model(bert_inputs)

    # Element 0 of the BERT output holds the per-token hidden states
    # (per the Hugging Face docs). The slice drops the first and last
    # sequence positions before the convolutions.
    token_states = bert_out[0][:, 1:-1]

    # One convolution branch per kernel size, each max-pooled over the sequence.
    pooled_branches = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        branch = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(token_states)
        branch = tf.keras.layers.GlobalMaxPooling1D()(branch)
        pooled_branches.append(branch)

    h = tf.keras.layers.concatenate(pooled_branches, axis=1)
    h = tf.keras.layers.Dropout(rate=dropout)(h)

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(h)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    # Instantiate the model; the input order must match the list passed to
    # fit / evaluate / predict.
    classification_model = tf.keras.Model(inputs=[input_ids,
                                                  token_type_ids,
                                                  attention_mask], outputs=[classification])

    # Compile with AdamW (weight decay disabled) and binary cross-entropy on
    # probabilities, since the output layer already applies a sigmoid.
    classification_model.compile(optimizer=tfa.optimizers.AdamW(learning_rate = learning_rate,
                                                                weight_decay = 0),
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics=['Precision', 'Recall',
                                     tfa.metrics.F1Score(num_classes=1, name='f1_score',
                                     threshold=0.5, average='weighted'),
                                     'TrueNegatives', 'TruePositives',
                                     'FalsePositives', 'FalseNegatives',
                                     'accuracy', 'AUC'
                                     ])

    return classification_model

In [None]:
# Set to True to print the model summary and architecture plot for each trial.
plot_show = False

# Candidate values for each hyperparameter of the random search.
hidden_size_list = [50, 75]
dropout_list = [0.1]
learning_rate_list = [1e-5, 2e-5]
num_filters_list = [[100, 100, 50, 25], [200, 200, 100, 50]]
kernel_sizes_list = [[3, 5, 10, 20], [5, 10, 15, 30]]
batch_size_list = [16, 32]
epochs_list = [75, 100]

# The order of the axes here fixes the position of each value inside every
# tuple of `grid`.
grid_list = [
    hidden_size_list,
    dropout_list,
    learning_rate_list,
    num_filters_list,
    kernel_sizes_list,
    batch_size_list,
    epochs_list,
]

# Full Cartesian product of all candidates, visited in random order.
grid = [combo for combo in itertools.product(*grid_list)]
random.shuffle(grid)

# Random search: train the first few configurations of the shuffled grid.
# Each tuple in `grid` is ordered as
# (hidden_size, dropout, learning_rate, num_filters, kernel_sizes, batch_size, epochs).
#
# NOTE(review): validation_data below is the test split, so early stopping and
# checkpoint selection are tuned on the same data later used for evaluation.
# Consider holding out a validation split from the training data instead.
n_trials = min(3, len(grid))  # never index past the end of a small grid

for i, (hidden_size, dropout, learning_rate,
        num_filters, kernel_sizes, batch_size, epochs) in enumerate(grid[:n_trials]):
  # Timestamp makes each trial's checkpoint directory unique.
  start_time = datetime.now(timezone('America/Chicago')).strftime('%Y%m%d-%H%M%S')
  bert_cnn_model = create_bert_cnn_model(hidden_size=hidden_size,
                                         dropout=dropout,
                                         learning_rate=learning_rate,
                                         num_filters=num_filters,
                                         kernel_sizes=kernel_sizes
                                         )
  chkpt_path = f'checkpoint/bertCNNmodel_{i+1}_{start_time}/cp.ckpt'
  # Not used in this cell; kept in case later cells reference it.
  chkpt_dir = os.path.dirname(chkpt_path)

  # Log the configuration so each training log can be matched to its settings.
  print(f'''
  {i+1}. hidden_size={hidden_size} 
     dropout={dropout} 
     learning_rate={learning_rate} 
     num_filters={num_filters}
     kernel_sizes={kernel_sizes}
     batch_size={batch_size}
     epochs={epochs}
     checkpoint_path={chkpt_path} 
        ''')
  if plot_show:
    bert_cnn_model.summary()
    display(keras.utils.plot_model(bert_cnn_model, show_shapes=True, dpi=90))

  # Stop when validation F1 has not improved by at least 0.02 for 5 epochs,
  # and keep only the weights of the best epoch by validation F1.
  es = EarlyStopping(monitor='val_f1_score', mode='max', verbose=1, patience=5, min_delta=0.02)
  mc = ModelCheckpoint(chkpt_path, monitor='val_f1_score', mode='max',
                        save_best_only=True, save_weights_only=True)

  # Input order matches the model's inputs:
  # [input_ids, token_type_ids, attention_mask].
  bert_cnn_model_history = bert_cnn_model.fit(
                                              [x_train.input_ids,
                                              x_train.token_type_ids,
                                              x_train.attention_mask],
                                              np.array(y_train),
                                              validation_data=(
                                              [x_test.input_ids,
                                              x_test.token_type_ids,
                                              x_test.attention_mask],
                                              np.array(y_test)),
                                              batch_size=batch_size,
                                              epochs=epochs,
                                              callbacks=[es, mc])


  1. hidden_size=50 
     dropout=0.1 
     learning_rate=1e-05 
     num_filters=[200, 200, 100, 50]
     kernel_sizes=[5, 10, 15, 30]
     batch_size=32
     epochs=100
     checkpoint_path=checkpoint/bertCNNmodel_1_20220720-223539/cp.ckpt 
        
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping

  2. hidden_size=50 
     dropout=0.1 
     learning_rate=2e-05 
     num_filters=[200, 200, 100, 50]
     kernel_sizes=[5, 10, 15, 30]
     batch_size=32
     epochs=100
     checkpoint_path=checkpoint/bertCNNmodel_2_20220720-230741/cp.ckpt 
        
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping

  3. hidden_size=50 
     dropout=0.1 
     learning_rate=1e-05 
     num_filters=[200, 200, 100, 50]
     kernel_sizes=[5, 10, 15, 30]
     batch_size=16
     epochs=100
     checkpoint_path=checkpoint/bertCNNmodel_3_20220720-233211/cp.ckpt 
        
Epoch 1/100
Epoch 2/100
E

In [None]:
# Rebuild the network and restore trained weights from a saved checkpoint.
# NOTE(review): the hyperparameters presumably need to match the run that
# wrote this checkpoint - confirm before relying on the restored model.
chkpt_path = 'checkpoint/model_3_20220720-180757/cp.ckpt'

restore_hparams = dict(
    hidden_size=50,
    dropout=0.1,
    learning_rate=0.00001,
    num_filters=[100, 100, 50, 25],
    kernel_sizes=[3, 5, 10, 20],
)
bert_cnn_model = create_bert_cnn_model(**restore_hparams)
bert_cnn_model.load_weights(chkpt_path)

SyntaxError: ignored

In [None]:
# Evaluate the model on the tokenized test set; inputs are given in the
# order [input_ids, token_type_ids, attention_mask].
test_inputs = [x_test.input_ids, x_test.token_type_ids, x_test.attention_mask]
test_labels = np.array(y_test)
score = bert_cnn_model.evaluate(test_inputs, test_labels)

In [None]:
# Run the model over the tokenized test set; inputs are given in the order
# [input_ids, token_type_ids, attention_mask].
prediction_inputs = [x_test.input_ids, x_test.token_type_ids, x_test.attention_mask]
predictions = bert_cnn_model.predict(prediction_inputs)

In [None]:
# Round the model outputs to hard 0/1 labels.
# np.asarray(...).ravel() accepts both the raw 2-D array returned by
# predict() and the Series this cell itself produces, so the cell can be
# re-run without failing on `.flatten()`.
predictions = pd.Series(np.rint(np.asarray(predictions)).ravel(), dtype = int)

# Attach the predictions to the test rows for error analysis:
#   - 'violent' is renamed to 'actual'
#   - 'prediction' is added by position (row k of final_test gets prediction k);
#     .to_numpy() prevents pandas from aligning on the shuffled row labels
#   - the original row labels are kept as the index, named 'id'
test_check = (
    final_test
    .rename(columns = {'violent': 'actual'})
    .assign(prediction = predictions.to_numpy())
    .rename_axis('id')
)

In [None]:
# Rows where the predicted label disagrees with the true label.
is_wrong = test_check['actual'].ne(test_check['prediction'])
wrong_predictions = test_check.loc[is_wrong]

# Among the errors: predicted 1 are false positives, predicted 0 are false negatives.
predicted_label = wrong_predictions['prediction']
fp = wrong_predictions.loc[predicted_label.eq(1)]
fn = wrong_predictions.loc[predicted_label.eq(0)]

In [None]:
# Number of false-positive rows.
len(fp.index)

In [None]:
# Preview the first five false-positive rows.
fp.head(5)