# Install Dependencies

In [None]:
!pip install transformers
!pip install boto3

# Load Dependencies

In [None]:
import os
import boto3
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

# Load and View Dataset

In [None]:
# No credentials are written in the notebook. boto3 resolves them through its
# standard chain: the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared ~/.aws/credentials file, or an attached IAM role.
# Any key pair that has ever been stored in a notebook must be considered
# leaked and rotated in the AWS console.
s3 = boto3.resource(
    service_name='s3',
    # Region can be overridden from the environment; defaults to us-east-1.
    region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-east-1'),
)

In [None]:
# Request both CSV objects from the same bucket (test first, then train).
# The 'Body' entry of each response is what a later cell hands to pandas.
test_obj, train_obj = (
    s3.Bucket('test-rohith-1').Object(csv_key).get()
    for csv_key in ('test.csv', 'train.csv')
)

In [None]:
train_obj

In [None]:
# Parse each S3 response body into a DataFrame (train first, then test).
train, test = (
    pd.read_csv(s3_response['Body'])
    for s3_response in (train_obj, test_obj)
)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
# Column names used throughout the notebook.
ID = 'id'
DATA_COLUMN = 'comment_text'  # the text that is passed to the tokenizer
# Target columns; the prediction cells pair these names positionally with the
# model's output probabilities.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Print how often each value occurs in every label column (class balance).
for label in LABEL_COLUMNS:
    per_value_counts = train[label].value_counts()
    print(per_value_counts)

# Jump to load model from AWS Cell if needed!

# Initialize BERT Tokenizer

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things.

- Lowercase our text (if we're using a BERT lowercase model)
- Tokenize it (i.e. "i love cornetto" -> ["i", "love", "cornetto"])
- Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
- Map our words to indexes using a vocab file that BERT provides
- Add special "CLS" and "SEP" tokens (see the BERT paper)
- Append "index" and "segment" tokens to each input (see the BERT paper)

The BERT model we're going to use expects lowercase data, since its vocab file contains only lowercased words. Behind the scenes, BERT uses the WordPiece technique.

WordPiece is a technique to segment words into subword-level in NLP tasks. The vocabulary is initialized with all the individual characters in the language, and then the most frequent/likely combinations of the symbols in the vocabulary are iteratively added to the vocabulary.

Consider the words 'walking' and 'walked'. They get segmented as walk ##ing and walk ##ed; notice that both now have the piece walk in common, which will occur much more frequently during training, so the model can learn more about it.

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint, so the
# text is split and indexed with that checkpoint's own (lowercased) vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Example

In [None]:
train[DATA_COLUMN].iloc[0]

In [None]:
# Encode only the first training comment to inspect what the tokenizer returns.
example_text = train[DATA_COLUMN].iloc[0]
token = tokenizer.encode_plus(
    example_text,
    add_special_tokens=True,
    truncation=True,
    max_length=256,
    return_tensors='tf',
)

token

#### Generate Training Data

In [None]:
# Pre-allocate two independent zero-filled (n_comments, 256) arrays; the
# tokenization pass below overwrites them row by row.
X_input_ids, X_attn_masks = (np.zeros((len(train), 256)) for _ in range(2))

In [None]:
X_input_ids.shape

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every comment in ``df`` and store the result in ``ids`` / ``masks``.

    Args:
        df: DataFrame whose ``DATA_COLUMN`` column (module-level constant)
            holds the text to encode.
        ids: Pre-allocated 2-D array with one row per comment; filled in place
            with the token ids. Its width is used as the padded/truncated
            sequence length.
        masks: Pre-allocated array of the same shape; filled in place with the
            attention mask.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` here).

    Returns:
        The same ``ids`` and ``masks`` objects, now populated.
    """
    # Derive the sequence length from the destination array so the two can
    # never disagree (it is 256 for the arrays allocated in this notebook).
    seq_len = ids.shape[1]

    # tqdm wraps the column itself (which has a length) instead of the
    # enumerate object (which has none), so the bar shows a total and an ETA.
    for i, text in enumerate(tqdm(df[DATA_COLUMN], total=len(df))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=seq_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # presumably each encoding is shaped (1, seq_len) and is broadcast into
        # row i — verify if the tokenizer call is changed
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
X_input_ids, X_attn_masks = generate_training_data(train, X_input_ids, X_attn_masks, tokenizer)

In [None]:
X_input_ids

#### Generate Labels

In [None]:
# NOTE(review): this zero array is only used to display a shape. The next cell
# rebinds `labels` to the real label values, so these zeros are never read.
labels = np.zeros((len(train), 6))
labels.shape

In [None]:
# Select the targets by name rather than by column position: the label matrix
# then always has exactly the LABEL_COLUMNS columns, in that order, even if the
# CSV gains, loses or reorders other columns.
labels = train[LABEL_COLUMNS].to_numpy()
labels

In [None]:
type(labels)

# Create Dataset 
(in Tensorflow acceptable format)

In [None]:
# Build a tf.data pipeline over the in-memory arrays. Slicing along the first
# axis yields one (input_ids, attention_mask, labels) triple per comment.
model_arrays = (X_input_ids, X_attn_masks, labels)
dataset = tf.data.Dataset.from_tensor_slices(model_arrays)
dataset.take(1)  # peek at a single element

In [None]:
def DatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one dataset element as a ``(features, labels)`` pair.

    The features are returned as a dict keyed by 'input_ids' and
    'attention_mask'; ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# converting to required format for tensorflow dataset
dataset = dataset.map(DatasetMapFunction) 

In [None]:
dataset.take(1)

In [None]:
# Shuffle once, then form batches of 16, dropping the final partial batch.
# reshuffle_each_iteration=False freezes the shuffled order. The
# train/validation split further down uses take()/skip() on this dataset; if
# the data were reshuffled on every pass, different samples would fall on each
# side of the split every epoch and validation samples would leak into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(16, drop_remainder=True)

In [None]:
dataset.take(1)

In [None]:
# With a batch size of 16 the pipeline yields len(train) // 16 full batches.
# train_size is the number of those batches (90%) reserved for training.

p = 0.9
n_batches = len(train) // 16
train_size = int(n_batches * p)

In [None]:
train_size

In [None]:
# The dataset is already batched, so train_size counts batches: the first
# train_size batches are used for training, all remaining ones for validation.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Model Creation

In [None]:
from transformers import TFBertModel

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
# Two 256-wide int32 input layers. Their names ('input_ids', 'attention_mask')
# are the same strings DatasetMapFunction uses as keys of the feature dict.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

In [None]:
# Intermediate layers: BERT encoder followed by a 512-unit ReLU projection.

bert_embds = bert_model.bert(input_ids, attention_mask=attn_masks)[1] 
# Index [1] selects the pooled output (2D: one vector per comment);
# index [0] would be the per-token activations (3D).

intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)


In [None]:
# Six sigmoid units, one per label column. Sigmoid (rather than softmax) scores
# every label independently, so one comment can receive several labels at once.
output_layer = tf.keras.layers.Dense(6, activation='sigmoid', name='output_layer')(intermediate_layer) 

In [None]:
# Wire the two input layers to the sigmoid output to obtain the trainable
# model, then print its layer-by-layer summary.

model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

In [None]:
# compile() only attaches the optimizer, the loss and the metrics to the model.
# It does not touch the (pretrained) weights, so it can be called as often as
# needed.

# Adam's `decay` keyword belongs to the legacy optimizer API and is rejected by
# newer tf.keras releases. InverseTimeDecay with decay_steps=1 produces the same
# schedule, lr = 1e-5 / (1 + 1e-6 * step), and is accepted by both old and new
# optimizer implementations.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-5,
    decay_steps=1,
    decay_rate=1e-6,
)
optim = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Binary cross-entropy and binary accuracy match the sigmoid output layer.
loss_func = tf.keras.losses.BinaryCrossentropy()

acc = tf.keras.metrics.BinaryAccuracy('accuracy')

In [None]:
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

# Train the model

In [None]:
# if os.path.exists('bert_offensive_model'):
#   model = tf.keras.models.load_model('bert_offensive_model')

In [None]:
# train_dataset is already batched and train_size is already a number of
# batches, so no steps_per_epoch is passed: Keras then iterates over every
# training batch once per epoch. Passing train_size // 16 divided by the batch
# size a second time and limited each epoch to a sixteenth of the training
# batches.
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
)

# Directly load model from AWS S3

In [None]:
# Download the folder output from S3
def download_s3_folder(s3, bucket_name, s3_folder, local_dir=None):
    """Download every object stored under an S3 prefix to the local disk.

    Args:
        s3: boto3 S3 service resource (anything exposing ``Bucket(name)``).
        bucket_name: Name of the bucket to read from.
        s3_folder: Key prefix to download.
        local_dir: Destination directory. When ``None`` each object is written
            to a relative path equal to its key; otherwise the path is the key
            relative to ``s3_folder``, placed under ``local_dir``.
    """
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        if local_dir is None:
            target = obj.key
        else:
            target = os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))

        # A key without any directory part has an empty dirname, and
        # os.makedirs('') raises, so only create a parent when there is one.
        # exist_ok avoids the check-then-create race.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Keys ending in '/' are folder placeholders: nothing to download.
        if obj.key.endswith('/'):
            continue
        bucket.download_file(obj.key, target)
        
download_s3_folder(s3, 'test-rohith-1', 'bert-model-files')

# Prediction

In [None]:
loaded_model = tf.keras.models.load_model('bert-model-files')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# NOTE(review): this rebinds `loaded_model` (already set from 'bert-model-files'
# in an earlier cell) to a model read from 'my_model_atf.h5'. No cell visible in
# this notebook creates or downloads that file — confirm this cell is still needed.
loaded_model = tf.keras.models.load_model('my_model_atf.h5', custom_objects={'bert_embeds': bert_model})
# loaded_model = tf.keras.models.load_model('my_model_atf.h5', custom_objects={'bert_embeds': bert_model.bert(input_ids, attention_mask=attn_masks)[1]})


In [None]:
def prepare_data(input_text, tokenizer):
    """Encode one piece of text into the feature dict passed to the model.

    The text is tokenized with the same settings as the training data
    (special tokens added, truncated/padded to 256 tokens), and both resulting
    tensors are cast to ``tf.float64``.

    Args:
        input_text: Text to encode.
        tokenizer: Object exposing ``encode_plus``.

    Returns:
        Dict with the keys 'input_ids' and 'attention_mask'.
    """
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='tf',
    )
    feature_names = ('input_ids', 'attention_mask')
    return {
        name: tf.cast(getattr(encoding, name), tf.float64)
        for name in feature_names
    }

In [None]:
# Interactive prompt: this cell waits until a comment has been typed in.
input_text = input('Enter your comment: ')

# Encode the comment and run the model on it.
processed_data = prepare_data(input_text, tokenizer)
# [0] keeps the first row of the prediction output — presumably the only row,
# since a single comment was encoded; verify if batching is ever added.
probs = loaded_model.predict(processed_data)[0]

In [None]:
# Header line, then one "label::::probability" line per category.
print("CATEGORY::::PROBABILITY")
for category, probability in zip(LABEL_COLUMNS, probs):
    print(category, probability, sep="::::")

In [None]:
import matplotlib.pyplot as plt

# Plot from separately named copies. Rebinding `labels` here would clobber the
# training label array, and mutating `probs` would make this cell fail when it
# is run a second time.
bar_labels = list(LABEL_COLUMNS)
bar_values = np.asarray(probs, dtype=float)

fig, ax = plt.subplots(figsize=(7, 7))
ax.barh(bar_labels, bar_values, color='red')

# Pin the axis to the full probability range explicitly, instead of adding an
# invisible dummy bar of length 1 to stretch it.
ax.set_xlim(0, 1)
ax.set_xticks(np.linspace(0, 1, 11))
ax.set_xlabel('Predicted probability')
ax.set_title('Predicted probability per category')
plt.show()

# Upload folders if needed

In [None]:
# Upload the folder to S3
def upload_objects(s3, bucket_name, root_path, key_folder_name):
    """Upload a local directory tree to S3, preserving its sub-folder layout.

    A file at ``<root_path>/<sub/dir>/<name>`` is uploaded under the key
    ``<key_folder_name>/<sub/dir>/<name>``; files directly inside
    ``root_path`` go to ``<key_folder_name>/<name>``.

    Args:
        s3: boto3 S3 service resource (anything exposing ``Bucket(name)``).
        bucket_name: Name of the destination bucket.
        root_path: Local directory to upload.
        key_folder_name: Key prefix ("folder") to upload under.

    Any exception is caught and printed rather than raised, so a failed upload
    stops the remaining files but does not abort the notebook.
    """
    try:
        my_bucket = s3.Bucket(bucket_name)

        for path, _subdirs, files in os.walk(root_path):
            # Build the key as prefix + relative sub-directory. Concatenating
            # the sub-directory in front of the prefix produced keys such as
            # '/variablesbert-model-files/<file>' for nested files.
            key_parts = [key_folder_name]
            rel_dir = os.path.relpath(path, root_path)
            if rel_dir != os.curdir:
                # S3 keys always use '/', whatever the local separator is.
                key_parts.extend(rel_dir.split(os.sep))

            for file in files:
                my_bucket.upload_file(os.path.join(path, file), '/'.join(key_parts + [file]))

    except Exception as err:
        print(err)

In [None]:
upload_objects(s3, 'test-rohith-1', './bert_offensive_model', 'bert-model-files')

In [None]:
upload_objects(s3, 'test-rohith-1', './bert_offensive_model/assets', 'bert-model-files/assets')

In [None]:
upload_objects(s3, 'test-rohith-1', './bert_offensive_model/variables', 'bert-model-files/variables')