<a href="https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/classification_test_bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
%%capture
# Suppressing cell output
!pip install torch
import torch

In [None]:
# Detect GPU support and report the installed PyTorch version
cuda_available = torch.cuda.is_available()
print('CUDA is available: ' + str(cuda_available))
print('PyTorch version: ' + str(torch.__version__))

# A bare torch.device(...) expression only builds an object and drops it,
# so the selected device is bound to a name (with a CPU fallback)
device = torch.device('cuda' if cuda_available else 'cpu')

CUDA is available: True
PyTorch version: 1.8.1+cu101


In [None]:
!nvidia-smi

Sun Apr 11 20:24:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import time
import sys
import json
import numpy as np
import pickle
import shutil

In [None]:
%%capture
# Suppressing cell output
!pip install datasets
!pip install protobuf
!pip install simpletransformers

# Note: If you're facing issues on Colab,
# restart the runtime and rerun from this cell

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report
import simpletransformers
import logging
import pandas as pd

In [None]:
# Set data name and path
data_name = 'data'
data_path = 'drive/My Drive/data'
# To reference data stored on Google Drive (as above), mount the drive first

# Expectation:
# data_path directory should contain train, val, test jsons
# data-points should be present as a list of dicts
# with each dict having a 'source', and a 'target'

def _load_split(split_name):
  """Parse and return the contents of <data_path>/<split_name>.json."""
  # Plain read mode: 'r+' would also request write access, which fails on
  # read-only files. The encoding is pinned so that parsing does not depend
  # on the locale default.
  with open(data_path + '/' + split_name + '.json', 'r', encoding = 'utf-8') as f:
    return json.load(f)

raw_train = _load_split('train')
raw_val = _load_split('val')
raw_test = _load_split('test')

In [None]:
# Sanity-checking the loaded splits
# All three splits must be the same kind of container
train_type = type(raw_train)
assert train_type == type(raw_val)
assert train_type == type(raw_test)
print('Raw data object type: ' + str(train_type))
print()

# Listing the fields of the first training data-point
print('Fields in the raw data: ')
unit = raw_train[0]

for field in unit:
  print('• ' + str(field))

Raw data object type: <class 'list'>

Fields in the raw data: 
• source
• target


In [None]:
# To test out the procedure with small amounts of data
# global_testing_mode: set non-zero to switch the notebook into testing mode
# global_testing_unit_count: number of leading data-points read from each
# split by create_set while testing mode is on
global_testing_mode = 0
global_testing_unit_count = 512

In [None]:
# Reporting the size of every raw split
print('Number of Samples in: ')
for label, split in (('• train: ', raw_train),
                     ('• val: ', raw_val),
                     ('• test: ', raw_test)):
  print(label + str(len(split)))

# Defining mappings for training
def create_set(set_name = 'train'):
  """Build the list of [source, target] pairs for one data split.

  Reads the module-level raw_train / raw_val / raw_test containers. When
  global_testing_mode is truthy, at most global_testing_unit_count leading
  data-points of the split are used.

  Args:
    set_name: one of 'train', 'val' or 'test'.

  Returns:
    A list of [source, target] lists, or -1 (after printing a message)
    when set_name does not name a known split.
  """
  if set_name == 'train':
    work_on = raw_train
  elif set_name == 'val':
    work_on = raw_val
  elif set_name == 'test':
    work_on = raw_test
  else:
    print('Invalid Data Split.')
    return -1

  data_size = len(work_on)
  if global_testing_mode:
    # Never request more data-points than the split holds; otherwise a split
    # smaller than global_testing_unit_count raises an IndexError below
    data_size = min(data_size, global_testing_unit_count)

  return [[work_on[index]['source'], work_on[index]['target']]
          for index in range(data_size)]

# Materialising every split as a list of [source, target] pairs
train, val, test = [create_set(split_name) for split_name in ('train', 'val', 'test')]

Number of Samples in: 
• train: 17348
• val: 2478
• test: 4957


In [None]:
# Getting number of positive and negative samples in train split
total_in_train = len(train)
correct_imbalance = True

# unit[1] is the target of a data-point; summing the targets counts the
# positives, assuming the labels are 0/1 — TODO confirm against the data
positive_in_train = sum(unit[1] for unit in train)
negative_in_train = total_in_train - positive_in_train

print('Number of positive samples: ' + str(positive_in_train))
print('Number of negative samples: ' + str(negative_in_train))

# Weights to correct the class imbalance
# Order: [negative-class weight, positive-class weight]
# The larger class gets weight 1, the smaller one a proportionally larger weight
greater_class_count = max(negative_in_train, positive_in_train)

if negative_in_train <= 0 or positive_in_train <= 0:
  # A class is absent from the train split: its weight would need a division
  # by zero, so fall back to uniform weights instead of crashing
  print('A class is missing from the train split; using uniform class weights.')
  class_weights = [1, 1]
elif not correct_imbalance:
  # Disabling weighing of classes
  class_weights = [1, 1]
else:
  class_weights = [greater_class_count / negative_in_train,
                   greater_class_count / positive_in_train]

Number of positive samples: 14450
Number of negative samples: 2898


In [None]:
# Defining dataframes
def _to_frame(rows):
  """Wrap [source, label] rows in a DataFrame with named columns."""
  frame = pd.DataFrame(rows)
  frame.columns = ['source', 'label']
  return frame

train_df = _to_frame(train)
val_df = _to_frame(val)

# Verifying correctness
train_df

Unnamed: 0,source,label
0,Early bird get the worm.,0
1,if youre not cheering for mexico youre a littl...,1
2,RT : These hoes ain't loyal,1
3,""" lames crying over hoes thats tears of a clown """,1
4,: I feel sorry for da females that dont get wh...,1
...,...,...
17343,When a hoe tryna get at you while ya girl is l...,1
17344,Fuk wit a real nicca,1
17345,"naw nigga what's up with em hands, what happen...",1
17346,Yous a bitch ass nigga &;: Rt i Push the tape ...,1


In [None]:
%%capture
# Leveraging a pre-trained Transformer Model

model_index = 1
# Set 0 for bert-base, 1 for roberta-base

model_loc = ['bert-base-uncased', 'roberta-base'][model_index]
model_type = ['bert', 'roberta'][model_index]

# Lower-casing is enabled only for index 0 ('bert-base-uncased')
is_lower = model_index == 0

length_setting = 256
batch_size = 32
num_epochs = 4
num_gpus = 4
# NOTE(review): num_gpus is hard-coded — confirm it suits the runtime in use

name_suffix = ''
if global_testing_mode:
  # Lighter settings for a quick end-to-end check
  # Truthiness is used here, as in create_set, so both react to the same flag values
  name_suffix = '_testing'
  num_epochs = 2
  length_setting = 64

# The names are derived only after the testing overrides, so that they carry
# the sequence length actually used and a testing run gets its own cache
# directory instead of sharing the one of a full run
model_name = model_loc + '_' + data_name + '_' + str(length_setting) + name_suffix
cache_name = model_name + '_cache_dir'

model_args = ClassificationArgs(train_batch_size = batch_size,
                                max_seq_length = length_setting,
                                save_steps = -1,
                                n_gpu = num_gpus,
                                num_train_epochs = num_epochs,
                                evaluate_during_training = True,
                                overwrite_output_dir = True,
                                save_eval_checkpoints = False,
                                save_model_every_epoch = False,
                                cache_dir = cache_name,
                                fp16 = True,
                                manual_seed = 42,
                                do_lower_case = is_lower,
                                best_model_dir = model_name)

# class_weights holds one weight per label, in label order
model = ClassificationModel(model_type,
                            model_loc,
                            use_cuda = cuda_available,
                            args = model_args,
                            num_labels = 2,
                            weight = class_weights)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# Training

start = time.time()
model.train_model(train_df, eval_df = val_df)
end = time.time()
time_to_train = int(round(end - start))

# Splitting the elapsed whole seconds into hours, minutes and seconds
leftover = time_to_train % 3600
hours = int(time_to_train / 3600)
minutes = int(leftover / 60)
seconds = time_to_train % 60

print()
for label, value in (('Number of Epochs: ', num_epochs),
                     ('Maximum Sequence Length: ', length_setting),
                     ('Batch size: ', batch_size)):
  print(label + str(value))

# Zero-padded HH:MM:SS reading
clock = ':'.join(str(part).zfill(2) for part in (hours, minutes, seconds))
print('Time taken for training: ' + clock)

In [None]:
# Inference
infer_now = True

if infer_now:
  # Reload the model stored under model_name (the best_model_dir of training)
  # use_cuda mirrors the training-time setting: leaving it at the library
  # default would make this cell fail on a runtime without a GPU
  model = ClassificationModel(model_type, model_name, use_cuda = cuda_available)
  print('Using Model: ' + str(model_name))
  print()

  # Each data-point is a [source, target] pair
  val_sources = [unit[0] for unit in val]
  test_sources = [unit[0] for unit in test]

  val_targets = [unit[1] for unit in val]
  test_targets = [unit[1] for unit in test]

  # Evaluation on val data
  print('Results on the validation split: ')
  val_predictions, val_outputs = model.predict(val_sources)
  print(classification_report(val_targets, val_predictions, digits = 6))
  print()

  # Evaluation on test data
  print('Results on the test split: ')
  test_predictions, test_outputs = model.predict(test_sources)
  print(classification_report(test_targets, test_predictions, digits = 6))

In [None]:
# Optionally zip the model directory and the cache directory
compress_model = False
if compress_model:
  for directory in (model_name, cache_name):
    # Archive <directory>.zip is built from the contents of <directory>
    shutil.make_archive(directory, 'zip', directory)

In [None]:
# ^_^ Thank You