<a href="https://colab.research.google.com/github/sayarghoshroy/Hate-Speech-Detection/blob/master/pytorch_classifier_adaptive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Link to Saved Models: [Here](https://drive.google.com/file/d/1_GDUTiTo149x8oFN1aqsLKFszRCfbAwF/view?usp=sharing)

In [2]:
!pip install nltk
!pip install bert-tensorflow
!pip install transformers
!pip install seaborn
!pip install sklearn-crfsuite
!pip install -U sentence-transformers
import nltk
nltk.download('all')

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/20/16/0f9376af49c6adcfbaf2470a8f500105a74dd803aa54ac0110af445837b5/bert_tensorflow-1.0.4-py2.py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 4.0MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.4
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 8.3MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 28.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a

True

In [3]:
import random
import pickle
import re
import time
import datetime

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
# Check where we need this
# from nltk.corpus import stopwordsm
from nltk.tokenize import sent_tokenize

import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import Dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelWithLMHead


import torch.nn as nn
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import gensim.models as gsm

from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


from tqdm import tqdm 
import gc
import os

  import pandas.util.testing as tm


In [4]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
data_loc = '/content/drive/My Drive/2020_processed_data/'

In [8]:
e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/My Drive/emoji2vec.bin', binary=True)

def getEmojiEmbeddings(emojiList,dim=300,verbose = False):
  # Generates an emoji vector by averaging the emoji representation for each emoji
  # If no emoji returns an empty list of dimension dim
  if dim < 300:
    raise IndexError("Dim has to be greater than 300")
  result = np.zeros(dim)
  if (len(emojiList) == 0):
    return result
  else:
    embs = None
    for i in emojiList:
      if verbose:
        if i not in e2v.vocab:
          print(i)
    embs = np.mean([e2v[i] for i in emojiList if i in e2v.vocab], axis=0)
  if np.any(np.isnan(embs)):
    return result
  result[:300] = embs
  return result 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [9]:
def loadData(lang):
  """ Function to load data for one language from the preprocessed pickle file"""
  if lang not in ['hi','en','ge']:
      raise NameError("Language not found")
  fileName = lang + '.pickle'
  with open(DATASET_ROOT+fileName, 'rb') as f:
    ged = pickle.load(f)
  df = pd.DataFrame.from_dict(ged)
  train_df, test_df = model_selection.train_test_split(df, random_state = 42, test_size = 0.25)
  return train_df, test_df, df

def loadDataAllLangs():
  """ Function to load data for all languages from the preprocessed pickle file"""

  hi_train,hi_test,hi_df = loadData('hi')
  en_train,en_test,en_df = loadData('en')
  ge_train,ge_test,ge_df = loadData('ge')
  print("total size:", len(ge_df) + len(hi_df)+len(en_df))
  train_df = pd.concat([hi_train,en_train,ge_train],ignore_index=True)
  test_df =  pd.concat([hi_test,en_test,ge_test],ignore_index=True)
  df = pd.concat([hi_df,en_df,ge_df],ignore_index=True)
  train_df = train_df.sample(frac = 1, random_state=42)
  test_df = test_df.sample(frac = 1, random_state=42)
  df = df.sample(frac = 1, random_state=42)
  return train_df,test_df,df

class HASOCDataset(Dataset):
  """ Data loader to load the data for the Torch """
  def __init__(self, dataPath, isDF = False):
    if isDF:
      self.df = pd.DataFrame.from_dict(dataPath)
    else:
      data = pickle.load(open(dataPath,'rb'))
      self.df = pd.DataFrame.from_dict(data)
  def __len__(self):
    return len(self.df)
  def __getitem__(self,index):
    return self.df.iloc[index]

In [10]:
def set_seed(seed):
     # """ Sets all seed to the given value, so we can reproduce (:3) """
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
set_seed(42)

In [11]:
class FullExample(object):
  """ Not necessary any more, mainly here in case we might need to use the entire thing. """
  def __init__(self, id, task_1, task_2, hasoc_id, full_tweet, tweet_raw_text, hashtags, smiley, emoji, url, mentions, numerals, reserved_word, segmented_hash):
    self.id  = id
    self.task_1 = task_1
    self.task_2 = task_2
    self.hasoc_id = hasoc_id
    self.full_tweet = full_tweet
    self.tweet_raw_text = tweet_raw_text
    self.hashtags = hashtags
    self.smiley = smiley
    self.emoji = emoji
    self.url = url 
    self. mentions = mentions 
    self.numerals = numerals
    self.reserved_word = reserved_word
    self.segmented_hash = segmented_hash
  
class Example(object):
  """ Contains the data for one example from the dataset """
  def __init__(self, id, task_1, task_2, hasoc_id, full_tweet, tweet_raw_text,  emoji,  segmented_hash):
    self.id  = id
    self.task_1 = task_1
    self.task_2 = task_2
    self.hasoc_id = hasoc_id
    self.full_tweet = full_tweet
    self.tweet_raw_text = tweet_raw_text
    self.emoji = emoji
    self.segmented_hash = segmented_hash

class ExampleFeautres(object):
    """ Contains the dataset in a batch friendly feaute set """
    def __init__(self, id, task_1, task_2, input_ids, input_mask,input_length,  emoji,  hash):
      self.id  = id
      self.task_1 = task_1
      self.task_2 = task_2
      self.emoji = torch.tensor(emoji)
      self.input_ids = input_ids
      self.input_mask = input_mask
      self.input_length = input_length 
      self.hash = torch.tensor(hash)

In [12]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens")
max_seq_length = 74
# e2v = gsm.KeyedVectors.load_word2vec_format('/content/drive/My Drive/emoji2vec.bin', binary = True)
sent_encoder = SentenceTransformer('xlm-r-100langs-bert-base-nli-mean-tokens')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=541.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=147.0, style=ProgressStyle(description_…




100%|██████████| 1.01G/1.01G [00:13<00:00, 77.0MB/s]


In [13]:
labels_task1 = {'NOT':0, 'HOF':1}
labels_task2 = {'NONE':0,'PRFN':1,'OFFN':2,'HATE':3}

def convertExamplesToFeature(example):
  """ Given a data row convert it to feautres so it's batch friendly """
  raw_text = example.tweet_raw_text
  tokens = tokenizer.tokenize(raw_text)
  if (len(tokens) > (max_seq_length-2)):
    tokens = tokens[: (max_seq_length-2)]
  tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)
  input_length = len(input_ids)
  padding = [0] * (max_seq_length - len(input_ids))
  input_ids += padding
  input_mask += padding
  hashtags = ' '.join(example.segmented_hash)
  hashembs = sent_encoder.encode(hashtags)
  # Do we want to propage the values across hashtags? ~ Prolly not
  # but the following code keeps that provision in case we need it. 
  # hashtags = []
  # hashtokens = tokenizer.tokenize(hashtags)
  # if (len(hashtokens) > (max_hash_length-2)):
  #   hashtokens = tokens[: (max_hash_length-2)]
  # tokens = [tokenizer.cls_token] + hashtokens + [tokenizer.sep_token]
  # hashinput_ids = tokenizer.convert_tokens_to_ids(hashtokens)
  # hashinput_mask = [1] * len(hashinput_ids)
  # input_length = len(hashinput_ids)
  # padding = [0] * (max_hash_length - len(hashinput_ids))
  # input_ids += padding
  # input_mask += padding
  emojiVec = getEmojiEmbeddings(example.emoji)
  task1 = labels_task1[example.task_1]
  task2 = labels_task2[example.task_2]
  id = example.id
  return ExampleFeautres(id, task1, task2, input_ids, input_mask, input_length, emojiVec, hashembs)

In [14]:
def getDataset(input_features):
    """
    Mappings for index-> features 
    0 -> ID
    1 -> input ids
    2 -> input masks
    3 -> input lengths 
    4 -> hash embs 
    5 -> emoji embs 
    6 -> task1
    7 -> task2
    """
    all_input_page_ids = torch.tensor([f.id for f in input_features], dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in input_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in input_features], dtype=torch.long)
    all_input_lengths = torch.tensor([f.input_length for f in input_features], dtype=torch.long)
    all_hash_embs = torch.stack([f.hash for f in input_features])
    all_emoji_embs = torch.stack([f.emoji for f in input_features])
    all_task_1 = torch.tensor([f.task_1 for f in input_features], dtype=torch.long)
    all_task_2 = torch.tensor([f.task_2 for f in input_features], dtype=torch.long)

    dataset = TensorDataset(all_input_page_ids, all_input_ids, all_input_mask,all_input_lengths, all_hash_embs, all_emoji_embs, all_task_1,  all_task_2)
    return dataset 

In [15]:
def train_val_dataset(dataset, val_split = 0.2):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['valid'] = Subset(dataset, val_idx)
    return datasets

def getDataloader(path_to_pickle, val_split = 0.2, batch_size = 16, multiLing = True):
  if multiLing:
    tr,tt,df = loadDataAllLangs()
    tempDataset = HASOCDataset(df, isDF=True)
  else:
    tempDataset = HASOCDataset(path_to_pickle)
  input_features = []
  for i in tqdm(range(len(tempDataset))):
    example = Example(i,tempDataset[i]['task_1'],tempDataset[i]['task_2'],tempDataset[i]['hasoc_id'], tempDataset[i]['full_tweet'],tempDataset[i]['tweet_raw_text'], tempDataset[i]['emoji'],tempDataset[i]['segmented_hash'])
    input_feature = convertExamplesToFeature(example)
    input_features.append(input_feature)
  dataset = getDataset(input_features)
  # print(len(dataset))
  set_seed(42)
  data_sampler = RandomSampler(dataset)
  dd = train_val_dataset(dataset, val_split)
  train_dataloader = DataLoader(dd['train'], sampler = RandomSampler(dd['train']), batch_size=batch_size, drop_last=True)
  valid_dataloader = DataLoader(dd['valid'] , batch_size=batch_size, drop_last=True)
  dataloader = DataLoader(dataset , batch_size=batch_size, drop_last=True)
  dataloaders = {x:DataLoader(dd[x], 32, shuffle = True, num_workers = 4) for x in ['train','valid']} 

  return train_dataloader, valid_dataloader, dataloader, dataloaders

In [16]:
DATASET_ROOT = data_loc
train_dataloader, valid_dataloader, dataloader, dataloaders = getDataloader(data_loc , multiLing = True)

  0%|          | 0/9044 [00:00<?, ?it/s]

total size: 9044


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 9044/9044 [02:35<00:00, 58.10it/s]


In [17]:
# Here cause restarting the code sometimes causes Colab to crash, so don't wanna waste
# time waiting for the entire thing to run again. 

# import pickle
# with open('train.pickle','wb') as handle:
#   pickle.dump(train_dataloader, handle)
# with open('valid.pickle','wb') as handle:
#   pickle.dump(valid_dataloader, handle)
# with open('train.pikcle','rb') as handle:
#   train_dataloader = pickle.load(handle)
# with open('valid.pickle','rb') as handle:
#   valid_dataloader = pickle.load( handle)

In [18]:
class ClassificationHead(nn.Module):
  """ Classification head for the Roberta Model """ 
  def __init__(self, numberOfClasses, hidden_size_bert, hidden_size_post_feats, dropout_val = 0.2):
    super().__init__()
    self.denseInit = nn.Linear(hidden_size_post_feats, hidden_size_bert)
    self.dense = nn.Linear(hidden_size_bert, hidden_size_bert)
    self.dropout = nn.Dropout(dropout_val)
    self.output = nn.Linear(hidden_size_bert, numberOfClasses)
  def forward(self, x):
    # print(x.shape)
    x = self.dropout(x)
    x = self.denseInit(x)
    x = torch.tanh(x)
    x = self.dropout(x)
    x = self.dense(x)
    x  = torch.tanh(x)
    x = self.dropout(x)
    x  = self.output(x)
    return x

class TextClassification(nn.Module):
  """ Classifier with feature injection """
  def __init__(self, numberOfClasses,dropout_val = 0.1, batch_size = 16):
     super(TextClassification, self).__init__()
     self.bert = XLMRobertaModel.from_pretrained("sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens")
     self.classifier = ClassificationHead(numberOfClasses, self.bert.config.hidden_size, (self.bert.config.hidden_size * 2 + 300) , dropout_val)
  def forward(self, input_seq, attention_mask, emoji, hashTag):
    bert_pooled_output = self.bert(input_seq, attention_mask=attention_mask)[0]
    bert_pooled_output = bert_pooled_output[:, 0, :]
    bert_pooled_out_feat = torch.cat([bert_pooled_output, emoji, hashTag], axis = 1)
    # print("Shape",bert_pooled_out_feat.shape)
    output = self.classifier(bert_pooled_out_feat)
    return output

In [19]:
model_name = 'adaptive'
model_loc = '/content/drive/My Drive/hasoc_saved/'

In [20]:
def modelEvaluate(model, valid_dataloader = valid_dataloader, task = 1):
  gc.collect()
  if task == 1:
    taskIndex = 6
  elif task == 2:
    taskIndex = 7
  model.eval()
  predictions, true_labels = [], []
  logits = []
  # Predict 
  for batch in valid_dataloader:
    # Add batch to GPU
    b_input_ids = batch[1]
    b_input_mask = batch[2]
    b_labels = batch[taskIndex]
    b_emoji = batch[5]
    b_hashtag = batch[4]
    with torch.no_grad():
      pred = model(b_input_ids,b_input_mask ,b_emoji.float(), b_hashtag.float())
    logits.append(pred.detach().cpu().numpy())
    label_ids = b_labels.to('cpu').numpy()
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    flat_true_labels = np.concatenate(true_labels, axis = 0)
    predictions = []
    for i in logits:
      for j in i:
        predictions.append(j)
    flat_predictions = [np.argmax(i) for i in predictions]
    assert(len(flat_predictions) == len(flat_true_labels))
    return flat_predictions, flat_true_labels

In [21]:
path = model_loc + model_name + ".pt"
scale = 1
# Change these: It will save automatically during training

In [22]:
def make_optim(model, rate = 2e-5):
  return AdamW(model.parameters(),
                lr = rate, # default = 5e-5, using 2e-5
                eps = 1e-8) # default = 1e-8

def train_model(train_dataloader, valid_dataloader, numberOfEpochs = 10, task = 1):
  """ Train Loop for the model """
  scale = 1
  if task == 2:
    classNum = 4
    taskIndex = 7
  elif task == 1:
    classNum = 2
    taskIndex = 6
  else:
    raise NameError("Task not defined")
  total_steps = len(train_dataloader)
  print("Start")

  model = TextClassification(classNum) # task 1 
  if device == "gpu":
    model.cuda()
  
  loss_function = nn.CrossEntropyLoss().to(device)
  epoch_loss = 0
  batch_accuracy_scores = []
  global_pred = []
  global_label = []

  present_rate = 2e-5
  old_best = -1
  epoch = 0

  while(1):
    # when the learn rate falls below a lower threshold, you stop your training
    # until that moment, march on
    epoch += 1
    print("\nEpoch:", epoch)
    print("Present Rate: " + str(present_rate))
    optimizer = make_optim(model, present_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_steps)
    gc.collect()
    model.train()
    epoch_loss = 0
    batch_accuracy_scores = []
    train_data_count = float(len(train_dataloader))

    # to check if performance with default weights
    predictions, true_labels = modelEvaluate(model, valid_dataloader, task)
    score_now = f1_score(true_labels, predictions, average = 'macro')
    print("Validation Macro: " + str(score_now))

    if (score_now > old_best):
      print("Continuing on track")
      old_best = score_now

      # delete previous best 
      delete_filename = path
      open(delete_filename, 'w').close() # overwrite and make the file blank instead
      os.remove(delete_filename) # delete the blank file from google drive will move the file to bin instead
      torch.save(model.state_dict(), path)

    else:
      print("Backtrack")
      model.load_state_dict(torch.load(path))
      present_rate /= (4 * scale)
      scale *= 4
      if present_rate < 1e-8:
        break

    # For quick eval
    cnt = 0
    # for i, batch in tqdm(enumerate(train_dataloader)):
    for i, batch in enumerate(train_dataloader):
        print("Iter: " + str(cnt + 1))
        # COMMENT OUT THE NEXT 2 LINES IN ACTUAL TRAINING
        # if cnt == 4:
        #   break
        cnt += 1
        b_input_ids = batch[1]
        b_input_mask = batch[2]
        b_labels = batch[taskIndex]
        b_emoji = batch[5]
        b_hashtag = batch[4]
        pred = model(b_input_ids,b_input_mask ,b_emoji.float(), b_hashtag.float())
        loss = loss_function(pred.view(-1, classNum), b_labels.view(-1))
        with torch.no_grad():
          epoch_loss += (loss.item() * len(b_labels))
          global_pred.append(pred)
          global_label.append(b_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()

  return model

In [23]:
gc.collect()
model = train_model(train_dataloader, valid_dataloader, 2, task = 2)

Start


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1112256686.0, style=ProgressStyle(descr…



Epoch: 1
Present Rate: 2e-05
Validation Macro: 0.04166666666666667
Continuing on track
Iter: 1
Iter: 2
Iter: 3
Iter: 4
Iter: 5
Iter: 6
Iter: 7
Iter: 8
Iter: 9
Iter: 10
Iter: 11
Iter: 12
Iter: 13
Iter: 14
Iter: 15
Iter: 16
Iter: 17
Iter: 18
Iter: 19
Iter: 20
Iter: 21
Iter: 22
Iter: 23
Iter: 24
Iter: 25
Iter: 26
Iter: 27
Iter: 28
Iter: 29
Iter: 30
Iter: 31
Iter: 32
Iter: 33
Iter: 34
Iter: 35
Iter: 36
Iter: 37
Iter: 38
Iter: 39
Iter: 40
Iter: 41
Iter: 42
Iter: 43
Iter: 44
Iter: 45
Iter: 46
Iter: 47
Iter: 48
Iter: 49
Iter: 50
Iter: 51
Iter: 52
Iter: 53
Iter: 54
Iter: 55
Iter: 56
Iter: 57
Iter: 58
Iter: 59
Iter: 60
Iter: 61
Iter: 62
Iter: 63
Iter: 64
Iter: 65
Iter: 66
Iter: 67
Iter: 68
Iter: 69
Iter: 70
Iter: 71
Iter: 72
Iter: 73
Iter: 74
Iter: 75
Iter: 76
Iter: 77
Iter: 78
Iter: 79
Iter: 80
Iter: 81
Iter: 82
Iter: 83
Iter: 84
Iter: 85
Iter: 86
Iter: 87
Iter: 88
Iter: 89
Iter: 90
Iter: 91
Iter: 92
Iter: 93
Iter: 94
Iter: 95
Iter: 96
Iter: 97
Iter: 98
Iter: 99
Iter: 100
Iter: 101
Iter: 102

In [24]:
predictions, true_labels = modelEvaluate(model, valid_dataloader, 2)

In [25]:
f1_score(true_labels, predictions, average = 'macro')

0.49074074074074076

In [26]:
def loadModel(model_path, task = 1):
  """ Code to load a model based on the saved points """
  if task == 1:
    classNum = 2
  elif task == 2:
    classNum = 4
  else:
    raise NameError("No such task")
  model = TextClassification(classNum)
  model.load_state_dict(torch.load(model_path))
  return model

In [27]:
model1 = loadModel(path, 2)

In [28]:
predictions, true_labels = modelEvaluate(model1, valid_dataloader, 2)
f1_score(true_labels, predictions, average = 'macro')

0.49074074074074076

# Things left to do 

- [  ] use the test data and check @Ujwal
- [ ] Write the scripts to convert this to the submission format @ TR
- [ ]  Add perspective data
- [ ] Explore hyperparameters for Learning rate 
  - [x] Adaptive learning rate @Sayar 
  - [ ]  Look at different non linearities 
  - [ ] explore dropout
- [ ] Currently using 2 Linear layers, can 1 do better? 
- [ ] Explore different massive multiling models
  - [ ] Make a list of models to experiment with @TR, @Zubair
- [ ] Carry out per language analysis and find the stats. @TR
- [ ] Currently using CrossEntropy as the Loss function, but BCE should do better in a multi task setting.Explore that. @ Ujwal 
- [ ] Explore multi-task setting 






In [29]:
# ^_^ Thank You