In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 43.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
# Colab only: mount Google Drive at /content/drive. The CSV and the cache files
# used further down are addressed through paths below this mount point.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


## Import

In [None]:
import numpy as np
import random
import scipy.sparse as sp
import torch
import os
import time
import pandas as pd
import pickle
import copy

In [None]:
# Global seed shared by every random number generator used in this notebook.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes — confirm whether that is intended.
os.environ['PYTHONHASHSEED']=str(SEED)
# TensorFlow-specific switch; kept as-is although the models below are PyTorch.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'  # TF 2.1
random.seed(SEED)
# Seed PyTorch as well (kept off the last line so the cell shows no Generator repr).
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
import torch
import torch.nn.functional as F
from torch import optim
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score  
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset

## Dataset

In [None]:
def subwords_to_merge(tokenized_sequence, sentence, verbose):
  """Find the runs of sub-word tokens that together spell one word of `sentence`.

  The 'Ġ' marker is stripped from every token, then the tokens are walked in
  parallel with `sentence.split(' ')`. A word equal to a single token
  contributes nothing; a word that needs several consecutive tokens
  contributes the list of those token positions.

  Args:
    tokenized_sequence: list of token strings (e.g. from `tokenizer.tokenize`).
      The list is not modified.
    sentence: the text the tokens come from; words are split on single spaces.
    verbose: when truthy, print the words and the stripped tokens.

  Returns:
    A list of lists of consecutive token indices (each with at least two
    entries), in sentence order.

  Raises:
    IndexError: when the tokens cannot be lined up with the words (tokens run
      out, or the concatenated tokens stop being a prefix of the current word).
  """
  # Work on a stripped copy: str.replace already removes every occurrence, and
  # the caller's list must stay untouched.
  tokens = [token.replace('Ġ', '') for token in tokenized_sequence]
  words = sentence.split(' ')
  if verbose :
    print("words", words)
    print("token", tokens)

  n_tokens = len(tokens)
  list_subwords_to_merge = []
  j = 0  # position of the next unread token

  for word in words :
    if j >= n_tokens :
      raise IndexError("ran out of tokens before word %r of sentence %r" % (word, sentence))
    if tokens[j] == word :
      j = j + 1
      continue

    # The word is spread over several tokens: glue tokens until it is rebuilt.
    tmp_word = tokens[j]
    tmp_merge = [j]
    while tmp_word != word :
      j = j + 1
      # Appending tokens can only ever reach `word` while tmp_word is a prefix of it.
      if j >= n_tokens or not word.startswith(tmp_word) :
        raise IndexError("tokens do not line up with word %r of sentence %r" % (word, sentence))
      tmp_word = tmp_word + tokens[j]
      tmp_merge.append(j)
    list_subwords_to_merge.append(tmp_merge)
    j = j + 1
  return list_subwords_to_merge


def get_embedding(comment, tokenizer, model, verbose):
  """Embed `comment` with `model`, producing one vector per space-separated word.

  The comment is tokenized, run through the model, and the rows of
  `last_hidden_state[0]` that belong to the same word (as reported by
  `subwords_to_merge`) are collapsed into one row by an element-wise maximum.
  Rows of single-token words are kept unchanged.

  Args:
    comment: text to embed.
    tokenizer: object providing `tokenize(text)` and `__call__(text, return_tensors='pt')`.
    model: callable returning an object with a `last_hidden_state` attribute.
    verbose: forwarded to `subwords_to_merge`.

  Returns:
    A 2-D tensor with one row per word.
  """
  tokenized_sequence = tokenizer.tokenize(comment)
  subwords_indices = subwords_to_merge(tokenized_sequence, comment, verbose)
  encoded_input = tokenizer(comment, return_tensors='pt')
  # Inference only: skipping autograd avoids keeping the whole graph in memory.
  with torch.no_grad():
    output = model(**encoded_input)
  outputseq = output.last_hidden_state[0]

  # Index the merge groups by their first token so each step is a direct lookup.
  group_by_start = {group[0]: group for group in subwords_indices}

  final_embd = []
  n_tokens = len(outputseq)
  i = 0
  while i < n_tokens:
    group = group_by_start.get(i)
    if group is None:
      final_embd.append(outputseq[i])
      i = i + 1
    else:
      # Slicing clamps at the end of the sequence, so no explicit bound check is needed.
      merge, _ = torch.max(outputseq[group[0]:group[-1] + 1], 0)
      final_embd.append(merge)
      i = i + len(group)

  if not final_embd:
    # torch.stack rejects an empty list; hand back an empty (0, hidden) tensor instead.
    return outputseq[:0]
  return torch.stack(final_embd)

In [None]:
def sentence_process(sentence):
  """Normalise a raw comment before tokenization.

  Steps, in order:
    1. newlines become spaces and the characters " ' : . @ + = & ) ( , ; are removed;
    2. every non-ASCII character is removed;
    3. a space is inserted in front of every '!' and '?';
    4. runs of spaces are collapsed to a single space.

  Args:
    sentence: the raw comment text.

  Returns:
    The cleaned text (leading/trailing single spaces are not stripped).
  """
  # One translation pass replaces the chain of per-character replace loops.
  table = {ord(ch): None for ch in '"\':.@+=&)(,;'}
  table[ord('\n')] = ' '
  comment = sentence.translate(table)

  comment = ''.join(ch for ch in comment if ch.isascii())

  # A single replace is enough: any extra spaces are collapsed just below.
  comment = comment.replace('!', ' !').replace('?', ' ?')
  while '  ' in comment :
    comment = comment.replace('  ', ' ')
  return comment
  
class AffineTransform(object):
    """Turn a CSV of labelled comments into padded GPT-2 embeddings plus integer labels.

    `process()` caches its result as 'Affine_X.pt' and 'Affine_labels.pt'. The
    cache paths are built by plain string concatenation with `save_dir`, so
    `save_dir` must end with a path separator.
    """

    # 1-based positions (in iteration order) of rows that are always skipped.
    # NOTE(review): presumably rows on which the embedding step failed — confirm.
    SKIPPED_ROWS = frozenset([1344, 1345, 1445, 1446, 1528, 1529, 3243, 3244, 3773, 3776, 11888])
    # Stop as soon as this many comments have been embedded.
    MAX_SAMPLES = 10000
    # Comments with more words than this (after cleaning) are dropped.
    MAX_WORDS = 100
    # Columns read by name to order the rows: one group per class, neutral rows last.
    LABEL_COLUMNS = ('toxic', 'obscene', 'threat', 'insult', 'identity_hate')
    # (position in the itertuples row, label) pairs, checked in this order.
    # NOTE(review): label 6 is outside 0..5; confirm it cannot occur if a
    # 6-class model consumes these labels.
    _POSITION_TO_LABEL = ((3, 1), (4, 6), (5, 2), (6, 3), (7, 4))

    def __init__(self, filepath, save_dir):
        """Remember the CSV path and the directory prefix used for the cache files."""
        self.filepath = filepath
        self.save_dir = save_dir

    @staticmethod
    def _clean_words(raw_comment):
        """Return the words of `raw_comment` after cleaning, without empty words,
        numeric words of 5+ characters, or words containing 'http'."""
        words = [w for w in sentence_process(raw_comment).split(' ') if w != '']
        # Build a new list instead of removing while iterating, which skipped
        # the element following every removed one.
        words = [w for w in words if not (w.isnumeric() and len(w) >= 5)]
        return [w for w in words if 'http' not in w]

    @classmethod
    def _label_for_row(cls, row):
        """Map the values at positions 3..8 of an itertuples row to an integer label."""
        if all(row[k] == 0 for k in range(3, 9)):
            return 0
        for position, label in cls._POSITION_TO_LABEL:
            if row[position] == 1:
                return label
        return 5

    def process(self):
      """Load the cached tensors, or build them from the CSV and cache them.

      Returns:
        (X, labels): X has one row per kept comment, holding its flattened
        word embeddings right-padded with zeros to the longest comment;
        labels is a 1-D integer tensor of the same length.
        Returns -1 if the two lists ever get out of step (kept for
        backward compatibility).
      """
      x_path = self.save_dir + 'Affine_X.pt'
      labels_path = self.save_dir + 'Affine_labels.pt'

      # Only trust the cache when both halves are present.
      if os.path.exists(x_path) and os.path.exists(labels_path):
        return torch.load(x_path), torch.load(labels_path)

      train_df = pd.read_csv(self.filepath)

      # Delete comments that belong to several classes: the values of every
      # column from the third one on must sum to at most 1.
      train_df = train_df[train_df.iloc[:, 2:].sum(axis=1) <= 1]

      tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
      model = GPT2Model.from_pretrained('gpt2')

      # Merge the per-class frames in a fixed order, neutral comments last.
      label_columns = list(self.LABEL_COLUMNS)
      per_class = [train_df[train_df[column] == 1] for column in label_columns]
      neutral = train_df[(train_df[label_columns] == 0).all(axis=1)]
      final_df = pd.concat(per_class + [neutral])

      X = []
      labels = []
      for cpt, row in enumerate(final_df.itertuples(), start=1):
        print("cpt:", cpt)

        if cpt in self.SKIPPED_ROWS :
          continue
        if len(X) == self.MAX_SAMPLES or len(labels) == self.MAX_SAMPLES :
          if len(X) == len(labels) :
            break
          print("Embedding matrix and labels different shapes")
          return -1

        # row[2] is the second CSV column (row[0] is the index).
        words = self._clean_words(row[2])
        if not words :
          # Nothing left to embed after cleaning.
          continue
        if len(words) > self.MAX_WORDS : # overly long comments are treated as outliers
          print("Too long")
          continue
        comment = " ".join(words)

        # Get embeddings for the sentence
        outputseq = get_embedding(comment, tokenizer, model, False)
        X.append(torch.flatten(outputseq))
        labels.append(self._label_for_row(row))

      # Right-pad every flattened embedding with zeros up to the longest one.
      padded_len = max(vector.shape[0] for vector in X)
      X = torch.stack([torch.cat((vector, torch.zeros(padded_len - vector.shape[0]))) for vector in X])
      labels = torch.tensor(labels)
      torch.save(X, x_path)
      torch.save(labels, labels_path)

      return X, labels

# Build the embeddings from train.csv, or load them from the cache files in the
# same Drive folder; `dataset` is whatever process() returns (the (X, labels) pair).
dataset = AffineTransform('/content/drive/My Drive/IASD_tmp/NLP/train.csv', '/content/drive/My Drive/IASD_tmp/NLP/').process()

## MLP

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> BatchNorm1d -> ReLU -> Dropout(0.2) -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: size of each input vector.
            hidden_dim: width of the hidden layer.
            output_dim: number of output scores per sample.
        """
        super().__init__()
        self.linear1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.batchnorm = nn.BatchNorm1d(num_features=hidden_dim)
        self.dropout = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, input):
        """Return the raw (un-normalised) output scores for a batch of inputs."""
        hidden = self.batchnorm(self.linear1(input))
        hidden = self.dropout(F.relu(hidden))
        return self.linear2(hidden)

In [None]:
def train(model, params, trainloader, class_weights) :
  """Train `model` in place with a class-weighted cross-entropy loss.

  Args:
    model: the network to optimise.
    params: dict with keys 'optimizer' (optimizer class), 'learning_rate',
      'n_epochs' and 'verbose'.
    trainloader: iterable of (inputs, labels) batches.
    class_weights: per-class weight tensor passed to `F.cross_entropy`.
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')
  optimizer = params['optimizer'](model.parameters(), lr=params['learning_rate'])

  # Make sure dropout / batch-norm are in training mode.
  model.train()
  for epoch in range(params['n_epochs']):
    running_loss = 0.0
    n_accumulated = 0
    for i, (inputs, labels) in enumerate(trainloader):
      inputs = inputs.to(device)
      labels = labels.to(device)
      optimizer.zero_grad()
      outputs = model(inputs)
      loss = F.cross_entropy(outputs, labels, weight = class_weights)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      n_accumulated += 1
      if i % 100 == 0 and params['verbose']:
        # Average over the batches actually accumulated since the last report.
        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / n_accumulated:.3f}')
        running_loss = 0.0
        n_accumulated = 0

In [None]:
def evaluate(testloader, model):
  """Return the accuracy of `model` over every batch of `testloader`.

  The model is switched to evaluation mode for the duration of the call (so
  dropout and batch-norm behave deterministically) and restored afterwards.

  Args:
    testloader: iterable of (inputs, labels) batches.
    model: network whose output has one score per class along dimension 1.

  Returns:
    Fraction of samples whose arg-max score equals the label.

  Raises:
    ZeroDivisionError: if `testloader` yields no sample.
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  was_training = model.training
  model.eval()
  num_correct = 0
  num_tests = 0
  try:
    with torch.no_grad():
      for batched_graph, labels in testloader:
        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        pred = model(batched_graph)
        num_correct += (pred.argmax(1) == labels).sum().item()
        num_tests += len(labels)
  finally:
    model.train(was_training)

  return num_correct / num_tests

In [None]:
def predict(testloader, model):
  """Run `model` over every batch of `testloader` and collect outputs and labels.

  The model is switched to evaluation mode for the duration of the call and
  restored afterwards.

  Args:
    testloader: iterable of (inputs, labels) batches.
    model: network to evaluate.

  Returns:
    (preds, true_labels): the model outputs of all batches concatenated along
    dimension 0 (left on the model's device), and the matching labels
    concatenated along dimension 0.

  Raises:
    RuntimeError: if `testloader` yields no batch (nothing to concatenate).
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  was_training = model.training
  model.eval()
  preds = []
  true_labels = []
  try:
    with torch.no_grad():
      for batched_graph, labels in testloader:
        true_labels.append(labels)
        # Keep every batch, not only the last one, so preds line up with true_labels.
        preds.append(model(batched_graph.to(device)))
  finally:
    model.train(was_training)

  preds = torch.cat(preds, dim=0)
  true_labels = torch.cat(true_labels, dim=0)
  return preds, true_labels

In [None]:
# Hyper-parameter combinations to compare; each one is trained once per seed.
list_params = [{'n_epochs':20, 'learning_rate':0.001, 'batch_size':1024, 'hidden_dim':5000, 'optimizer': optim.Adam, 'verbose':False},
               {'n_epochs':20, 'learning_rate':0.01, 'batch_size':1024, 'hidden_dim':5000, 'optimizer': optim.Adam, 'verbose':False},
               {'n_epochs':20, 'learning_rate':0.001, 'batch_size':1024, 'hidden_dim':5000, 'optimizer': optim.SGD, 'verbose':False}, 
               {'n_epochs':20, 'learning_rate':0.001, 'batch_size':1024, 'hidden_dim':1000, 'optimizer': optim.Adam, 'verbose':False},
               {'n_epochs':20, 'learning_rate':0.001, 'batch_size':1024, 'hidden_dim':500, 'optimizer': optim.Adam, 'verbose':False},
               ]
random_states = [0,1,2]

# Picked once: the device does not depend on the hyper-parameters or on the seed.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Start the clock before the first combination so the final report covers the
# whole search, not just the last combination.
tic = time.time()

for p, params in enumerate(list_params) :

  test_seed_acc, val_seed_acc = [], []
  test_seed_f1, val_seed_f1 = [], []
  test_seed_auc, val_seed_auc = [], []
  for r in random_states :
    X, labels = shuffle(dataset[0], dataset[1], random_state=r)

    torch.manual_seed(r)

    # 80 % train, then two equally sized validation and test slices.
    num_examples = X.shape[0]
    num_train = int(0.80 * num_examples)
    num_val = int((num_examples-num_train)/2)
    X_train, y_train = X[:num_train], labels[:num_train]
    class_weights = torch.from_numpy(compute_class_weight('balanced', classes = np.unique(y_train), y=y_train.cpu().numpy())).to(dtype=torch.float32).to(device)
    X_val, y_val = X[num_train:(num_train+num_val)], labels[num_train : (num_train+num_val)]
    X_test, y_test = X[(num_train+num_val) : (num_train+2*num_val)], labels[(num_train+num_val):(num_train+2*num_val)]
    train_dataset = TensorDataset(X_train, y_train)
    trainloader = DataLoader(train_dataset, batch_size=params['batch_size'])

    val_dataset = TensorDataset(X_val, y_val)
    val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])

    test_dataset = TensorDataset(X_test, y_test)
    testloader = DataLoader(test_dataset, batch_size=params['batch_size'])
    # NOTE(review): 6 output scores assumes every label is in 0..5 — confirm against the label encoding.
    model = MLP(X_train.shape[1], params['hidden_dim'], 6).to(device)
    train(model, params, trainloader, class_weights)

    val_seed_acc.append(evaluate(val_loader, model))
    test_seed_acc.append(evaluate(testloader, model))

    val_preds, val_labels = predict(val_loader, model)
    val_preds = val_preds.cpu()
    val_labels = val_labels.cpu()
    test_preds, test_labels = predict(testloader, model)
    test_preds = test_preds.cpu()
    test_labels = test_labels.cpu()

    # Predicted class = index of the highest score.
    _, one_hot_val_preds = torch.max(val_preds, 1)
    _, one_hot_test_preds = torch.max(test_preds, 1)
    val_seed_f1.append(f1_score(val_labels, one_hot_val_preds, average='weighted'))
    test_seed_f1.append(f1_score(test_labels, one_hot_test_preds, average='weighted'))

    # roc_auc_score needs per-class probabilities, hence the softmax.
    prob_val = nn.Softmax(dim=1)(val_preds)
    prob_test = nn.Softmax(dim=1)(test_preds)
    val_seed_auc.append(roc_auc_score(val_labels, prob_val, multi_class="ovo", average="weighted"))
    test_seed_auc.append(roc_auc_score(test_labels, prob_test, multi_class="ovo", average="weighted"))

  # Mean and spread of every metric across the seeds
  validation_acc = np.mean(val_seed_acc)
  test_acc = np.mean(test_seed_acc)

  validation_f1 = np.mean(val_seed_f1)
  test_f1 = np.mean(test_seed_f1)

  validation_auc = np.mean(val_seed_auc)
  test_auc = np.mean(test_seed_auc)
  print("Performances for combination", p, "\n")
  print("Mean val acc :", validation_acc, "+-", np.std(val_seed_acc))
  print("F1-score val acc :", validation_f1, "+-", np.std(val_seed_f1))

  print("Mean test acc :", test_acc, "+-", np.std(test_seed_acc))
  print("F1-score test acc :", test_f1, "+-", np.std(test_seed_f1))

  print("Mean val AUC:", validation_auc, "+-", np.std(val_seed_auc))
  print("Mean test AUC:", test_auc, "+-", np.std(test_seed_auc), "\n")

tac = time.time()
print("Grid search done after", tac - tic)

Performances for combination 0 

Mean val acc : 0.7673333333333333 +- 0.016336734339790476
F1-score val acc : 0.7568226030987794 +- 0.01728311060612912
Mean test acc : 0.765 +- 0.016309506430300102
F1-score test acc : 0.750491897153062 +- 0.011696155538374973
Mean val AUC: 0.7077846814644148 +- 0.007641971705947499
Mean test AUC: 0.7026848016648742 +- 0.003939586463223441 

Performances for combination 1 

Mean val acc : 0.7000000000000001 +- 0.022759613353482054
F1-score val acc : 0.7038190983546223 +- 0.024817617665963604
Mean test acc : 0.6916666666666668 +- 0.024957742063113155
F1-score test acc : 0.7064710288277379 +- 0.019314017655419426
Mean val AUC: 0.6756504573047463 +- 0.031185920245175937
Mean test AUC: 0.6858500289684829 +- 0.022337543956296464 

Performances for combination 2 

Mean val acc : 0.33066666666666666 +- 0.03308910528994232
F1-score val acc : 0.4342316087651759 +- 0.03450807131931555
Mean test acc : 0.33066666666666666 +- 0.028015868519267597
F1-score test acc :

## RandomForest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: one grid search per seed, scored on the held-out test slice.
random_states = [0,1,2]

accuracy = []
F1_Score=[]
AUC=[]

tic = time.time()

for r in random_states :

  print("Random state : ", r, "\n")
  X, labels = shuffle(dataset[0].numpy(), dataset[1].numpy(), random_state=r)
  np.random.seed(r)
  random.seed(r)

  # 80 % train; the test slice is the second of the two equally sized held-out slices.
  num_examples = np.shape(X)[0]
  num_train = int(0.80 * num_examples)
  num_val = int((num_examples-num_train)/2)
  test_slice = slice(num_train + num_val, num_train + 2 * num_val)
  X_train, y_train = X[:num_train], labels[:num_train]
  X_test, y_test = X[test_slice], labels[test_slice]

  forest = RandomForestClassifier(random_state = r, class_weight="balanced")
  pipeline = Pipeline([('clf', forest)])

  params = {'clf__n_estimators': [200, 500], 'clf__max_depth':[50, 20]}
  rskf = StratifiedKFold(n_splits=3, random_state=r, shuffle=True)

  # 3-fold stratified search over the forest size and depth, selected on accuracy.
  cv = GridSearchCV(pipeline, params, cv = rskf, scoring = 'accuracy')
  cv.fit(X_train, y_train)

  preds_proba = cv.predict_proba(X_test)
  preds = cv.predict(X_test)

  accuracy.append(accuracy_score(y_test, preds))
  F1_Score.append(f1_score(y_test, preds, average='weighted'))
  AUC.append(roc_auc_score(y_test,preds_proba,multi_class="ovo",average="weighted"))

# Mean and standard deviation of each metric over the seeds.
print("Accuracy : ",np.mean(accuracy),"with std",np.std(accuracy))
print("f1_score : ",np.mean(F1_Score),"with std",np.std(F1_Score))
print("AUC : ",np.mean(AUC),"with std",np.std(AUC))
tac = time.time()
print('Finished in', tac-tic, 'seconds')

Random state :  0 

Random state :  1 

Random state :  2 

Accuracy :  0.7119999999999999 with std 0.0014142135623730961
f1_score :  0.6900679124886073 with std 0.0028305514232209524
AUC :  0.6835419164144985 with std 0.014798742444330985
Finished in 16106.7343044281 seconds


## LSTM (not enough RAM)

In [None]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer applied to every output step."""

    def __init__(self, params, embedding_dim=768, n_classes = 7):
        """Create the layers.

        Args:
            params: dict with keys 'hidden_dim', 'num_layers', 'dropout' and
                'bidirectional', forwarded to `nn.LSTM`.
            embedding_dim: size of each input vector.
            n_classes: number of output scores per step.
        """
        super(LSTM, self).__init__()
        self.hidden_dim = params['hidden_dim']
        self.lstm = nn.LSTM(embedding_dim, params['hidden_dim'], num_layers = params['num_layers'], dropout = params['dropout'], bidirectional = params['bidirectional'])
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        num_directions = 2 if params['bidirectional'] else 1
        self.linear = nn.Linear(params['hidden_dim'] * num_directions, n_classes)

    def forward(self, sentence):
        """Return class scores for every step of `sentence`.

        `sentence` is fed to `nn.LSTM` as-is (the layer is created with its
        default input layout); the linear layer only changes the last dimension.
        """
        h, _ = self.lstm(sentence)
        h = self.linear(h)
        return h

In [None]:
def train(model, params, trainloader, class_weights=None) :
  """Train `model` in place with a cross-entropy loss.

  Args:
    model: the network to optimise.
    params: dict with keys 'optimizer' (optimizer class), 'learning_rate'
      and 'n_epochs'.
    trainloader: iterable of (inputs, labels) batches.
    class_weights: optional per-class weight tensor for the loss. Accepting it
      keeps this definition call-compatible with the four-argument `train`
      it replaces when this cell is run.
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')
  # The optimizer class comes from the hyper-parameters, not from the model.
  optimizer = params['optimizer'](model.parameters(), lr=params['learning_rate'])

  model.train()
  for epoch in range(params['n_epochs']):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
      inputs = inputs.to(device)
      labels = labels.to(device)
      optimizer.zero_grad()
      outputs = model(inputs)
      loss = F.cross_entropy(outputs, labels, weight=class_weights)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      if i % 2000 == 1999:    # print every 2000 mini-batches
        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        running_loss = 0.0

In [None]:
def evaluate(testloader, model):
  """Return the accuracy of `model` over every batch of `testloader`.

  The model is switched to evaluation mode for the duration of the call (so
  dropout and batch-norm behave deterministically) and restored afterwards.

  Args:
    testloader: iterable of (inputs, labels) batches.
    model: network whose output has one score per class along dimension 1.

  Returns:
    Fraction of samples whose arg-max score equals the label.

  Raises:
    ZeroDivisionError: if `testloader` yields no sample.
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  was_training = model.training
  model.eval()
  num_correct = 0
  num_tests = 0
  try:
    with torch.no_grad():
      for batched_graph, labels in testloader:
        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        pred = model(batched_graph)
        num_correct += (pred.argmax(1) == labels).sum().item()
        num_tests += len(labels)
  finally:
    model.train(was_training)

  return num_correct / num_tests

In [None]:
def predict(testloader, model):
  """Run `model` over every batch of `testloader` and collect outputs and labels.

  The model is switched to evaluation mode for the duration of the call and
  restored afterwards.

  Args:
    testloader: iterable of (inputs, labels) batches.
    model: network to evaluate.

  Returns:
    (preds, true_labels): the model outputs of all batches concatenated along
    dimension 0 (left on the model's device), and the matching labels
    concatenated along dimension 0.

  Raises:
    RuntimeError: if `testloader` yields no batch (nothing to concatenate).
  """
  # Batches are moved to wherever the model lives (CPU for a parameter-free model).
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  was_training = model.training
  model.eval()
  preds = []
  true_labels = []
  try:
    with torch.no_grad():
      for batched_graph, labels in testloader:
        true_labels.append(labels)
        # Store the batch output itself (the list used to be appended to itself).
        pred = model(batched_graph.to(device))
        preds.append(pred)
  finally:
    model.train(was_training)

  preds = torch.cat(preds, dim=0)
  true_labels = torch.cat(true_labels, dim=0)
  return preds, true_labels