In [55]:
import pandas as pd
import numpy as np
import random
import torch

import os


In [56]:
datasets_path = os.path.join(os.getcwd(),"datasets")

df = pd.DataFrame()

for dataset in os.listdir(datasets_path):
    path = os.path.join(datasets_path,dataset)
    df = pd.concat([df,pd.read_csv(path)])


In [57]:
df = df[["Clip_Name","text","Label","Use"]]
df = df.dropna()
df

Unnamed: 0,Clip_Name,text,Label,Use
0,Ses02F_impro01_F000,"Hi. Excuse me. Um, I'd like to put in this app...",Neutral,train
1,Ses02F_impro01_F018,"Well, why didn't the D.M.V. put that you neede...",Negative,test
2,Ses02F_impro01_F019,"Yeah, but your birth certificate--I mean, who ...",Negative,train
3,Ses02F_impro01_F020,With your driver's license and your passport. ...,Negative,train
4,Ses02F_impro01_F021,Who--you always use your driver's license. I m...,Negative,train
...,...,...,...,...
580,Ses03M_script03_2_M040,Turn it--Turn it off.,Negative,train
581,Ses03M_script03_2_M041,Very amusing indeed.,Negative,train
582,Ses03M_script03_2_M042,"You know what? You're a vile, little, evil-min...",Negative,train
583,Ses03M_script03_2_M043,You're not going nowhere. No you're not.,Negative,train


In [58]:
labelEncoder = {"Negative" : 0, "Neutral" : 1, "Positive" : 2}
df["Label"] = [labelEncoder[label] for label in df["Label"]]

In [59]:
df

Unnamed: 0,Clip_Name,text,Label,Use
0,Ses02F_impro01_F000,"Hi. Excuse me. Um, I'd like to put in this app...",1,train
1,Ses02F_impro01_F018,"Well, why didn't the D.M.V. put that you neede...",0,test
2,Ses02F_impro01_F019,"Yeah, but your birth certificate--I mean, who ...",0,train
3,Ses02F_impro01_F020,With your driver's license and your passport. ...,0,train
4,Ses02F_impro01_F021,Who--you always use your driver's license. I m...,0,train
...,...,...,...,...
580,Ses03M_script03_2_M040,Turn it--Turn it off.,0,train
581,Ses03M_script03_2_M041,Very amusing indeed.,0,train
582,Ses03M_script03_2_M042,"You know what? You're a vile, little, evil-min...",0,train
583,Ses03M_script03_2_M043,You're not going nowhere. No you're not.,0,train


In [60]:
SEED = 19

if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
else: 
    device = torch.device("cpu")

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

GeForce GTX 1080 Ti


<torch._C.Generator at 0x7f45ab4dac00>

In [61]:
torch.cuda.device_count()

1

In [62]:
from transformers import BertTokenizer

def bert_tokenization(df, maxLen):
    sentences = df['text']
    labels = df['Label'].tolist()
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
    input_ids =  [tokenizer.encode(sent, add_special_tokens=True,max_length=maxLen,pad_to_max_length=True,truncation=True) for sent in sentences]
    attention_mask = []
    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
    return input_ids, attention_masks, labels

In [63]:
maxLen = 0

for text in df['text']:
    if len(text.split()) > maxLen:
        maxLen = len(text.split())
print(maxLen)

84


In [64]:
train_data = df[df["Use"] == "train"]
val_data = df[df["Use"] == "validation"]
test_data = df[df["Use"] == "test"]

In [65]:
train_inputs, train_masks, train_labels = bert_tokenization(train_data,maxLen)
val_inputs, val_masks, val_labels = bert_tokenization(val_data,maxLen)
test_inputs, test_masks, test_labels = bert_tokenization(test_data,maxLen)



In [66]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

val_inputs = torch.tensor(val_inputs)
val_labels = torch.tensor(val_labels)
val_masks = torch.tensor(val_masks)

test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_masks)

In [68]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

val_data = TensorDataset(val_inputs,val_masks,val_labels)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data,sampler=val_sampler,batch_size=batch_size)

In [69]:
from transformers import BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(device)

# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [70]:
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef
from tqdm import tqdm, trange,tnrange,tqdm_notebook

In [72]:
## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

# tnrange is a tqdm wrapper around the normal python range
for _ in tnrange(1,epochs+1,desc='Epoch'):
  print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
  # Calculate total loss for this epoch
  batch_loss = 0

  for step, batch in enumerate(train_dataloader):
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    
    # Backward pass
    loss.backward()
    
    # Clip the norm of the gradients to 1.0
    # Gradient clipping is not in AdamW anymore
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    # Update learning rate schedule
    scheduler.step()

    # Clear the previous accumulated gradients
    optimizer.zero_grad()
    
    # Update tracking variables
    batch_loss += loss.item()

  # Calculate the average loss over the training data.
  avg_train_loss = batch_loss / len(train_dataloader)

  #store the current learning rate
  for param_group in optimizer.param_groups:
    print("\n\tCurrent Learning rate: ",param_group['lr'])
    learning_rate.append(param_group['lr'])
    
  train_loss_set.append(avg_train_loss)
  print(F'\n\tAverage Training loss: {avg_train_loss}')
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_accuracy,eval_mcc_accuracy,nb_eval_steps = 0, 0, 0

  # Evaluate data for one epoch
  for batch in val_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits[0].to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    
    df_metrics=pd.DataFrame({'Epoch':epochs,'Actual_class':labels_flat,'Predicted_class':pred_flat})
    
    tmp_eval_accuracy = accuracy_score(labels_flat,pred_flat)
    tmp_eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)
    
    eval_accuracy += tmp_eval_accuracy
    eval_mcc_accuracy += tmp_eval_mcc_accuracy
    nb_eval_steps += 1

  print(F'\n\tValidation Accuracy: {eval_accuracy/nb_eval_steps}')
  print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy/nb_eval_steps}')

  if __name__ == '__main__':


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


	Current Learning rate:  1e-05

	Average Training loss: 0.7195086744991509

	Validation Accuracy: 0.6586174242424242

	Validation MCC Accuracy: 0.4321687470909374

	Current Learning rate:  5e-06

	Average Training loss: 0.40955423463035273

	Validation Accuracy: 0.6839488636363636

	Validation MCC Accuracy: 0.47594014900969

	Current Learning rate:  0.0

	Average Training loss: 0.23337434010731206

	Validation Accuracy: 0.6647727272727273

	Validation MCC Accuracy: 0.43963233776496513

	Current Learning rate:  0.0

	Average Training loss: 0.19314471370465047

	Validation Accuracy: 0.6624053030303031

	Validation MCC Accuracy: 0.445162445352343
