<a href="https://colab.research.google.com/github/yrlin411/2022_AI-FinalProject/blob/main/bertcopy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
   

In [None]:
import numpy as np # for linear algebra
import pandas as pd # for data processing
import seaborn as sns # for visualizing
import matplotlib.pyplot as plt # for visualizing
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig, AdamW, BertForSequenceClassification ,get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score ,matthews_corrcoef
from tqdm.notebook import tqdm, trange, tnrange, tqdm_notebook
import random
import os
import io
from babel.dates import format_time
import time
% matplotlib inline

In [None]:
# One seed shared by every random number generator this notebook touches.
SEED = 19

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Batches are moved onto `device` later, inside the training loop.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed PyTorch, NumPy and Python's own generator (the three calls are independent).
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if device.type == "cuda":
    # Also seed the CUDA generators of every visible GPU.
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Each split is read as a ';'-delimited file with no header row; the two columns are
# named Text and Emotion.
data_train, data_test, data_val = (
    pd.read_csv(split_file, names=['Text', 'Emotion'], delimiter=';')
    for split_file in ('train.txt', 'test.txt', 'val.txt')
)

# Stack the three splits (train, then test, then val) into a single frame.
DATA = pd.concat([data_train, data_test, data_val])

In [None]:
# Map every emotion name to an integer class id, stored in a new column.
labelencoder = LabelEncoder()
DATA['EmoLabel'] = labelencoder.fit_transform(DATA['Emotion'])

texts = DATA.Text.values
emos = DATA.EmoLabel.values

# Encode each text as a fixed-length list of BERT token ids.
# truncation=True is what actually enforces max_length: without it a text longer than
# 256 tokens yields a longer row, and the ragged lists cannot be turned into one tensor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
encoded = [
    tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    for text in texts
]

# Padding attention mask: 1.0 where the token id is positive, 0.0 where it is 0 (padding).
mask = [[float(token_id > 0) for token_id in ids] for ids in encoded]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Split token ids, attention masks and labels in ONE call so the three always share the
# same shuffled indices. (Splitting the masks in a second call only stayed aligned because
# random_state and test_size happened to be repeated.)
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    encoded, mask, emos, random_state=41, test_size=0.1)

# Convert everything to tensors.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# DataLoaders yield (input ids, attention mask, label) batches of 32 lazily.
# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_index = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_index, batch_size=32)

# Validation gains nothing from shuffling, so iterate it in a fixed order; this also keeps
# evaluation from drawing on the random number generator between training epochs.
valid_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
valid_index = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_index, batch_size=32)

In [None]:
# Hyper-parameters: number of epochs and the optimizer's learning rate.
epoch = 3
lr = 2e-5

# BertForSequenceClassification: the pretrained BERT model with a single linear
# classification layer on top, configured here for 6 output labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model = model.to(device)
model.zero_grad()

# The schedule is expressed in optimizer steps, i.e. batches.
steps_per_epoch = len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1 * steps_per_epoch,      # warm up over 10% of ONE epoch's batches
    num_training_steps=epoch * steps_per_epoch,  # total steps across all epochs
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
def format_elapsed(seconds):
  """Format a duration in seconds as an ``h:mm:ss`` string.

  The previously used ``babel.dates.format_time`` renders a time of day, which is why
  elapsed times used to print as clock times such as "12:00:47 AM".

  Args:
    seconds: Elapsed time in seconds (int or float); rounded to the nearest second.

  Returns:
    A string such as ``"0:12:07"``.
  """
  total_seconds = int(round(seconds))
  hours, remainder = divmod(total_seconds, 3600)
  minutes, secs = divmod(remainder, 60)
  return f"{hours}:{minutes:02d}:{secs:02d}"


train_loss_set = []  # average training loss, one entry per epoch
learning_rate = []   # learning rate of every param group, recorded at the end of each epoch

# trange is tqdm's progress-bar wrapper around range (tnrange is its deprecated alias).
for epoch_idx in trange(1, epoch + 1, desc='Epoch'):
  print(" [ " + f" Epoch {epoch_idx} " + " ] \n" + "\nTraining!\n")

  batch_loss = 0
  t0 = time.time()

  # ---- Train ----
  # The validation phase below switches to eval mode, so training mode is re-enabled
  # once at the start of every epoch.
  model.train()

  for step, batch in enumerate(train_dataloader):
    # Progress report every 40 batches.
    if step % 40 == 0 and step != 0:
      elapsed = format_elapsed(time.time() - t0)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

    inputBatch, maskBatch, labelBatch = (t.to(device) for t in batch)

    # Forward pass; with labels supplied, the first output element is used as the loss.
    outputs = model(inputBatch, token_type_ids=None, attention_mask=maskBatch, labels=labelBatch)
    loss = outputs[0]

    loss.backward()  # Backward pass
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip the gradient norm to 1.0

    optimizer.step()       # Update parameters with the computed gradients
    scheduler.step()       # Advance the learning-rate schedule
    optimizer.zero_grad()  # Clear the accumulated gradients

    batch_loss += loss.item()

  avg_train_loss = batch_loss / len(train_dataloader)
  training_time = format_elapsed(time.time() - t0)

  for param_group in optimizer.param_groups:
    print("\nCurrent Learning rate: ", param_group['lr'])
    learning_rate.append(param_group['lr'])

  train_loss_set.append(avg_train_loss)
  print(f'\nAverage Training loss: {avg_train_loss}')
  print(f"\nTraining epoch took: {training_time}")

  # ---- Validation ----
  model.eval()
  print("\nValidating!\n")

  # Count correct predictions over all samples. Averaging per-batch accuracies instead
  # would give a smaller final batch the same weight as a full one.
  correct = 0
  total = 0

  for vbatch in valid_dataloader:
    vInputBatch, vMaskBatch, vLabelBatch = (v.to(device) for v in vbatch)

    with torch.no_grad():  # no gradients needed: saves memory and speeds up validation
      logits = model(vInputBatch, token_type_ids=None, attention_mask=vMaskBatch)[0]

    predictions = torch.argmax(logits, dim=1)
    correct += (predictions == vLabelBatch).sum().item()
    total += vLabelBatch.numel()

  print(f'\nValidation Accuracy: {correct / total}\n')

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

 [  Epoch 1  ] 

Training!

  Batch    40  of    563.    Elapsed: 12:00:47 AM.
  Batch    80  of    563.    Elapsed: 12:01:35 AM.
  Batch   120  of    563.    Elapsed: 12:02:25 AM.
  Batch   160  of    563.    Elapsed: 12:03:16 AM.
  Batch   200  of    563.    Elapsed: 12:04:06 AM.
  Batch   240  of    563.    Elapsed: 12:04:56 AM.
  Batch   280  of    563.    Elapsed: 12:05:47 AM.
  Batch   320  of    563.    Elapsed: 12:06:37 AM.
  Batch   360  of    563.    Elapsed: 12:07:27 AM.
  Batch   400  of    563.    Elapsed: 12:08:18 AM.
  Batch   440  of    563.    Elapsed: 12:09:08 AM.
  Batch   480  of    563.    Elapsed: 12:09:58 AM.
  Batch   520  of    563.    Elapsed: 12:10:49 AM.
  Batch   560  of    563.    Elapsed: 12:11:39 AM.

Current Learning rate:  1.3793103448275862e-05

Average Training loss: 0.40219176030299275

Training epoch took: 12:11:42 AM

Validating!


Validation Accuracy: 0.9330357142857143

 [  Epoch 2  ] 

Training!

  Batch    40  of    563.    Elapsed: 12:00:50 A

In [None]:
# Emotion name -> integer class id. The ids follow the alphabetical order of the names.
# NOTE(review): presumably mirrors the ids produced by the LabelEncoder above —
# confirm against labelencoder.classes_.
emo2label = {
  emotion: class_id
  for class_id, emotion in enumerate(
    ("anger", "fear", "joy", "love", "sadness", "surprise")
  )
}

# Inverse lookup: integer class id -> emotion name.
label2emo = {class_id: emotion for emotion, class_id in emo2label.items()}

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# Everything is stored under a single Google Drive folder.
save_root = '/content/gdrive/MyDrive/fuv=ck'
# Sub-folder names are relative to save_root. model_save_folder used to hold the absolute
# root path, so joining it onto the root doubled the directory.
model_save_folder = 'model/'
tokenizer_save_folder = 'tokenizer/'

path_model = os.path.join(save_root, model_save_folder)
path_tokenizer = os.path.join(save_root, tokenizer_save_folder)

# Create the target directories (no shell call, so the path is never re-parsed by a shell).
os.makedirs(path_model, exist_ok=True)
os.makedirs(path_tokenizer, exist_ok=True)

# Save the model and tokenizer in the Hugging Face directory format.
model.save_pretrained(path_model)
tokenizer.save_pretrained(path_tokenizer)

# Additionally pickle the whole model object to a single file; the next cell reloads it
# from `path`. path_model keeps pointing at the model directory and is no longer
# overwritten with this file path.
# NOTE(review): torch.save(model) pickles the full object, so loading it requires the same
# class definitions to be importable.
model_save_name = 'fineTuneModel.pt'
path = os.path.join(save_root, model_save_name)
torch.save(model, path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Reload the pickled fine-tuned model from `path`, mapping its tensors onto the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints you created yourself.
m = torch.load(path, map_location=torch.device('cpu'))
# Switch to inference mode; as the cell's last expression this also displays the module tree.
m.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import math
def logit2prob(logit):
  """Convert a logit (log-odds) into a probability with the logistic function.

  Only ever exponentiates a non-positive number, so large positive logits return 1.0
  instead of raising OverflowError from ``math.exp``.

  Args:
    logit: Real-valued log-odds.

  Returns:
    The probability ``exp(logit) / (1 + exp(logit))`` as a float in [0, 1].
  """
  if logit >= 0:
    # Equivalent form 1 / (1 + exp(-logit)); exp(-logit) <= 1 here and cannot overflow.
    return 1.0 / (1.0 + math.exp(-logit))
  odds = math.exp(logit)  # logit < 0, so odds < 1
  return odds / (1.0 + odds)

In [None]:
def softmax(inputs, axis=None):
    """Numerically stable softmax.

    Args:
        inputs: Array-like of scores.
        axis: Axis along which to normalise. The default ``None`` normalises over ALL
            elements at once; pass e.g. ``axis=-1`` to normalise each row of a batch
            independently.

    Returns:
        Array with the same shape as ``inputs`` whose entries sum to 1 along ``axis``.
    """
    scores = np.asarray(inputs)
    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing.
    exps = np.exp(scores - np.max(scores, axis=axis, keepdims=True))
    return exps / np.sum(exps, axis=axis, keepdims=True)

from torch.nn import functional as F
def BERT(sentence):
  """Classify the emotion of ``sentence`` with the reloaded model ``m``.

  Prints the probability row, then each class probability on its own line, then the
  predicted emotion name.

  Args:
    sentence: Text to classify.

  Returns:
    The predicted emotion name, looked up in ``label2emo``.
  """
  # Cap the input at 256 tokens, the max_length used when the training data was encoded.
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
  with torch.no_grad():
    output = m(**inputs)

  logits = np.array(output['logits'].tolist())
  outputs = softmax(logits)

  print(outputs)
  for prob in outputs[0]:
    print(prob)

  # Index of the first largest probability; always a valid key of label2emo, unlike the
  # previous out-of-range sentinel value of 6.
  maxIndex = int(np.argmax(outputs[0]))
  moodString = label2emo[maxIndex]
  print(moodString)
  return moodString

# The trailing ';' stops the notebook from echoing the returned label a second time.
BERT("Jackson had never been so happy, and at ease, and she loved Connor with all her heart, yet she was a little sad, feeling they were growing apart.");

[[3.82672863e-04 1.97587691e-04 4.78089231e-04 6.52456410e-04
  9.98162096e-01 1.27097509e-04]]
0.000382672862758786
0.00019758769145480153
0.00047808923104360017
0.0006524564099355605
0.9981620962956802
0.00012709750912711322
sadness
