In [None]:
# Install the third-party packages imported further down in this notebook
# (transformers, datasets, torch). No versions are pinned here.
!pip install transformers
!pip install datasets
!pip3 install torch

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 9.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 62.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 51.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import sys
import string
import json

In [None]:
import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertPreTrainedModel, BertConfig
import torch
from torch import nn
from sklearn.metrics import accuracy_score,classification_report

In [None]:
from transformers import AdamW
from transformers import get_scheduler
from datasets import load_metric
from tqdm.notebook import tqdm

In [None]:
# Relative paths of the corrected ISEAR CSV files, one per split.
train_path, val_path, test_path = (
    './../datasets/Emotion_classification_ISEAR/corrected_isear-%s.csv' % split
    for split in ('train', 'val', 'test')
)

In [None]:
def read_data(data):
    """Load one header-less CSV split into a two-column DataFrame.

    The file is parsed with the column names ``Label``, ``Text`` and ``NaN``;
    the ``NaN`` column is dropped before returning. Malformed rows do not
    abort the load.

    Args:
        data: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``Label`` and ``Text``.
    """
    import inspect

    # ``error_bad_lines`` was replaced by ``on_bad_lines`` in newer pandas
    # releases and later removed, so pick whichever keyword this pandas
    # version understands. 'warn' skips malformed rows but reports them.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        bad_line_kwargs = {'on_bad_lines': 'warn'}
    else:
        bad_line_kwargs = {'error_bad_lines': False}

    df = pd.read_csv(data, names=['Label', 'Text', 'NaN'], **bad_line_kwargs)
    # Return a new frame instead of mutating in place.
    return df.drop(columns=['NaN'])

In [None]:
# Read the three splits in train / val / test order.
train, val, test = (
    read_data(path) for path in (train_path, val_path, test_path)
)

In [None]:
def transform_label(data):
    """Convert an emotion name into its integer class id.

    Args:
        data: Emotion name, e.g. ``'joy'``.

    Returns:
        The class id (0-6) for a known emotion, or ``-1`` for any other value.
    """
    mapping = {'joy': 0, 'fear': 1, 'shame': 2, 'disgust': 3, 'sadness': 4, 'anger': 5, 'guilt':6}
    # Direct dictionary lookup; -1 marks labels that are not in the mapping.
    return mapping.get(data, -1)

In [None]:
def clean_data(data):
    """Replace the emotion names in ``data['Label']`` with integer class ids.

    Every value of the ``Label`` column is passed through ``transform_label``.
    The frame is modified in place and also returned, so running this twice
    on the same frame maps the already-converted ids through
    ``transform_label`` again.

    Args:
        data: DataFrame with a ``Label`` column.

    Returns:
        The same DataFrame object with ``Label`` converted.
    """
    # Map the single column directly instead of building a Series per row
    # with DataFrame.apply(axis=1).
    data['Label'] = data['Label'].map(transform_label)
    return data

In [None]:
# Convert the labels of every split, keeping train / val / test order.
train, val, test = map(clean_data, (train, val, test))

In [None]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def create_input_id(data):
    """Return the longest encoded length found in ``data['Text']``.

    Each text is encoded with the notebook-level ``tokenizer`` (special
    tokens included) and the largest number of ids is returned; the result
    is 0 when there are no texts.
    """
    token_counts = (
        len(tokenizer.encode(sentence, add_special_tokens=True))
        for sentence in data['Text']
    )
    return max(token_counts, default=0)

In [None]:
# Longest encoded sequence of each split, in train / val / test order.
(maximum_length_generated_train,
 maximum_length_generated_val,
 maximum_length_generated_test) = map(create_input_id, (train, val, test))

In [None]:
# Display the three maximum lengths (train, val, test) as the cell output.
(
    maximum_length_generated_train,
    maximum_length_generated_val,
    maximum_length_generated_test,
)

(195, 188, 141)

In [None]:
def preprocessing_bert(data, max_len):
    """Encode the texts of one split into fixed-length model inputs.

    Uses the notebook-level ``tokenizer``. Every text is padded to
    ``max_len`` ids and, if it is longer, truncated to ``max_len`` so that
    all rows have the same length and can be stacked into one tensor.

    Args:
        data: DataFrame with the columns ``Text`` and ``Label``.
        max_len: Number of ids per encoded text.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids, labels)``.
        The first three are the tensors returned by the tokenizer
        (``return_tensors='pt'``); ``labels`` is ``data['Label'].values``.
    """
    text = data['Text'].tolist()
    emotion = data['Label'].values
    encode = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=True,
        # 'max_length' padding already pads every row to max_len, so the
        # deprecated pad_to_max_length flag is not needed.
        padding='max_length',
        # Without truncation an over-long text would yield rows of
        # different lengths, which cannot be returned as one tensor.
        truncation=True,
        max_length=max_len,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    updated_input_id = encode['input_ids']
    attention_masks = encode['attention_mask']
    token_type_ids = encode['token_type_ids']

    return updated_input_id, attention_masks, token_type_ids, emotion

In [None]:
# Encode each split, padding to that split's own maximum length.
(updated_input_id_train,
 attention_masks_train,
 token_type_ids_train,
 emotion_train) = preprocessing_bert(train, maximum_length_generated_train)

(updated_input_id_val,
 attention_masks_val,
 token_type_ids_val,
 emotion_val) = preprocessing_bert(val, maximum_length_generated_val)

(updated_input_id_test,
 attention_masks_test,
 token_type_ids_test,
 emotion_test) = preprocessing_bert(test, maximum_length_generated_test)

In [None]:
class DataExploration:
    """Indexable container pairing encoded inputs with their labels.

    Implements ``__len__`` and ``__getitem__`` so instances can be handed to
    ``torch.utils.data.DataLoader``.
    """

    def __init__(self, updated_input_ids, attention_masks, token_type_ids, emotion, is_test=False):
        """Store the encoded inputs and labels.

        ``is_test`` is accepted but not used.
        """
        self.tokens, self.masks = updated_input_ids, attention_masks
        self.token_type_ids, self.labels = token_type_ids, emotion

    def __len__(self):
        """Number of examples, taken from the first dimension of the input ids."""
        n_examples = self.tokens.shape[0]
        return n_examples

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of model keyword arguments."""
        # The label gets a leading dimension added via unsqueeze(0).
        label = torch.unsqueeze(torch.tensor(self.labels[idx]), 0)
        return dict(
            input_ids=self.tokens[idx],
            attention_mask=self.masks[idx],
            token_type_ids=self.token_type_ids[idx],
            labels=label,
        )

In [None]:
# Wrap the encoded tensors of each split in an indexable dataset object.
data_train = DataExploration(
    updated_input_id_train, attention_masks_train, token_type_ids_train, emotion_train
)
data_val = DataExploration(
    updated_input_id_val, attention_masks_val, token_type_ids_val, emotion_val
)
data_test = DataExploration(
    updated_input_id_test, attention_masks_test, token_type_ids_test, emotion_test
)

# Batch every split in groups of 16 with shuffling enabled.
data_loader_train = torch.utils.data.DataLoader(dataset=data_train, batch_size=16, shuffle=True)
data_loader_val = torch.utils.data.DataLoader(dataset=data_val, batch_size=16, shuffle=True)
data_loader_test = torch.utils.data.DataLoader(dataset=data_test, batch_size=16, shuffle=True)

In [None]:
# Sequence classifier initialised from 'bert-base-uncased' with 7 output
# labels (one per emotion id produced by transform_label).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=7,
    output_attentions=False,
    output_hidden_states=False,
)

# Number of passes over the training data; read by get_params and train.
num_epochs = 3

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);  # trailing ';' suppresses the module repr in the cell output

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
def get_params(data_loader):
    """Create the step count, optimizer and LR schedule for ``data_loader``.

    The step count is ``num_epochs`` times the number of batches in
    ``data_loader``. The optimizer is an ``AdamW`` over the parameters of the
    notebook-level ``model`` with learning rate 5e-5, and the schedule is the
    "linear" one from ``get_scheduler`` with no warm-up steps.

    Returns:
        ``(num_training_steps, optimizer, lr_scheduler)``.
    """
    total_steps = len(data_loader) * num_epochs
    adamw = AdamW(model.parameters(), lr=5e-5)
    schedule = get_scheduler(
        "linear",
        optimizer=adamw,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    return total_steps, adamw, schedule

In [None]:
# One (step count, optimizer, scheduler) triple per data loader.
(num_training_steps_train,
 optimizer_train,
 lr_scheduler_train) = get_params(data_loader_train)

(num_training_steps_val,
 optimizer_val,
 lr_scheduler_val) = get_params(data_loader_val)

(num_training_steps_test,
 optimizer_test,
 lr_scheduler_test) = get_params(data_loader_test)

In [None]:
def train(num_training_steps, data_loader, optimizer, lr_scheduler):
    """Fine-tune the notebook-level ``model`` and print validation accuracy.

    Runs ``num_epochs`` passes over ``data_loader``. On every 100th batch of
    an epoch (starting with the first) the model is evaluated on
    ``data_loader_val`` and the accuracy is printed.

    Note: defining this function rebinds the notebook-level name ``train``,
    which earlier cells used for the training DataFrame.

    Args:
        num_training_steps: Length of the progress bar.
        data_loader: Iterable of dict batches whose values are tensors and
            whose keys are passed to the model as keyword arguments.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Learning-rate scheduler stepped once per batch.
    """
    progress_bar = tqdm(range(num_training_steps))
    metric = load_metric("accuracy")
    for epoch in range(num_epochs):
        progress_bar.set_description("Processing %s" % epoch)
        model.train()
        for bid, batch in enumerate(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)
            if bid % 100 == 0:
                model.eval()
                # Separate loop variable so the training batch is not shadowed.
                for val_batch in data_loader_val:
                    val_batch = {k: v.to(device) for k, v in val_batch.items()}
                    with torch.no_grad():
                        val_outputs = model(**val_batch)
                    predictions = torch.argmax(val_outputs.logits, dim=-1)
                    metric.add_batch(predictions=predictions,
                                     references=val_batch["labels"].view(-1))
                print(metric.compute())
                # Switch back to training mode; otherwise the remaining
                # batches of this epoch would be trained in eval mode.
                model.train()
    progress_bar.close()

In [None]:
# Fine-tune on the training split using its own optimizer and schedule.
train(
    num_training_steps=num_training_steps_train,
    data_loader=data_loader_train,
    optimizer=optimizer_train,
    lr_scheduler=lr_scheduler_train,
)

  0%|          | 0/1005 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

{'accuracy': 0.14547038327526132}
{'accuracy': 0.6114982578397212}
{'accuracy': 0.6681184668989547}
{'accuracy': 0.6829268292682927}
{'accuracy': 0.6777003484320557}
{'accuracy': 0.6942508710801394}
{'accuracy': 0.6942508710801394}
{'accuracy': 0.6968641114982579}
{'accuracy': 0.7029616724738676}
{'accuracy': 0.705574912891986}
{'accuracy': 0.7125435540069687}
{'accuracy': 0.7064459930313589}


In [None]:
def predict(num_training_steps, data_loader, optimizer, lr_scheduler):
    """Print the accuracy of the notebook-level ``model`` on ``data_loader``.

    Only ``data_loader`` is used; ``num_training_steps``, ``optimizer`` and
    ``lr_scheduler`` are accepted but ignored.
    """
    accuracy = load_metric("accuracy")
    model.eval()
    for raw_batch in data_loader:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        # Forward pass without gradient tracking.
        with torch.no_grad():
            result = model(**inputs)
        predicted = result.logits.argmax(dim=-1)
        accuracy.add_batch(predictions=predicted, references=inputs["labels"].view(-1))
    print(accuracy.compute())

In [None]:
# Report accuracy on the test split.
predict(
    num_training_steps=num_training_steps_test,
    data_loader=data_loader_test,
    optimizer=optimizer_test,
    lr_scheduler=lr_scheduler_test,
)

{'accuracy': 0.7029616724738676}


In [None]:
# Write the whole fine-tuned model object to a file in the working directory.
torch.save(obj=model, f='emotion_classifier_isear')