In [1]:
!pip install transformers
!pip install datasets
!pip3 install torch

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[?25l[K     |▏                               | 10 kB 35.6 MB/s eta 0:00:01[K     |▎                               | 20 kB 36.9 MB/s eta 0:00:01[K     |▍                               | 30 kB 41.5 MB/s eta 0:00:01[K     |▌                               | 40 kB 41.2 MB/s eta 0:00:01[K     |▋                               | 51 kB 43.2 MB/s eta 0:00:01[K     |▊                               | 61 kB 45.9 MB/s eta 0:00:01[K     |▉                               | 71 kB 36.7 MB/s eta 0:00:01[K     |█                               | 81 kB 31.6 MB/s eta 0:00:01[K     |█▏                              | 92 kB 33.2 MB/s eta 0:00:01[K     |█▎                              | 102 kB 30.8 MB/s eta 0:00:01[K     |█▍                              | 112 kB 30.8 MB/s eta 0:00:01[K     |█▌                              | 122 kB 30.8 MB/s eta 0:00:01[K     |█▋                              | 133 kB 30.8 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import sys
import string
import json
import spacy
sp = spacy.load('en_core_web_sm')

In [3]:
import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertPreTrainedModel, BertConfig
import torch
from torch import nn
from sklearn.metrics import accuracy_score,classification_report

In [4]:
from transformers import AdamW
from transformers import get_scheduler
from datasets import load_metric
from tqdm.notebook import tqdm

In [5]:
train_path = './../datasets/Emotion_classification_ISEAR/corrected_isear-train.csv'
val_path = './../datasets/Emotion_classification_ISEAR/corrected_isear-val.csv'
test_path = './../datasets/Emotion_classification_ISEAR/corrected_isear-test.csv'

In [6]:
def read_data(data):
    df = pd.read_csv(data, names=['Label', 'Text', 'NaN'], error_bad_lines=False)
    df.drop(columns=['NaN'], axis=1, inplace=True)
    return df

In [7]:
train = read_data(train_path)
val = read_data(val_path)
test = read_data(test_path)

In [8]:
val.Label.unique()

array(['joy', 'guilt', 'sadness', 'shame', 'fear', 'anger', 'disgust'],
      dtype=object)

In [9]:
train.Label.unique()

array(['joy', 'fear', 'shame', 'disgust', 'guilt', 'anger', 'sadness'],
      dtype=object)

In [10]:
test.Label.unique()

array(['anger', 'shame', 'guilt', 'disgust', 'joy', 'sadness', 'fear'],
      dtype=object)

In [11]:
len(train), len(val), len(test)

(5357, 1148, 1148)

In [12]:
def remove_puncts(data):
    new_data = re.sub(r'[^\w\s]', '', data)
    return new_data

In [13]:
def remove_nums(data):
    pattern = r'[0-9]'
    new_data = re.sub(pattern, '', data)
    return new_data

In [14]:
def remove_stop_words(data):
    all_stopwords = sp.Defaults.stop_words
    tokens = data.split(" ")
    tokens_filtered= [word for word in tokens if not word in all_stopwords]
    return (" ").join(tokens_filtered)

In [15]:
def remove_PRON(data):
    pattern = r'-PRON-'
    new_data = re.sub(pattern, '', data)
    return new_data

In [16]:
def lemmatize(data):
    new_string = ''
    doc = sp(data)
    for token in doc:
        new_string= new_string +" "+ str(token.lemma_)
    return new_string

In [17]:
def transform_label(data):
    mapping = {'joy': 0, 'fear': 1, 'shame': 2, 'disgust': 3, 'sadness': 4, 'anger': 5, 'guilt':6}
    for k,v in mapping.items():
        if k == data:
            return v
    return -1

In [18]:
def clean_data(data):
    data['Text'] = data['Text'].str.lower()
    data['Text'] = data.apply(lambda x: remove_nums(x['Text']), axis=1)
    data['Text'] = data.apply(lambda x: remove_puncts(x['Text']), axis=1)
    data['Text'] = data.apply(lambda x: remove_stop_words(x['Text']), axis=1)
    data['Text'] = data.apply(lambda x: lemmatize(x['Text']), axis=1)
    data['Text'] = data.apply(lambda x: remove_PRON(x['Text']), axis=1)
    data['Label'] = data.apply(lambda x: transform_label(x['Label']), axis = 1)
    return data

In [19]:
train = clean_data(train)
val = clean_data(val)
test = clean_data(test)

In [20]:
train.head(2)

Unnamed: 0,Label,Text
0,0,understood admit university
1,1,broke window neighbouring house fear mothers ...


In [21]:
len(train), len(val), len(test)

(5357, 1148, 1148)

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
def create_input_id(data) :
    max_len = 0
    for text in data['Text']:
        input_ids = tokenizer.encode(text, add_special_tokens=True)
        max_len = max(max_len, len(input_ids))
    return max_len

In [24]:
maximum_length_generated_train = create_input_id(train)
maximum_length_generated_val = create_input_id(val)
maximum_length_generated_test = create_input_id(test)

In [25]:
maximum_length_generated_train, maximum_length_generated_val, maximum_length_generated_test

(72, 67, 56)

In [26]:
def preprocessing_bert(data, max_len):
    text = data['Text'].values
    emotion = data['Label'].values
    encode = tokenizer.batch_encode_plus(text, add_special_tokens=True, 
                                       padding='max_length', max_length= max_len,
                                       pad_to_max_length=True, return_token_type_ids=True,
                                       return_attention_mask=True, return_tensors='pt')
    updated_input_id = encode['input_ids']
    attention_masks = encode['attention_mask']
    token_type_ids = encode['token_type_ids']

    return updated_input_id, attention_masks, token_type_ids, emotion

In [27]:
updated_input_id_train, attention_masks_train, token_type_ids_train, emotion_train = preprocessing_bert(train, 
                                             maximum_length_generated_train)

updated_input_id_val, attention_masks_val, token_type_ids_val, emotion_val = preprocessing_bert(val, 
                                             maximum_length_generated_val)

updated_input_id_test, attention_masks_test, token_type_ids_test, emotion_test = preprocessing_bert(test, 
                                             maximum_length_generated_test)

In [28]:
len(emotion_train), len(emotion_val), len(emotion_test)

(5357, 1148, 1148)

In [29]:
class DataExploration:
    def __init__(self, updated_input_ids, attention_masks, token_type_ids, emotion, is_test=False):
        self.tokens = updated_input_ids
        self.masks = attention_masks
        self.token_type_ids = token_type_ids
        self.labels = emotion

    def __len__(self):
        return self.tokens.shape[0]

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokens[idx],
            "attention_mask": self.masks[idx],
            "token_type_ids": self.token_type_ids[idx],
            "labels": torch.tensor(self.labels[idx]).unsqueeze(0)
        } 

In [30]:
data_train = DataExploration(updated_input_id_train, attention_masks_train, token_type_ids_train, emotion_train)
data_loader_train = torch.utils.data.DataLoader(data_train, batch_size=16, shuffle=True)

data_val = DataExploration(updated_input_id_val, attention_masks_val, token_type_ids_val, emotion_val)
data_loader_val = torch.utils.data.DataLoader(data_val, batch_size=16, shuffle=True)

data_test = DataExploration(updated_input_id_test, attention_masks_test, token_type_ids_test, emotion_test)
data_loader_test = torch.utils.data.DataLoader(data_test, batch_size=16, shuffle=True)

In [31]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=7,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.classifier
num_epochs= 3
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [32]:
def get_params(data_loader):
    num_training_steps = num_epochs * len(data_loader)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer,
      num_warmup_steps=0,num_training_steps=num_training_steps)

    return num_training_steps, optimizer, lr_scheduler

In [33]:
num_training_steps_train, optimizer_train, lr_scheduler_train = get_params(data_loader_train)
num_training_steps_val, optimizer_val, lr_scheduler_val = get_params(data_loader_val)
num_training_steps_test, optimizer_test, lr_scheduler_test = get_params(data_loader_test)

In [36]:
def train(num_training_steps, data_loader, optimizer, lr_scheduler):
    progress_bar = tqdm(range(num_training_steps))
    metric = load_metric("accuracy")
    for epoch in range(num_epochs):
        progress_bar.set_description("Processing %s" % epoch)
        model.train()
        for bid, batch in enumerate(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)
            if bid%100==0:
                model.eval()
                for batch in data_loader_val:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    with torch.no_grad():
                        outputs = model(**batch)
                    logits = outputs.logits
                    predictions = torch.argmax(logits, dim=-1)
                    metric.add_batch(predictions=predictions, references=batch["labels"].view(-1))
                print(metric.compute())
    progress_bar.close()

In [37]:
train(num_training_steps_train, data_loader_train, optimizer_train, lr_scheduler_train)

  0%|          | 0/1005 [00:00<?, ?it/s]

{'accuracy': 0.5905923344947736}
{'accuracy': 0.5958188153310104}
{'accuracy': 0.5932055749128919}
{'accuracy': 0.5966898954703833}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}
{'accuracy': 0.6019163763066202}


In [38]:
def predict(num_training_steps, data_loader, optimizer, lr_scheduler):
    metric = load_metric("accuracy")
    model.eval()
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"].view(-1))
    print(metric.compute())

In [39]:
predict(num_training_steps_test, data_loader_test, optimizer_test, lr_scheduler_test)

{'accuracy': 0.6167247386759582}


In [40]:
torch.save(model, 'emotion_classifier_with_data_cleaning_isear')