In [1]:
!pip install transformers >> /dev/null

In [2]:
!pip install datasets >> /dev/null
!pip3 install torch >> /dev/null 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import sys
import string
import json
import spacy
sp = spacy.load('en_core_web_sm')

In [4]:
import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertPreTrainedModel, BertConfig
import torch
from torch import nn
from sklearn.metrics import accuracy_score,classification_report

In [5]:
from transformers import AdamW
from transformers import get_scheduler
from datasets import load_metric
from tqdm.notebook import tqdm

In [6]:
source = './../datasets/Irony_detection_SemEval/semeval_taskA_corrected.csv'

In [7]:
def read_data(data):
    df = pd.read_csv(data, error_bad_lines=False)
    df.rename(columns={'Tweet text': 'Text'}, inplace=True)
    df.rename(columns={'Tweet index': 'Tweet_index'}, inplace= True)
    return df

In [8]:
data = read_data(source)

In [9]:
data.head()

Unnamed: 0,Tweet_index,Label,Text
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [10]:
data.shape

(3834, 3)

In [11]:
train = data[:2100]
val = data[2100:2800]
test = data[2900:]

In [12]:
len(train), len(val), len(test)

(2100, 700, 934)

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
def create_input_id(data):
    max_len = 0
    for text in data['Text']:
        input_ids = tokenizer.encode(text, add_special_tokens=True)
        max_len = max(max_len, len(input_ids))
    return max_len

In [15]:
maximum_length_generated_train = create_input_id(train)
maximum_length_generated_val = create_input_id(val)
maximum_length_generated_test = create_input_id(test)

In [16]:
maximum_length_generated_train, maximum_length_generated_val, maximum_length_generated_test

(73, 71, 75)

In [17]:
def preprocessing_bert(data, max_len):
    text = data['Text'].values
    irony = data['Label'].values
    encode = tokenizer.batch_encode_plus(text, add_special_tokens=True, 
                                       padding='max_length', max_length= max_len,
                                       pad_to_max_length=True, return_token_type_ids=True,
                                       return_attention_mask=True, return_tensors='pt')
    updated_input_id = encode['input_ids']
    attention_masks = encode['attention_mask']
    token_type_ids = encode['token_type_ids']

    return updated_input_id, attention_masks, token_type_ids, irony

In [18]:
updated_input_id_train, attention_masks_train, token_type_ids_train, irony_train = preprocessing_bert(train, 
                                             maximum_length_generated_train)

updated_input_id_val, attention_masks_val, token_type_ids_val, irony_val = preprocessing_bert(val, 
                                             maximum_length_generated_val)

updated_input_id_test, attention_masks_test, token_type_ids_test, irony_test = preprocessing_bert(test, 
                                             maximum_length_generated_test)

In [19]:
len(irony_train), len(irony_val), len(irony_test)

(2100, 700, 934)

In [21]:
class DataExploration:
    def __init__(self, updated_input_ids, attention_masks, token_type_ids, irony, is_test=False):
        self.tokens = updated_input_ids
        self.masks = attention_masks
        self.token_type_ids = token_type_ids
        self.labels = irony

    def __len__(self):
        return self.tokens.shape[0]

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokens[idx],
            "attention_mask": self.masks[idx],
            "token_type_ids": self.token_type_ids[idx],
            "labels": torch.tensor(self.labels[idx]).unsqueeze(0)
        } 

In [22]:
data_train = DataExploration(updated_input_id_train, attention_masks_train, token_type_ids_train, irony_train)
data_loader_train = torch.utils.data.DataLoader(data_train, batch_size=16, shuffle=True)

data_val = DataExploration(updated_input_id_val, attention_masks_val, token_type_ids_val, irony_val)
data_loader_val = torch.utils.data.DataLoader(data_val, batch_size=16, shuffle=True)

data_test = DataExploration(updated_input_id_test, attention_masks_test, token_type_ids_test, irony_test)
data_loader_test = torch.utils.data.DataLoader(data_test, batch_size=16, shuffle=True)

In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model.classifier
num_epochs= 3

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [24]:
def get_params(data_loader):
    num_training_steps = num_epochs * len(data_loader)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps)

    return num_training_steps, optimizer, lr_scheduler

In [25]:
num_training_steps_train, optimizer_train, lr_scheduler_train = get_params(data_loader_train)
num_training_steps_val, optimizer_val, lr_scheduler_val = get_params(data_loader_val)
num_training_steps_test, optimizer_test, lr_scheduler_test = get_params(data_loader_test)

In [26]:
def train(num_training_steps, data_loader, optimizer, lr_scheduler):
    progress_bar = tqdm(range(num_training_steps))
    metric = load_metric("accuracy")
    for epoch in range(num_epochs):
        progress_bar.set_description("Processing %s" % epoch)
        model.train()
        for bid, batch in enumerate(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)
            if bid%100==0:
                model.eval()
                for batch in data_loader_val:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    with torch.no_grad():
                        outputs = model(**batch)
                    logits = outputs.logits
                    predictions = torch.argmax(logits, dim=-1)
                    metric.add_batch(predictions=predictions, references=batch["labels"].view(-1))
                print(metric.compute())
    progress_bar.close()

In [27]:
train(num_training_steps_train, data_loader_train, optimizer_train, lr_scheduler_train)

  0%|          | 0/396 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

{'accuracy': 0.5085714285714286}
{'accuracy': 0.6457142857142857}
{'accuracy': 0.6857142857142857}
{'accuracy': 0.7228571428571429}
{'accuracy': 0.7357142857142858}
{'accuracy': 0.7228571428571429}


In [28]:
def predict(num_training_steps, data_loader, optimizer, lr_scheduler):
    metric = load_metric('accuracy')
    model.eval()
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs  = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"].view(-1))
    print(metric.compute())

In [29]:
predict(num_training_steps_val, data_loader_val, optimizer_val, lr_scheduler_val)

{'accuracy': 0.7257142857142858}


In [30]:
predict(num_training_steps_test, data_loader_test, optimizer_test, lr_scheduler_test)

{'accuracy': 0.7184154175588865}


Pred on new data

In [31]:
tests = pd.read_csv("train_text.txt", delimiter="\t", header=None, names=["text"])
tests.head()

Unnamed: 0,text
0,“Worry is a down payment on a problem you may ...
1,My roommate: it's okay that we can't spell bec...
2,No but that's so cute. Atsu was probably shy a...
3,Rooneys fucking untouchable isn't he? Been fuc...
4,it's pretty depressing when u hit pan on ur fa...


In [32]:
def predict_on_emotion(text):
    tokens = tokenizer.encode_plus(text, padding = 'max_length', max_length=75, 
                                    return_attention_mask=True, return_token_type_ids= True)
    tokens = {k: torch.Tensor(v).view(1,-1).to(torch.long).to(device) for k, v in tokens.items()}
    model.eval()
    with torch.no_grad():
        outputs  = model(**tokens)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions.cpu().item()

In [33]:
for file in ["./../datasets/Emotion_classification_TweetEval/test_text.txt", "./../datasets/Emotion_classification_TweetEval/train_text.txt", "./../datasets/Emotion_classification_TweetEval/val_text.txt"]:
    file_name = file.split("Emotion_classification_TweetEval/")[1]
    file_name = file_name.split('.')[0]
    tests = pd.read_csv(file, delimiter="\t", header=None, names=["text"])
    train_results = []
    for i in tqdm(tests["text"], total=len(tests), leave=False):
        train_results.append(predict_on_emotion(i))
    tests["predictions"]=train_results
    tests.to_csv(f"{file_name}_preds.csv", index=None)

  0%|          | 0/1421 [00:00<?, ?it/s]

  0%|          | 0/3257 [00:00<?, ?it/s]

  0%|          | 0/374 [00:00<?, ?it/s]

In [34]:
tests["predictions"]=train_results

In [37]:
tests.shape

(374, 2)