In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the emotion dataset; the parquet path is resolved relative to the
# notebook's working directory.
dataset = pd.read_parquet(path="emotion.parquet")

In [3]:
# Shuffle all rows once; drawing len(dataset) rows without replacement is a
# full permutation, and the fixed seed keeps the order reproducible.
dataset = dataset.sample(n=len(dataset), random_state=42)

In [4]:
# sentiment_mapping = {
#     'positive': 1,
#     'neutral': 0,
#     'negative': 2
# }

# dataset['airline_sentiment'] = dataset['airline_sentiment'].replace(sentiment_mapping)
# dataset.head()

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [5]:
# Display the (rows, columns) shape of the full dataset as a quick sanity check.
dataset.shape

(16000, 2)

In [6]:
# Report the (rows, columns) shape of each split.
for caption, frame in (("Train set shape:", train_data), ("Test set shape:", test_data)):
    print(caption, frame.shape)

Train set shape: (12800, 2)
Test set shape: (3200, 2)


In [7]:
# class AirlineReviewsDataset(Dataset):
#     def __init__(self, data, tokenizer):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_length = 256

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):
#         text = self.data.iloc[index]['text']
#         labels = self.data.iloc[index][['airline_sentiment']].values.astype(int)
#         encoding = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
#         input_ids = encoding['input_ids'][0]
#         attention_mask = encoding['attention_mask'][0]
#         # resize the tensors to the same size
#         input_ids = nn.functional.pad(input_ids, (0, self.max_length - input_ids.shape[0]), value=0)
#         attention_mask = nn.functional.pad(attention_mask, (0, self.max_length - attention_mask.shape[0]), value=0)
#         return input_ids, attention_mask, torch.tensor(labels)

In [9]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text/label row per item.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'text'`` column (the input string) and a ``'label'``
        column (a value convertible to ``int``). Rows are addressed by
        position, so the frame's index does not need to be contiguous.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt', padding='max_length',
        truncation=True, max_length=...)``. It must return a mapping with
        ``'input_ids'`` and ``'attention_mask'`` entries whose first element is
        the encoded sequence.
    max_length : int, optional
        Length every sequence is padded or truncated to. Defaults to 256.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at ``index``.

        ``input_ids`` and ``attention_mask`` are the first (only) sequence of
        the tokenizer output. ``label`` is a one-element ``torch.long`` tensor,
        so default batching yields a ``(batch, 1)`` tensor as before.
        """
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.data.iloc[index]
        encoding = self.tokenizer(
            row['text'],
            return_tensors='pt',
            padding='max_length',  # pad here so every item has the same length
            truncation=True,
            max_length=self.max_length,
        )
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        # Build the label as int64 explicitly; relying on numpy's default
        # ``int`` gives a platform-dependent width.
        label = torch.tensor([int(row['label'])], dtype=torch.long)
        return input_ids, attention_mask, label

In [10]:
# Pretrained checkpoint shared by the tokenizer and the model.
model_checkpoint = 'bert-base-uncased'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

# Wrap each split so items are tokenized when they are indexed.
train_dataset = EmotionDataset(data=train_data, tokenizer=tokenizer)
test_dataset = EmotionDataset(data=test_data, tokenizer=tokenizer)

In [11]:
batch_size = 32

# Training batches are reshuffled on each pass; test batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by a four-layer MLP classification head.

    Parameters
    ----------
    num_labels : int
        Number of output classes (width of the final linear layer).
    checkpoint : str, optional
        Name or path passed to ``BertModel.from_pretrained``. When omitted, the
        module-level ``model_checkpoint`` is used, which is what the original
        one-argument constructor always did.
    """

    def __init__(self, num_labels, checkpoint=None):
        super().__init__()
        # Fall back to the notebook-level checkpoint so existing
        # ``BertClassifier(num_labels)`` calls behave exactly as before.
        if checkpoint is None:
            checkpoint = model_checkpoint
        self.bert = BertModel.from_pretrained(checkpoint)
        # Head: hidden_size -> 300 -> 100 -> 50 -> num_labels, ReLU in between.
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 300),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for a batch.

        The encoder's ``last_hidden_state`` is sliced at sequence position 0
        (the ``[CLS]`` slot for BERT tokenizers) and fed through the head, so
        the result has one row per batch item and ``num_labels`` columns.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = outputs['last_hidden_state'][:, 0, :]
        return self.classifier(first_token_state)

In [13]:
# Seed torch's global RNG before building the model so the randomly
# initialised classifier head and the DataLoader shuffle order start from the
# same state on every Restart & Run All (same seed value as the pandas and
# scikit-learn steps earlier in the notebook).
torch.manual_seed(42)

num_labels = 6  # width of the classifier's output layer
model = BertClassifier(num_labels).to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class ids
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 3
n_total_steps = len(train_loader)  # batches per epoch, used in progress logs

In [14]:
# Fine-tune the model for num_epochs passes over train_loader.
for epoch in range(num_epochs):
    # Switch to training mode at the start of every epoch. If an earlier cell
    # called model.eval(), re-running this cell would otherwise train with
    # mode-dependent layers (such as dropout) still in inference behaviour.
    model.train()

    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Flatten the labels to a 1-D int64 vector of class indices, the
        # target format the loss is called with below.
        labels = labels.view(-1).long()
        labels = labels.to(device)

        # Standard optimisation step: reset grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Log the loss of the current batch every 100 batches.
        if (i + 1) % 100 == 0:
            print(f'epoch {epoch + 1}/ {num_epochs}, batch {i+1}/{n_total_steps}, loss = {loss.item():.4f}')

epoch 1/ 3, batch 100/400, loss = 1.3289
epoch 1/ 3, batch 200/400, loss = 0.9946
epoch 1/ 3, batch 300/400, loss = 0.4544
epoch 1/ 3, batch 400/400, loss = 0.3412
epoch 2/ 3, batch 100/400, loss = 0.1623


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000015B907D8FD0>>
Traceback (most recent call last):
  File "D:\pd\ML_part\venv\lib\site-packages\ipykernel\ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 