### Homework #4: Classifying Product Questions as Pre- or Post-Purchase

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Load the questions CSV (ISO-8859-1 encoded) and keep only its first 2000 rows.
raw_frame = pd.read_csv("Pre_Post_Questions.csv", encoding="ISO-8859-1")
data = raw_frame.head(2000)

In [3]:
# Discard the 'id' and 'item_name' columns.
data = data.drop(['id', 'item_name'], axis='columns')

In [4]:
# Keep only the rows labelled 'Pre' or 'Post'. The explicit copy makes the
# result an independent frame, so assigning to its columns afterwards does not
# raise pandas' SettingWithCopyWarning.
data = data[data['label'].isin(['Pre', 'Post'])].copy()

In [5]:
# Integer class id for each string label.
labels_dict = dict(Pre=0, Post=1)

In [6]:
# Replace each string label with its integer id from labels_dict
# (an unknown label raises KeyError).
data['label'] = list(map(labels_dict.__getitem__, data['label'].values))

In [7]:
# Hold out 20% of the rows; the remaining 80% is the training set.
# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible across runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
# Split the held-out rows in half: one half for validation, the other for the
# final test set. A fixed random_state keeps the split reproducible across runs.
valid, test = train_test_split(test, test_size=0.5, random_state=42)

In [9]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict holding the text (converted with ``str``), the
    flattened ``input_ids`` and ``attention_mask`` produced by the tokenizer,
    and the matching target as a ``torch.long`` tensor.
    """

    def __init__(self, texts, targets, tokenizer, max_len=512):
        """Store the texts, their targets, the tokenizer and the pad/truncate length."""
        self.texts, self.targets = texts, targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its target."""
        sample_text = str(self.texts[idx])
        label = self.targets[idx]

        # Pad or truncate every sample to exactly ``max_len`` tokens and ask
        # the tokenizer for PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        item = {'text': sample_text}
        for field in ('input_ids', 'attention_mask'):
            item[field] = encoded[field].flatten()
        item['targets'] = torch.tensor(label, dtype=torch.long)
        return item

In [10]:
class BertClassifier:
    """Fine-tuning wrapper around a pretrained BERT sequence classifier.

    Loads the model and tokenizer from ``path``, replaces the classification
    head with a new linear layer of ``n_classes`` outputs, and provides
    helpers to train the model and to predict the class of a single text.
    """

    def __init__(self, path, n_classes=2):
        """Load model and tokenizer from ``path`` and attach a new head.

        Args:
            path: Checkpoint location passed to ``from_pretrained``.
            n_classes: Number of output classes of the new classifier head.
        """
        self.path = path
        self.model = BertForSequenceClassification.from_pretrained(path)
        self.tokenizer = BertTokenizer.from_pretrained(path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.max_len = 512
        # Take the encoder width from the model config rather than from a
        # specific encoder layer, so checkpoints with a single layer work too.
        self.out_features = self.model.config.hidden_size
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        # Keep the label-count metadata in sync with the replaced head.
        self.model.num_labels = n_classes
        self.model.config.num_labels = n_classes
        self.model.to(self.device)

    def preparation(self, X_train, y_train, epochs):
        """Build the training dataset, loader, optimizer, scheduler and loss.

        Args:
            X_train: Training texts.
            y_train: Integer targets aligned with ``X_train``.
            epochs: Number of epochs; used to size the LR schedule.
        """
        # create datasets (same max length as used by ``predict``)
        self.train_set = CustomDataset(
            X_train, y_train, self.tokenizer, max_len=self.max_len
        )
        # create data loaders
        self.train_loader = DataLoader(self.train_set, batch_size=2, shuffle=True)
        # helpers initialization
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=2e-5,
            weight_decay=0.005,
            correct_bias=True
            )
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=500,
                num_training_steps=len(self.train_loader) * epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def fit(self):
        """Run one pass over ``self.train_loader`` and update the model.

        Returns:
            Tuple ``(train_acc, train_loss)``: the fraction of correctly
            predicted training samples (a double tensor) and the mean batch
            loss.
        """
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in tqdm(self.train_loader):
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)

            losses.append(loss.item())

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss

    def _report_metrics(self, title, texts, labels):
        """Predict every text and print macro precision/recall/F1 under ``title``."""
        predictions = [self.predict(x) for x in texts]
        precision, recall, f1score = precision_recall_fscore_support(
            labels, predictions, average='macro'
        )[:3]
        print(title)
        print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

    def train(self, X_train, y_train, X_valid, y_valid, X_test, y_test, epochs=1):
        """Fine-tune for ``epochs`` epochs, printing metrics after each one.

        After every epoch the training loss/accuracy and the macro-averaged
        precision, recall and F1 on both the validation and the test data are
        printed.

        Args:
            X_train, y_train: Training texts and integer targets.
            X_valid, y_valid: Validation texts and integer targets.
            X_test, y_test: Test texts and integer targets.
            epochs: Number of passes over the training data.
        """
        print('*' * 10)
        print(f'Model: {self.path}')
        self.preparation(X_train, y_train, epochs)
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')
            self._report_metrics('Valid:', X_valid, y_valid)
            self._report_metrics('Test:', X_test, y_test)
        print('*' * 10)

    def predict(self, text):
        """Return the predicted class index for a single text.

        The model is switched to eval mode and the forward pass runs without
        gradient tracking.

        Args:
            text: Input to classify; converted with ``str`` before
                tokenization, as ``CustomDataset`` does for training samples.

        Returns:
            The index of the highest-scoring class.
        """
        self.model = self.model.eval()
        encoding = self.tokenizer.encode_plus(
            str(text),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Shape the single sample as a batch of one.
        input_ids = encoding['input_ids'].reshape(1, -1).to(self.device)
        attention_mask = encoding['attention_mask'].reshape(1, -1).to(self.device)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

In [11]:
# Two-class classifier built from the local './rubert' checkpoint.
# Alternative checkpoint left commented out in the notebook: 'cointegrated/rubert-tiny2'.
classifier = BertClassifier(path='./rubert', n_classes=2)

Some weights of the model checkpoint at ./rubert were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from 

In [12]:
# Fine-tune for 5 epochs on the 'question' texts and integer 'label' targets
# of the train split, reporting metrics on the valid and test splits.
classifier.train(
    X_train=train['question'].tolist(),
    y_train=train['label'].tolist(),
    X_valid=valid['question'].tolist(),
    y_valid=valid['label'].tolist(),
    X_test=test['question'].tolist(),
    y_test=test['label'].tolist(),
    epochs=5,
)

  0%|          | 0/798 [00:00<?, ?it/s]

**********
Model: ./rubert
Epoch 1/5


100%|██████████| 798/798 [32:01<00:00,  2.41s/it]   


Train loss 0.650359069881844 accuracy 0.5989974937343359
Valid:
precision: 0.6478632478632479, recall: 0.6369971491922711, f1score: 0.6396986570586308


  0%|          | 0/798 [00:00<?, ?it/s]

Test:
precision: 0.6824175824175824, recall: 0.6703612479474549, f1score: 0.6726850385386971
Epoch 2/5


100%|██████████| 798/798 [07:38<00:00,  1.74it/s]


Train loss 0.6975862497071686 accuracy 0.718671679197995
Valid:
precision: 0.6608150470219436, recall: 0.6354133671206842, f1score: 0.6381727725011307


  0%|          | 0/798 [00:00<?, ?it/s]

Test:
precision: 0.7017094017094017, recall: 0.6816502463054187, f1score: 0.6844835423407043
Epoch 3/5


100%|██████████| 798/798 [07:32<00:00,  1.77it/s]


Train loss 0.6499579365640636 accuracy 0.7619047619047619
Valid:
precision: 0.6659786304031083, recall: 0.6443353394572907, f1score: 0.6477987421383649


  0%|          | 0/798 [00:00<?, ?it/s]

Test:
precision: 0.7017094017094017, recall: 0.6816502463054187, f1score: 0.6844835423407043
Epoch 4/5


100%|██████████| 798/798 [07:30<00:00,  1.77it/s]


Train loss 0.6413676600878764 accuracy 0.7907268170426065
Valid:
precision: 0.696969696969697, recall: 0.6695174743955232, f1score: 0.6745595331612614


  0%|          | 0/798 [00:00<?, ?it/s]

Test:
precision: 0.7066938037087291, recall: 0.6876026272577997, f1score: 0.6906666666666668
Epoch 5/5


100%|██████████| 798/798 [07:34<00:00,  1.76it/s]


Train loss 0.6137621184539068 accuracy 0.8070175438596491
Valid:
precision: 0.696969696969697, recall: 0.6695174743955232, f1score: 0.6745595331612614
Test:
precision: 0.7066938037087291, recall: 0.6876026272577997, f1score: 0.6906666666666668
**********
