# Task 4
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the Jupyter notebook version of the template. For the Python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import Trainer, TrainingArguments, AdamW 
from sklearn.metrics import mean_squared_error as mse
from dataset import ReviewDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA device when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# Fix the NumPy and PyTorch RNG state up front so that every later draw from
# those generators starts from the same state after a kernel restart.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

BATCH_SIZE = 32  # samples per optimisation step; lower it if memory runs out
NUM_EPOCHS = 5  # number of full passes over the training data

# Raw competition data, read from the notebook's working directory.
train_val = pd.read_csv("train.csv")
test_val = pd.read_csv("test_no_score.csv")

False


Depending on your approach, you might need to adapt the structure of this template or parts not marked by TODOs.
It is not necessary to completely follow this template. Feel free to add more code and delete any parts that are not required.

In [3]:
# Disabled reference copy of a ReviewDataset implementation, kept as a bare string
# literal so that none of it executes; the only effect of this cell is that the
# notebook echoes the string back as the cell output. The ReviewDataset actually
# used further down is the one imported from `dataset` in the imports cell.
# NOTE(review): presumably identical to the class in dataset.py -- confirm, then
# consider deleting this cell.
"""class ReviewDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, idx):
        title = str(self.data.loc[idx, 'title'])
        sentence = str(self.data.loc[idx, 'sentence'])
        score = self.data.loc[idx].get('score',0.0)

        inputs = self.tokenizer(title, sentence, return_tensors="pt", padding = "max_length", truncation = True, max_length = self.max_len)
        

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
     
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'score': torch.tensor(score, dtype=torch.float)
        }"""

'class ReviewDataset(Dataset):\n    def __init__(self, data, tokenizer, max_len):\n        self.data = data\n        self.tokenizer = tokenizer\n        self.max_len = max_len\n\n    def __len__(self):\n        return len(self.data)\n    \n    \n    def __getitem__(self, idx):\n        title = str(self.data.loc[idx, \'title\'])\n        sentence = str(self.data.loc[idx, \'sentence\'])\n        score = self.data.loc[idx].get(\'score\',0.0)\n\n        inputs = self.tokenizer(title, sentence, return_tensors="pt", padding = "max_length", truncation = True, max_length = self.max_len)\n        \n\n        input_ids = inputs[\'input_ids\'].squeeze()\n        attention_mask = inputs[\'attention_mask\'].squeeze()\n     \n        return {\n            \'input_ids\': input_ids,\n            \'attention_mask\': attention_mask,\n            \'score\': torch.tensor(score, dtype=torch.float)\n        }'

In [4]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_len = 512  # maximum token length handed to ReviewDataset; lower it to save time/memory

train_dataset = ReviewDataset(train_val, tokenizer, max_len)
test_dataset = ReviewDataset(test_val, tokenizer, max_len)

# Settings shared by both loaders. Pinned host memory only helps host-to-GPU
# copies, so request it only when DEVICE is a CUDA device; asking for it on a
# CPU-only machine just produces a warning.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=16,
    pin_memory=(DEVICE.type == "cuda"),
)

# Training batches are reshuffled every epoch; test order is kept fixed so the
# predictions line up with the rows of the test CSV.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)
# Additional code if needed



In [5]:
class MyModule(nn.Module):
    """Scalar regression head stacked on a pretrained encoder.

    The encoder output at sequence position 0 goes through dropout and a
    single linear layer that produces one value per sample.
    """

    def __init__(self, premodel):
        """Store the encoder and build the dropout + linear head.

        Args:
            premodel: encoder module that exposes ``config.hidden_size`` and,
                when called with ``input_ids`` / ``attention_mask``, returns an
                object carrying a ``last_hidden_state`` tensor.
        """
        super().__init__()
        self.premodel = premodel
        hidden_dim = self.premodel.config.hidden_size
        # Attribute names are part of the state_dict keys -- keep them stable.
        self.fc1 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_ids, attention_mask):
        """Predict one score per row of the batch.

        Args:
            input_ids: token-id tensor forwarded to the encoder.
            attention_mask: mask tensor forwarded to the encoder.

        Returns:
            Tensor with the size-1 output dimension of the linear layer
            squeezed away.
        """
        encoded = self.premodel(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 along the second axis selects the first position of every sequence.
        first_token = encoded.last_hidden_state[:, 0]
        head_out = self.fc1(self.dropout(first_token))
        return head_out.squeeze(1)

In [6]:
def _train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        loader: iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'score'``.
        optimizer: optimiser stepped once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()  # switching mode once per epoch is enough
    running_loss = 0.0

    for batch in tqdm(loader, total=len(loader)):
        ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        score = batch['score'].to(DEVICE)

        optimizer.zero_grad()

        # Forward pass
        train_scores = model(input_ids=ids, attention_mask=attention_mask)
        loss = criterion(train_scores, score)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def _predict(model, loader):
    """Collect the model outputs for every batch of `loader`, in loader order.

    Args:
        model: module called positionally as ``model(input_ids, attention_mask)``.
        loader: iterable of dict batches with the keys ``'input_ids'`` and
            ``'attention_mask'``.

    Returns:
        Flat list with one NumPy scalar per predicted sample.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            outputs = model(ids, attention_mask)
            predictions.extend(outputs.cpu().numpy())

    return predictions


# Always true inside a notebook kernel; the guard is kept so the cell body can
# be copied unchanged into a standalone script.
if __name__ == '__main__':
    premodel = DistilBertModel.from_pretrained('distilbert-base-uncased')

    model = MyModule(premodel)
    model.to(DEVICE)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are set to the transformers defaults instead of torch's
    # (1e-8 / 1e-2) to stay close to the previous optimiser configuration.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)
    criterion = nn.MSELoss()
    # The scheduler is stepped once per epoch, so the learning rate is halved
    # after every 5 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(NUM_EPOCHS):
        print('Epoch :', epoch+1, '/',NUM_EPOCHS)

        epoch_loss = _train_one_epoch(model, train_loader, optimizer, criterion)
        scheduler.step()
        print(f"Epoch {epoch+1}, Training Loss: {epoch_loss}")

        # Checkpoint the fine-tuned weights after every epoch so an interrupted
        # run does not lose the epochs that already finished.
        torch.save(model.state_dict(), "Das_Model_Rasim.pth")

    predictions = _predict(model, test_loader)

    # One prediction per line, in test-set order.
    with open("result_task4_1.txt", "w") as f:
        for val in predictions:
            f.write(f"{val}\n")
                



Epoch : 1 / 5


100%|██████████| 391/391 [1:27:19<00:00, 13.40s/it]


Epoch 1, Training Loss: 6.260075278446802
Epoch : 2 / 5


100%|██████████| 391/391 [1:25:49<00:00, 13.17s/it]


Epoch 2, Training Loss: 1.8470126925526982
Epoch : 3 / 5


100%|██████████| 391/391 [1:26:18<00:00, 13.24s/it]


Epoch 3, Training Loss: 1.34878795546339
Epoch : 4 / 5


100%|██████████| 391/391 [1:24:54<00:00, 13.03s/it]


Epoch 4, Training Loss: 1.0578099549426447
Epoch : 5 / 5


100%|██████████| 391/391 [1:25:03<00:00, 13.05s/it]


Epoch 5, Training Loss: 0.8563787655147446


100%|██████████| 32/32 [02:56<00:00,  5.52s/it]
