<a href="https://colab.research.google.com/github/saeedzou/DeepLearning1401-01/blob/main/Assignment%204/q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import gc
import torch
from torch import nn
import torch.nn.functional as F
from torchsummary import summary
!pip install -q transformers
from transformers import BertTokenizer
from transformers import BertModel
!pip install -q hazm
import hazm
import os
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Download the poem corpus; a later cell reads the per-poet files from Persian_poems_corpus/normalized.
!git clone https://github.com/aminrobatian/Persian_poems_corpus.git

Cloning into 'Persian_poems_corpus'...
remote: Enumerating objects: 159, done.[K
remote: Total 159 (delta 0), reused 0 (delta 0), pack-reused 159[K
Receiving objects: 100% (159/159), 45.21 MiB | 18.00 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [102]:
# Corpus file of each poet. The position in this list is the integer class label
# attached to that poet's rows when the data frame is assembled.
poets = [
    f'{poet_name}_norm.txt'
    for poet_name in (
        'parvin',
        'shahriar',
        'attar',
        'farrokhi',
        'saadi',
        'bahar',
        'jami',
        'sanaee',
        'moulavi',
        'naserkhosro',
    )
]

In [106]:
# Build one frame per poet in which every row is two consecutive corpus lines
# joined by a literal ' [SEP] ', then stack the frames and shuffle the rows.
SEED = 42  # fixed so the shuffle, and therefore the train/val/test split, is reproducible
data = []
for poet in poets:
  lines = pd.read_csv(os.path.join('Persian_poems_corpus/normalized', poet), header=None, names=['text'])['text'].tolist()
  # An unpaired trailing line cannot form a pair, so drop it.
  if len(lines) % 2 == 1:
    lines = lines[:-1]
  # Pair even-indexed lines with the following odd-indexed ones in a single pass
  # over plain Python lists (avoids a positional DataFrame lookup per row).
  df = pd.DataFrame({'text': [first + ' [SEP] ' + second for first, second in zip(lines[0::2], lines[1::2])]})
  data.append(df)
# The 'index' column holds the poet's position in `poets`, i.e. the class label.
result = pd.concat([df.assign(index=i) for i, df in enumerate(data)], axis=0, ignore_index=True)
result = result.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [107]:
# Cut the shuffled frame into 80% train / 10% validation / 10% test by row position.
train_end, val_end = (int(len(result) * fraction) for fraction in (0.8, 0.9))
trainset = result.iloc[:train_end].values
valset = result.iloc[train_end:val_end].values
testset = result.iloc[val_end:].values
# Report how many rows landed in each split.
for split_name, split_rows in (('train: ', trainset), ('val: ', valset), ('test: ', testset)):
    print(split_name, len(split_rows))

train:  197957
val:  24745
test:  24745


In [108]:
# Longest training text, measured in whitespace-separated words.
# NOTE(review): this value is later passed to the tokenizer as `max_length`; a tokenizer that
# adds special tokens or splits words into sub-words may truncate the longest texts - confirm.
word_counts = [len(poem[0].split()) for poem in trainset]
max_len = max(word_counts, default=0)
print('max length of poems: ', max_len)
# Show the first text that reaches that length.
longest_position = np.argmax(word_counts)
print('poem with max length: ', trainset[longest_position][0])

max length of poems:  28
poem with max length:  از ما اگر یکی می ماند  شیطان هزار می زاید و اضافه می شود [SEP] در کدام رویا می توانم ببینم  که یک از چنک هزار نجات یابد


In [109]:
# define a dataset class
class PoemDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenised texts with integer labels.

    ``data`` is indexed as a 2-D array: column 0 holds the texts and column 1
    the labels. Every text is tokenised once, at construction time, padded or
    truncated to ``max_len`` and returned as PyTorch tensors.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Tokenise eagerly so __getitem__ is a plain list lookup.
        encode_options = dict(padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
        self.texts = []
        for raw_text in self.data[:, 0]:
            self.texts.append(tokenizer(raw_text, **encode_options))
        self.labels = list(self.data[:, 1])

    def __len__(self):
        """Number of rows in the underlying array."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(tokenizer_output, label)`` with the label as a long tensor."""
        encoding = self.texts[index]
        label = torch.tensor(self.labels[index]).long()
        return encoding, label

In [110]:
# The tokenizer must come from the same checkpoint as the encoder loaded later in the
# notebook ('HooshvareLab/bert-fa-base-uncased'); the original used the separate
# 'bert-base-parsbert-uncased' checkpoint, whose vocabulary is not guaranteed to line up
# with that encoder's embedding table.
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased')
# NOTE(review): max_len was measured in whitespace-separated words but is used here as the
# token budget, so the longest texts may be truncated - confirm this is intended.
train_dataset = PoemDataset(trainset, tokenizer, max_len)
val_dataset = PoemDataset(valset, tokenizer, max_len)
test_dataset = PoemDataset(testset, tokenizer, max_len)

In [115]:
# Wrap each split in a DataLoader; all three use batches of 32 and reshuffle every pass.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(split_dataset, batch_size=32, shuffle=True)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

In [113]:
# Load the pretrained encoder and freeze every one of its parameters, so that only
# layers added on top of it receive gradient updates.
ParsBERT = BertModel.from_pretrained('HooshvareLab/bert-fa-base-uncased')
ParsBERT.requires_grad_(False)
# define a model class
class PoemClassifier(nn.Module):
    """Encoder followed by a single linear layer producing one logit per class.

    Args:
        ParsBERT: encoder module called with ``input_ids`` and ``attention_mask``;
            its output must expose ``pooler_output``.
        num_classes: number of output logits.
    """

    def __init__(self, ParsBERT, num_classes):
        super(PoemClassifier, self).__init__()
        self.ParsBERT = ParsBERT
        # Size the head from the encoder's own config when it advertises one, so a
        # different-width encoder works unchanged; 768 stays as the fallback.
        hidden_size = getattr(getattr(ParsBERT, 'config', None), 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the encoder's pooled output."""
        encoded = self.ParsBERT(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [116]:
def _prepare_batch(inputs, labels, device):
    """Return ``(input_ids, attention_mask, labels)`` on ``device``, both inputs 2-D.

    Batches collated from per-sample encodings carry an extra singleton dimension,
    i.e. (batch, 1, seq_len); it is removed from *both* tensors so the model always
    sees (batch, seq_len). Already 2-D tensors pass through unchanged. Works for any
    mapping, so plain-dict batches are accepted as well.
    """
    prepared = []
    for key in ('input_ids', 'attention_mask'):
        tensor = inputs[key]
        if tensor.dim() == 3:
            tensor = tensor.squeeze(1)
        prepared.append(tensor.to(device))
    return prepared[0], prepared[1], labels.to(device)


# define a function for training
def train(model, train_loader, val_loader, optimizer, criterion, epochs, device):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` returning logits.
        train_loader: yields ``(inputs, labels)`` batches; ``inputs`` is a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        val_loader: same batch format, passed to ``evaluate`` after every epoch.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        epochs: number of passes over ``train_loader``.
        device: device the model and batches are moved to.

    Returns:
        ``(train_loss, train_acc, val_loss, val_acc)`` of the last epoch. The loss is the
        mean of the per-batch losses, the accuracy is correct predictions divided by the
        dataset size. All four are NaN when ``epochs`` is 0.
    """
    model = model.to(device)
    # Defined up front so that epochs == 0 returns instead of raising UnboundLocalError.
    train_loss = train_acc = val_loss = val_acc = float('nan')
    for epoch in range(epochs):
        model.train()  # evaluate() leaves the model in eval mode, so re-enable every epoch
        running_loss = 0
        correct = 0
        for inputs, labels in train_loader:
            input_ids, attention_mask, labels = _prepare_batch(inputs, labels, device)
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
        train_loss = running_loss / len(train_loader)
        train_acc = correct / len(train_loader.dataset)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        print(f'Epoch: {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    return train_loss, train_acc, val_loss, val_acc


# define a function for evaluating
def evaluate(model, val_loader, criterion, device):
    """Compute loss and accuracy of ``model`` over ``val_loader`` without gradients.

    The model is switched to eval mode and left in it.

    Returns:
        ``(loss, accuracy)``: mean of the per-batch losses, and correct predictions
        divided by ``len(val_loader.dataset)``.
    """
    model = model.to(device)
    model.eval()
    val_loss = 0
    val_acc = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            input_ids, attention_mask, labels = _prepare_batch(inputs, labels, device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            val_loss += criterion(outputs, labels).item()
            val_acc += (outputs.argmax(1) == labels).sum().item()
    val_loss /= len(val_loader)
    val_acc /= len(val_loader.dataset)
    return val_loss, val_acc



In [None]:
# Fit the classifier on top of the encoder: Adam at lr 5e-5, cross-entropy loss, 5 epochs.
model = PoemClassifier(ParsBERT, num_classes=len(poets))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)
train_loss, train_acc, val_loss, val_acc = train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=5,
    device=device,
)

In [None]:
# Persist the trained weights to disk.
trained_weights = model.state_dict()
torch.save(trained_weights, 'poem_classifier.pt')
# Score the model on the held-out test loader and report loss and accuracy.
test_metrics = evaluate(model, test_loader, criterion, device)
test_loss, test_acc = test_metrics
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

In [None]:
from sklearn.metrics import classification_report

In [None]:
# define a function for predicting
def predict(model, poem, tokenizer, max_len, device):
    """Return the index of the highest-scoring class for a single text.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` returning logits.
        poem: the text to classify.
        tokenizer: callable producing an encoding that supports ``.to(device)`` and exposes
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: length the text is padded/truncated to.
        device: device the model and the encoding are moved to.

    Returns:
        The arg-max class index as a Python int.
    """
    model = model.to(device)
    model.eval()
    encoded = tokenizer(poem, padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    encoded = encoded.to(device)
    # A single encoded text is presumably already (1, seq_len), so nothing is squeezed:
    # squeezing dim 1 would drop the sequence axis when max_len == 1.
    with torch.no_grad():  # inference only: do not build an autograd graph
        logits = model(input_ids=encoded['input_ids'], attention_mask=encoded['attention_mask'])
    return logits.argmax(1).item()

# report the results, classification report and confusion matrix
y_true = []
y_pred = []
model.eval()  # make sure dropout is disabled regardless of what ran before this cell
with torch.no_grad():  # inference only: do not build an autograd graph
    for inputs, labels in test_loader:
        # Drop the singleton dimension from both tensors so the model sees (batch, seq_len).
        input_ids = inputs['input_ids'].squeeze(1).to(device)
        attention_mask = inputs['attention_mask'].squeeze(1).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        y_true.extend(labels.tolist())
        y_pred.extend(outputs.argmax(1).tolist())
print(classification_report(y_true, y_pred, target_names=poets))
print(confusion_matrix(y_true, y_pred))
print(f1_score(y_true, y_pred, average='macro'))

# Fine-tuning with SGD and Adam

In [None]:
# define another model where ParsBERT is not frozen
ParsBERT = BertModel.from_pretrained('HooshvareLab/bert-fa-base-uncased')
model = PoemClassifier(ParsBERT, len(poets))
# this cell fine-tunes with plain SGD; the Adam run is in the following cell
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Perplexity is measured before and after training on a class-balanced subset of the
# test split: up to 100 rows per label (1000 rows when every label has at least 100).
samples_per_label = 100
samples = []
for label in np.unique(testset[:, 1]):
    # Row positions of the test rows carrying the current label
    indices = np.where(testset[:, 1] == label)[0]
    # Never ask for more rows than exist, otherwise sampling without replacement raises
    n_take = min(samples_per_label, len(indices))
    samples.append(testset[np.random.choice(indices, n_take, replace=False)])
# Tokenise the balanced subset exactly like the other splits
samples_dataset = PoemDataset(np.concatenate(samples), tokenizer, max_len)
samples_loader = torch.utils.data.DataLoader(samples_dataset, batch_size=32, shuffle=True)

# perplexity = exp(mean cross-entropy) of the untrained classifier
test_loss, test_acc = evaluate(model, samples_loader, criterion, device)
perplexity = np.exp(test_loss)
print(f'Perplexity before training: {perplexity:.4f}')
# train the model with the SGD optimizer
train_loss, train_acc, val_loss, val_acc = train(model, train_loader, val_loader, optimizer, criterion, 10, device)
# save the model
torch.save(model.state_dict(), 'poem_classifier_sgd.pt')
# same balanced subset after training
test_loss, test_acc = evaluate(model, samples_loader, criterion, device)
perplexity = np.exp(test_loss)
print(f'Perplexity after training: {perplexity:.4f}')
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
# report the results, classification report and confusion matrix (full test split)
y_true = []
y_pred = []
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph
    for inputs, labels in test_loader:
        # Drop the singleton dimension from both tensors so the model sees (batch, seq_len).
        input_ids = inputs['input_ids'].squeeze(1).to(device)
        attention_mask = inputs['attention_mask'].squeeze(1).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        y_true.extend(labels.tolist())
        y_pred.extend(outputs.argmax(1).tolist())
print(classification_report(y_true, y_pred, target_names=poets))
print(confusion_matrix(y_true, y_pred))

In [None]:
# define another model where ParsBERT is not frozen
ParsBERT = BertModel.from_pretrained('HooshvareLab/bert-fa-base-uncased')
model = PoemClassifier(ParsBERT, len(poets))
# this cell repeats the fine-tuning experiment with the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
# Perplexity is measured on `samples_loader`, the balanced test subset built in the SGD cell,
# so both optimizers are compared on the same rows.
test_loss, test_acc = evaluate(model, samples_loader, criterion, device)
# perplexity = exp(mean cross-entropy) of the untrained classifier
perplexity = np.exp(test_loss)
print(f'Perplexity before training: {perplexity:.4f}')
# train the model with the Adam optimizer
train_loss, train_acc, val_loss, val_acc = train(model, train_loader, val_loader, optimizer, criterion, 10, device)
# save the model
torch.save(model.state_dict(), 'poem_classifier_adam.pt')
# same balanced subset after training
test_loss, test_acc = evaluate(model, samples_loader, criterion, device)
perplexity = np.exp(test_loss)
print(f'Perplexity after training: {perplexity:.4f}')
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
# report the results, classification report and confusion matrix (full test split)
y_true = []
y_pred = []
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph
    for inputs, labels in test_loader:
        # Drop the singleton dimension from both tensors so the model sees (batch, seq_len).
        input_ids = inputs['input_ids'].squeeze(1).to(device)
        attention_mask = inputs['attention_mask'].squeeze(1).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        y_true.extend(labels.tolist())
        y_pred.extend(outputs.argmax(1).tolist())
print(classification_report(y_true, y_pred, target_names=poets))
print(confusion_matrix(y_true, y_pred))