In [None]:
!pip install hazm
import functools
import sys
import hazm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the dataset CSV under /content/drive is readable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fix the PyTorch RNG seed so weight initialisation, dropout and sampler
# shuffling are repeatable across runs.
seed = 0

torch.manual_seed(seed)

<torch._C.Generator at 0x7fb4b1a49790>

In [None]:
# Class names; the position of a name in this list is its integer class id.
labels = ['negative', 'positive']

# Load the cleaned Digikala comments and derive an integer id from the textual label.
data = pd.read_csv('/content/drive/MyDrive/digikala/cleaned_digikala_dataset.csv', encoding='utf-8')
data['label_id'] = data['label'].apply(lambda t: labels.index(t))

# tokenization helper
def tokenize(comment):
    """Split a raw comment string into word tokens with hazm's word tokenizer."""
    tokens = hazm.word_tokenize(comment)
    return tokens

# Build the vocabulary from the tokenized comments.
# Tokens seen fewer than `min_freq` times are left out and resolve to '<unk>'.
min_freq = 5
special_tokens = ['<unk>', '<pad>']

data['tokens'] = data['comment'].apply(tokenize)
vocab = torchtext.vocab.build_vocab_from_iterator(
    data['tokens'],
    min_freq=min_freq,
    specials=special_tokens,
)

unk_index, pad_index = vocab['<unk>'], vocab['<pad>']
# Out-of-vocabulary lookups fall back to the '<unk>' id instead of raising.
vocab.set_default_index(unk_index)

# making input ids
max_seq_len = 256


def numeralize(tokens, max_len=max_seq_len):
    """Convert a token list into a fixed-size array of vocabulary ids.

    Args:
        tokens: list of string tokens for one comment.
        max_len: output length; longer inputs are truncated, shorter ones
            are right-padded.

    Returns:
        1-D ``np.int64`` array of length ``max_len``.

    Notes:
        Padding uses ``pad_index`` (the '<pad>' id). The previous version
        padded with 0, which is the '<unk>' id, so padded positions were
        indistinguishable from unknown words and the embedding's
        ``padding_idx`` never applied to them.
    """
    ids = [vocab[token] for token in tokens[:max_len]]
    # Explicit integer dtype: np.pad on an empty list would otherwise yield floats.
    ids = np.asarray(ids, dtype=np.int64)
    return np.pad(ids, (0, max_len - len(ids)), 'constant', constant_values=pad_index)


data['ids'] = data['tokens'].apply(numeralize)
data['label'] = data['label_id'].apply(int)
# True (unpadded) sequence length, so pack_padded_sequence can skip the padding.
# Measuring the padded ids would always give max_seq_len. Clamped to >= 1
# because pack_padded_sequence rejects zero-length sequences.
data['length'] = data['tokens'].apply(lambda t: max(1, min(len(t), max_seq_len)))

data = data[['ids', 'label', 'length']]

# One dict of tensors per example; this list is what the DataLoaders index into.
new_data = [
    {'ids': torch.tensor(ids), 'label': torch.tensor(label), 'length': torch.tensor(length)}
    for ids, label, length in zip(data['ids'], data['label'], data['length'])
]

In [None]:
class LSTM(nn.Module):
    """(Bi)LSTM sequence classifier: embedding -> LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output classes (logits).
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, run a forward and a backward LSTM and
            concatenate their final hidden states.
        dropout_rate: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_index: vocabulary id of the padding token (its embedding receives
            no gradient).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # nn.LSTM only applies dropout *between* stacked layers; with a single
        # layer a non-zero value has no effect and just emits a warning.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=dropout_rate if n_layers > 1 else 0.0, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Compute class logits for a batch of padded id sequences.

        Args:
            ids: LongTensor ``[batch size, seq len]`` of token ids.
            length: per-example number of valid (non-padding) tokens,
                ``[batch size]``; a tensor or a list of ints.

        Returns:
            Tensor ``[batch size, output dim]`` of unnormalised logits.
        """
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]

        # pack_padded_sequence requires the lengths to live on the CPU.
        if torch.is_tensor(length):
            length = length.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                            enforce_sorted=False)

        # Only the final hidden state is used for classification; the
        # per-timestep outputs were previously unpacked and then discarded.
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]

        if self.lstm.bidirectional:
            # Last layer's two directions. The concatenation order is kept
            # as-is so that previously saved `fc` weights remain compatible.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]

        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

In [None]:
# Model / training hyperparameters.
vocab_size = len(vocab)  # one embedding row per vocabulary entry (specials included)
embedding_dim = 300  # NOTE(review): presumably chosen to match the pretrained fastText vectors — confirm
hidden_dim = 300  # LSTM hidden size per direction
output_dim = 2  # one logit per class in `labels`
n_layers = 2  # number of stacked LSTM layers
batch_size = 16  # mini-batch size

# Set to False to get a plain (unidirectional) LSTM instead of a BiLSTM.
bidirectional = True
dropout_rate = 0.5

In [None]:
# Build one model instance so its trainable parameters can be counted.
model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate, 
             pad_index)
def count_parameters(model):
    """Return the total number of scalar parameters in `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,909,602 trainable parameters


In [None]:
def initialize_weights(m):
    """Initialise one sub-module in place; intended for use with ``model.apply``.

    * ``nn.Linear``: Xavier-normal weight, zero bias.
    * ``nn.LSTM``: orthogonal weight matrices, zero biases.
    * any other module type is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
        return

    if not isinstance(m, nn.LSTM):
        return

    for param_name, tensor in m.named_parameters():
        # 'bias' is checked first, mirroring the precedence of the name tests.
        if 'bias' in param_name:
            nn.init.zeros_(tensor)
        elif 'weight' in param_name:
            nn.init.orthogonal_(tensor)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Cross-entropy over the class logits (default 'mean' reduction), placed on the training device.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
def collate(batch, pad_index):
    """Merge a list of example dicts into one batch dict of stacked tensors.

    Args:
        batch: list of dicts, each with tensor entries 'ids', 'length' and 'label'.
        pad_index: value used to right-pad shorter 'ids' sequences.

    Returns:
        dict with keys 'ids' (``[batch, max seq len]``), 'length' and
        'label' (each stacked along a new leading batch dimension).
    """
    padded_ids = nn.utils.rnn.pad_sequence(
        [example['ids'] for example in batch],
        padding_value=pad_index,
        batch_first=True,
    )
    lengths = torch.stack([example['length'] for example in batch])
    targets = torch.stack([example['label'] for example in batch])
    return {'ids': padded_ids, 'length': lengths, 'label': targets}

In [None]:
# Bind pad_index so the DataLoader can call collate(batch) with a single argument.
collate = functools.partial(collate, pad_index=pad_index)

In [None]:
def train_op(dataloader, model, criterion, optimizer, device):
    """Train `model` for one epoch over `dataloader`.

    Args:
        dataloader: iterable of batch dicts with keys 'ids', 'length', 'label'.
        model: module called as ``model(ids, length)`` returning class logits.
        criterion: loss called as ``criterion(logits, label)``; assumed to
            return a per-batch *mean* (the default for nn.CrossEntropyLoss).
        optimizer: optimizer over the model's parameters.
        device: device the ids and labels are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all samples seen in the epoch.
        Batches are weighted by their size, so a smaller final batch no
        longer counts as much as a full one.
    """
    model.train()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0

    for batch in tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        # Lengths are deliberately left on the CPU (needed by pack_padded_sequence).
        length = batch['length']
        label = batch['label'].to(device)

        preds = model(ids, length)
        loss = criterion(preds, label)
        accuracy = get_accuracy(preds, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        n_batch = label.size(0)
        loss_sum += loss.item() * n_batch
        acc_sum += accuracy.item() * n_batch
        n_samples += n_batch

    return loss_sum / n_samples, acc_sum / n_samples

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Evaluate `model` on `dataloader` without updating any weights.

    Args:
        dataloader: iterable of batch dicts with keys 'ids', 'length', 'label'.
        model: module called as ``model(ids, length)`` returning class logits.
        criterion: loss called as ``criterion(logits, label)``; assumed to
            return a per-batch *mean* (the default for nn.CrossEntropyLoss).
        device: device the ids and labels are moved to.

    Returns:
        ``(mean_loss, mean_accuracy, weighted_f1)``. Loss and accuracy are
        averaged over samples (batches weighted by size); the F1 score is
        sklearn's ``f1_score(..., average="weighted")`` over all predictions.
    """
    model.eval()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0
    batch_predictions = []
    batch_targets = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            # Lengths are deliberately left on the CPU (needed by pack_padded_sequence).
            length = batch['length']
            label = batch['label'].to(device)

            preds = model(ids, length)
            loss = criterion(preds, label)
            accuracy = get_accuracy(preds, label)

            # Keep one CPU tensor per batch instead of one 0-d device tensor
            # per sample (which is what list.extend(tensor) produces).
            batch_predictions.append(preds.argmax(dim=-1).cpu())
            batch_targets.append(label.cpu())

            n_batch = label.size(0)
            loss_sum += loss.item() * n_batch
            acc_sum += accuracy.item() * n_batch
            n_samples += n_batch

    predictions = torch.cat(batch_predictions).numpy()
    targets = torch.cat(batch_targets).numpy()
    f_score = f1_score(targets, predictions, average="weighted")
    return loss_sum / n_samples, acc_sum / n_samples, f_score

In [None]:
def get_accuracy(prediction, label):
    """Fraction of rows in `prediction` whose arg-max class equals `label`.

    Args:
        prediction: 2-D tensor of class scores, one row per example.
        label: 1-D tensor of target class ids.

    Returns:
        0-d tensor holding the batch accuracy.
    """
    n_examples, _n_classes = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_examples

# Accuracy variant intended for multi-class data.
def get_multi_accuracy(predictions, label):
    """Fraction of examples whose highest-scoring class (along dim 1) equals `label`.

    Args:
        predictions: tensor of class scores with classes along dimension 1.
        label: tensor of target class ids, one per example.

    Returns:
        0-d float tensor holding the accuracy.
    """
    best_class = predictions.argmax(dim=1, keepdim=True)
    n_correct = (best_class == label.view_as(best_class)).sum().float()
    return n_correct / label.shape[0]

In [None]:
# Load the pretrained Persian ('fa') fastText vectors (downloaded to .vector_cache on first use)
# and look up one vector per vocabulary entry, in vocabulary-index order.
# NOTE(review): tokens missing from fastText (incl. '<unk>'/'<pad>') presumably get torchtext's
# default unknown-vector initialisation — confirm.
vectors = torchtext.vocab.FastText(language='fa')
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

.vector_cache/wiki.fa.vec: 1.11GB [01:36, 11.5MB/s]                            
  0%|          | 0/420084 [00:00<?, ?it/s]Skipping token b'420084' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 420084/420084 [00:36<00:00, 11545.21it/s]


In [None]:
# K-fold cross-validation: train a fresh model on each fold and track metrics.
k=5
splits=KFold(n_splits=k,shuffle=True,random_state=42)
foldperf={}

n_epochs = 10
lr = 5e-4
# Lowest validation loss seen so far (across all folds); drives checkpointing.
best_test_loss = float('inf')

# Flat per-epoch metrics across *all* folds (k * n_epochs entries per list).
history = {'train_losses': [], 'test_losses': [],'train_accs':[],'test_accs':[],'test_f1s':[]}

for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(data)))):
    print(f'fold: {fold+1}/{k}')

    train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_sampler = torch.utils.data.SubsetRandomSampler(val_idx)
    train_dataloader = torch.utils.data.DataLoader(new_data, batch_size=batch_size,
                                                   collate_fn=collate, sampler=train_sampler)
    test_dataloader = torch.utils.data.DataLoader(new_data, batch_size=batch_size,
                                                  collate_fn=collate, sampler=test_sampler)

    # Fresh model for every fold so that nothing learned on one fold leaks into the next.
    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate,
                 pad_index)
    model.apply(initialize_weights)

    # Copy (do not alias) the pretrained vectors. Assigning the tensor itself
    # to `.data` shares storage, so on CPU training would modify
    # `pretrained_embedding` in place and later folds would start from
    # already fine-tuned embeddings.
    model.embedding.weight.data.copy_(pretrained_embedding)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    model = model.to(device)

    # Metrics for this fold only; a new dict per fold so that the entries of
    # `foldperf` do not all point at the same shared object.
    fold_history = {'train_losses': [], 'test_losses': [],'train_accs':[],'test_accs':[],'test_f1s':[]}

    for epoch in range(n_epochs):
        train_loss, train_acc = train_op(train_dataloader, model, criterion, optimizer, device)
        test_loss, test_acc , test_f1 = evaluate(test_dataloader, model, criterion, device)

        epoch_metrics = {'train_losses': train_loss, 'test_losses': test_loss,
                         'train_accs': train_acc, 'test_accs': test_acc, 'test_f1s': test_f1}
        for key, value in epoch_metrics.items():
            history[key].append(value)
            fold_history[key].append(value)

        # Checkpoint whenever the validation loss improves, instead of saving
        # only the final (typically over-fitted) epoch of the last fold.
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            torch.save(model.state_dict(), 'bilstm.pt')

        print(f'epoch: {epoch+1}')
        print(f'train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}')
        print(f'valid_loss: {test_loss:.3f}, valid_acc: {test_acc:.3f}')

    foldperf['fold{}'.format(fold+1)] = fold_history

training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.338, train_acc: 0.850
valid_loss: 0.227, valid_acc: 0.910


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.201, train_acc: 0.920
valid_loss: 0.184, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.164, train_acc: 0.938
valid_loss: 0.170, valid_acc: 0.932


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.140, train_acc: 0.948
valid_loss: 0.172, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.121, train_acc: 0.955
valid_loss: 0.172, valid_acc: 0.933


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.106, train_acc: 0.962
valid_loss: 0.183, valid_acc: 0.932


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.092, train_acc: 0.967
valid_loss: 0.196, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.081, train_acc: 0.971
valid_loss: 0.198, valid_acc: 0.930


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.069, train_acc: 0.976
valid_loss: 0.260, valid_acc: 0.925


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.059, train_acc: 0.979
valid_loss: 0.273, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.333, train_acc: 0.854
valid_loss: 0.246, valid_acc: 0.906


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.198, train_acc: 0.922
valid_loss: 0.186, valid_acc: 0.926


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.160, train_acc: 0.939
valid_loss: 0.178, valid_acc: 0.930


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.135, train_acc: 0.948
valid_loss: 0.177, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.118, train_acc: 0.957
valid_loss: 0.204, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.100, train_acc: 0.965
valid_loss: 0.198, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.088, train_acc: 0.969
valid_loss: 0.229, valid_acc: 0.923


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.083, train_acc: 0.971
valid_loss: 0.226, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.064, train_acc: 0.979
valid_loss: 0.240, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.054, train_acc: 0.981
valid_loss: 0.271, valid_acc: 0.924


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.322, train_acc: 0.861
valid_loss: 0.220, valid_acc: 0.913


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.202, train_acc: 0.922
valid_loss: 0.202, valid_acc: 0.924


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.161, train_acc: 0.939
valid_loss: 0.230, valid_acc: 0.916


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.137, train_acc: 0.949
valid_loss: 0.179, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.115, train_acc: 0.958
valid_loss: 0.179, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.097, train_acc: 0.965
valid_loss: 0.207, valid_acc: 0.930


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.085, train_acc: 0.970
valid_loss: 0.238, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.073, train_acc: 0.975
valid_loss: 0.237, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.061, train_acc: 0.979
valid_loss: 0.303, valid_acc: 0.925


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.053, train_acc: 0.981
valid_loss: 0.273, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.332, train_acc: 0.856
valid_loss: 0.237, valid_acc: 0.903


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.211, train_acc: 0.917
valid_loss: 0.196, valid_acc: 0.922


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.167, train_acc: 0.936
valid_loss: 0.179, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.142, train_acc: 0.947
valid_loss: 0.174, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.121, train_acc: 0.955
valid_loss: 0.195, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.105, train_acc: 0.961
valid_loss: 0.190, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.091, train_acc: 0.967
valid_loss: 0.226, valid_acc: 0.926


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.076, train_acc: 0.973
valid_loss: 0.225, valid_acc: 0.923


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.065, train_acc: 0.977
valid_loss: 0.219, valid_acc: 0.924


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.055, train_acc: 0.981
valid_loss: 0.341, valid_acc: 0.919


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.328, train_acc: 0.858
valid_loss: 0.242, valid_acc: 0.908


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.202, train_acc: 0.920
valid_loss: 0.181, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.162, train_acc: 0.938
valid_loss: 0.171, valid_acc: 0.933


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.138, train_acc: 0.950
valid_loss: 0.177, valid_acc: 0.934


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.117, train_acc: 0.957
valid_loss: 0.194, valid_acc: 0.932


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.101, train_acc: 0.963
valid_loss: 0.199, valid_acc: 0.931


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.089, train_acc: 0.969
valid_loss: 0.192, valid_acc: 0.930


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.075, train_acc: 0.975
valid_loss: 0.252, valid_acc: 0.927


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.067, train_acc: 0.978
valid_loss: 0.267, valid_acc: 0.928


training...:   0%|          | 0/2465 [00:00<?, ?it/s]

evaluating...:   0%|          | 0/617 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.059, train_acc: 0.980
valid_loss: 0.335, valid_acc: 0.925


In [None]:
def average(numList):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    total = sum(numList)
    count = len(numList)
    return total / count

# Validation metrics averaged over every recorded epoch of every fold
# (`history` is appended to once per epoch inside the cross-validation loop),
# i.e. early, under-trained epochs are included in these means.
print('accuracy average: ',average(history['test_accs']))
print('loss average: ',average(history['test_losses']))
print('f1 average: ',average(history['test_f1s']))

accuracy average:  0.9257354132901132
loss average:  0.2170059351643317
f1 average:  0.9255275252752752
