In [1]:
!pip install gensim
!pip install LughaatNLP



In [34]:
import pandas as pd
import re
from LughaatNLP import LughaatNLP
from gensim.models import KeyedVectors
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive under /content/drive, requesting a forced remount.
# NOTE(review): `drive` is not imported in any visible cell — presumably
# `from google.colab import drive` lived in a cell that was removed; confirm and
# restore it, otherwise this cell raises NameError on a fresh kernel.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Initial Preprocessing with Regex & LughaatNLP

In [None]:
# Read the raw review splits from the mounted Drive.
# The validation frame (`val`) is read from the *test* CSV.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/imdb_urdu_reviews_train.csv',
        '/content/drive/MyDrive/imdb_urdu_reviews_test.csv',
    )
)

In [None]:
def clean(text):
  """Replace each run of Latin letters, ASCII digits and non-word characters with one space.

  Args:
    text: The string to clean.

  Returns:
    A copy of ``text`` in which every maximal run of characters matching
    ``[A-Za-z0-9\\W]`` has been collapsed into a single space. Other word
    characters (including ``_``) are left untouched.
  """
  noise_run = re.compile(r'[A-Za-z0-9\W]+')
  return noise_run.sub(' ', text)

In [None]:
preproc = LughaatNLP()


def _preprocess_reviews(reviews, nlp):
  """Clean, stopword-filter and lemmatize every review.

  Args:
    reviews: Iterable of raw review strings (e.g. a DataFrame column).
    nlp: Object exposing ``remove_stopwords`` and ``lemmatize_sentence``.

  Returns:
    A list with one preprocessed sentence per input review, in input order.
  """
  return [
      nlp.lemmatize_sentence(nlp.remove_stopwords(clean(review)))
      for review in reviews
  ]


# Iterate over the column values themselves rather than `column[i]`:
# `column[i]` is a label lookup and only works while the index is 0..n-1.
train_sentences = _preprocess_reviews(train['review'], preproc)
val_sentences = _preprocess_reviews(val['review'], preproc)

NameError: name 'LughaatNLP' is not defined

In [None]:
# Encode the sentiment strings as integer class ids.
sentiment_to_id = {'positive': 1, 'negative': 0}

train_labels = list(train['sentiment'].map(sentiment_to_id))
val_labels = list(val['sentiment'].map(sentiment_to_id))

In [None]:
# Pair each preprocessed sentence with its label and cache both splits as CSV
# in the working directory so the slow preprocessing does not have to be redone.
df_train = pd.DataFrame({'text': train_sentences, 'label': train_labels})
df_train.to_csv('train.csv')

df_val = pd.DataFrame({'text': val_sentences, 'label': val_labels})
df_val.to_csv('val.csv')

## Word Embeddings & Secondary Preprocessing

In [57]:
# Reload the cached, already-preprocessed splits from the Kaggle dataset.
train = pd.read_csv('/kaggle/input/assignment3-dl-misc/train.csv')
val = pd.read_csv('/kaggle/input/assignment3-dl-misc/val.csv')

# Pull the two columns we need out into plain Python lists.
train_sentences = list(train['text'])
train_labels = list(train['label'])
val_sentences = list(val['text'])
val_labels = list(val['label'])

# The frames are no longer needed once the lists exist.
del train, val

In [58]:
# Pretrained Urdu word2vec vectors stored in the binary word2vec format.
# Note: the name `model` is rebound to the classifier later in the notebook.
model = KeyedVectors.load_word2vec_format(
    "/kaggle/input/urdu-word2vec-200dim/urdu_220m_wv_200d.bin",
    binary=True,
)
vocab = model.index_to_key

# All-zero vector used to pad short sentences; its length (200) is presumably
# the embedding dimensionality — confirm against the loaded vectors.
pad_vector = np.zeros(shape=200)

def convert_to_vectors(sentences, model, max_length, pad_vector):
    """Turn whitespace-tokenised sentences into fixed-length lists of word vectors.

    Args:
        sentences: Iterable of sentences. Entries that are not strings (for
            example the NaN pandas produces for an empty CSV cell) are treated
            as sentences with no tokens instead of raising ``AttributeError``.
        model: Mapping from word to vector that raises ``KeyError`` for
            unknown words (e.g. gensim ``KeyedVectors``).
        max_length: Number of vectors every output sentence must contain.
        pad_vector: Vector appended to sentences shorter than ``max_length``.

    Returns:
        A list with one entry per sentence; each entry is a list of exactly
        ``max_length`` vectors (truncated or padded with ``pad_vector``).
        Out-of-vocabulary words are skipped.
    """
    vectors = []
    for sentence in sentences:
        # An empty review round-trips through CSV as NaN (a float), which has
        # no .split(); fall back to an empty token list so it becomes all padding.
        tokens = sentence.split() if isinstance(sentence, str) else []

        sentence_vectors = []
        for word in tokens:
            try:
                sentence_vectors.append(model[word])
            except KeyError:
                # Out-of-vocabulary word: deliberately dropped.
                continue

        if len(sentence_vectors) > max_length:
            sentence_vectors = sentence_vectors[:max_length]
        else:
            sentence_vectors += [pad_vector] * (max_length - len(sentence_vectors))
        vectors.append(sentence_vectors)
    return vectors

# Every review is truncated or padded to exactly this many word vectors.
max_sentence_length = 100

train_vectors = convert_to_vectors(
    sentences=train_sentences,
    model=model,
    max_length=max_sentence_length,
    pad_vector=pad_vector,
)
val_vectors = convert_to_vectors(
    sentences=val_sentences,
    model=model,
    max_length=max_sentence_length,
    pad_vector=pad_vector,
)

# The raw sentences are not needed once they have been vectorised.
del train_sentences, val_sentences

In [59]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## LSTM

In [64]:
import torch.nn.functional as F
class SentimentAnalysisModel(nn.Module):
    """LSTM encoder followed by a small fully connected classifier.

    The final hidden state of the LSTM is passed through two ReLU-activated
    linear layers (``hidden_dim -> 256 -> 4096``) and a linear output layer.

    Args:
        input_dim: Size of each input vector in the sequence.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=4096)
        self.classifier = nn.Linear(in_features=4096, out_features=output_dim)

    def forward(self, x):
        """Return unnormalised logits for a batch of sequences.

        Args:
            x: Batch-first input tensor fed straight to the LSTM.

        Returns:
            Tensor of logits with ``output_dim`` values per batch element.
        """
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (final_hidden, _) = self.lstm(x)
        features = F.relu(self.fc1(final_hidden[-1]))
        features = F.relu(self.fc2(features))
        return self.classifier(features)

In [65]:
# Hyperparameters for the LSTM classifier.
input_dim, hidden_dim, output_dim = 200, 128, 2

# Build the network and move its parameters to the selected device.
model = SentimentAnalysisModel(input_dim, hidden_dim, output_dim)
model.to(device)

# Cross-entropy over the output logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [62]:
# Stack the nested lists of word vectors into one contiguous float32 array
# before handing them to torch: calling torch.tensor() directly on a list of
# ndarrays is the slow element-by-element path. float32 matches the dtype the
# training loop casts to anyway, so no precision is lost downstream.
# Labels are made explicitly int64, as CrossEntropyLoss expects class indices.
train_data = TensorDataset(
    torch.from_numpy(np.asarray(train_vectors, dtype=np.float32)),
    torch.tensor(train_labels, dtype=torch.long),
)
val_data = TensorDataset(
    torch.from_numpy(np.asarray(val_vectors, dtype=np.float32)),
    torch.tensor(val_labels, dtype=torch.long),
)

# The Python-side copies are large; release them once the tensors exist.
del train_vectors
del val_vectors
del train_labels
del val_labels

# Shuffle only the training data; validation order is kept fixed.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

In [66]:
# Train for a fixed number of epochs, reporting the mean training loss and the
# validation accuracy after each one.
num_epochs = 100
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    for features, targets in train_loader:
        features = features.to(device, dtype=torch.float)
        targets = targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # ---- validation pass (no gradients needed) ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for features, targets in val_loader:
            logits = model(features.to(device, dtype=torch.float))
            # Predicted class = index of the largest logit in each row.
            preds = torch.max(logits, 1).indices
            all_preds.extend(preds.tolist())
            all_labels.extend(targets.tolist())

    val_accuracy = accuracy_score(all_labels, all_preds)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}')

Epoch 1/100, Loss: 0.6932, Validation Accuracy: 0.5155
Epoch 2/100, Loss: 0.6924, Validation Accuracy: 0.5219
Epoch 3/100, Loss: 0.6915, Validation Accuracy: 0.5247
Epoch 4/100, Loss: 0.6903, Validation Accuracy: 0.5246
Epoch 5/100, Loss: 0.6891, Validation Accuracy: 0.5283
Epoch 6/100, Loss: 0.6876, Validation Accuracy: 0.5269
Epoch 7/100, Loss: 0.6859, Validation Accuracy: 0.5254
Epoch 8/100, Loss: 0.6842, Validation Accuracy: 0.5313
Epoch 9/100, Loss: 0.6821, Validation Accuracy: 0.5344
Epoch 10/100, Loss: 0.6800, Validation Accuracy: 0.5359
Epoch 11/100, Loss: 0.6775, Validation Accuracy: 0.5378
Epoch 12/100, Loss: 0.6743, Validation Accuracy: 0.5344
Epoch 13/100, Loss: 0.6706, Validation Accuracy: 0.5386
Epoch 14/100, Loss: 0.6660, Validation Accuracy: 0.5415
Epoch 15/100, Loss: 0.6604, Validation Accuracy: 0.5544
Epoch 16/100, Loss: 0.6533, Validation Accuracy: 0.5472
Epoch 17/100, Loss: 0.6437, Validation Accuracy: 0.5594
Epoch 18/100, Loss: 0.6133, Validation Accuracy: 0.5904
E

In [67]:
# Persist the trained LSTM classifier to lstm.pth in the working directory.
# NOTE(review): this saves the whole module object rather than its state_dict.
# The class name SentimentAnalysisModel is redefined in the BI-LSTM section
# below, so loading lstm.pth after that cell has run may resolve to the wrong
# class — consider saving model.state_dict() instead; confirm with consumers.
torch.save(model, 'lstm.pth')

## BI-LSTM

In [113]:
import torch.nn.functional as F

class SentimentAnalysisModel(nn.Module):
    """Bidirectional LSTM encoder with a dropout-regularised linear classifier.

    The final hidden states of the two directions are concatenated, projected
    to 256 features with a ReLU, passed through dropout (p=0.3) and mapped to
    the output logits.

    Args:
        input_dim: Size of each input vector in the sequence.
        hidden_dim: Size of the LSTM hidden state per direction.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions are concatenated, hence 2 * hidden_dim input features.
        self.fc1 = nn.Linear(2 * hidden_dim, 256)
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(256, output_dim)

    def forward(self, x):
        """Return unnormalised logits for a batch of sequences.

        Args:
            x: Batch-first input tensor fed straight to the LSTM.

        Returns:
            Tensor of logits with ``output_dim`` values per batch element.
        """
        _, (h_n, _) = self.lstm(x)
        # The last two entries of h_n are the final states of the two directions.
        both_directions = torch.cat((h_n[-2], h_n[-1]), dim=1)
        activated = F.relu(self.fc1(both_directions))
        return self.classifier(self.dropout(activated))


In [114]:
# Hyperparameters for the bidirectional LSTM classifier.
input_dim, hidden_dim, output_dim = 200, 64, 2

# Build the network and move its parameters to the selected device.
model = SentimentAnalysisModel(input_dim, hidden_dim, output_dim)
model.to(device)

# Cross-entropy over the output logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.001)

In [116]:
# Train the BiLSTM for a fixed number of epochs, reporting the mean training
# loss and the validation accuracy after each one.
num_epochs = 100
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    for features, targets in train_loader:
        features = features.to(device, dtype=torch.float)
        targets = targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # ---- validation pass (no gradients needed) ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for features, targets in val_loader:
            logits = model(features.to(device, dtype=torch.float))
            # Predicted class = index of the largest logit in each row.
            preds = torch.max(logits, 1).indices
            all_preds.extend(preds.tolist())
            all_labels.extend(targets.tolist())

    val_accuracy = accuracy_score(all_labels, all_preds)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}')

Epoch 1/100, Loss: 0.4245, Validation Accuracy: 0.7925
Epoch 2/100, Loss: 0.4226, Validation Accuracy: 0.7983
Epoch 3/100, Loss: 0.4205, Validation Accuracy: 0.7978
Epoch 4/100, Loss: 0.4189, Validation Accuracy: 0.7985
Epoch 5/100, Loss: 0.4161, Validation Accuracy: 0.7994
Epoch 6/100, Loss: 0.4150, Validation Accuracy: 0.8014
Epoch 7/100, Loss: 0.4131, Validation Accuracy: 0.8009
Epoch 8/100, Loss: 0.4102, Validation Accuracy: 0.7992
Epoch 9/100, Loss: 0.4083, Validation Accuracy: 0.8032
Epoch 10/100, Loss: 0.4067, Validation Accuracy: 0.7997
Epoch 11/100, Loss: 0.4040, Validation Accuracy: 0.8057
Epoch 12/100, Loss: 0.4022, Validation Accuracy: 0.8033
Epoch 13/100, Loss: 0.4013, Validation Accuracy: 0.8046
Epoch 14/100, Loss: 0.3982, Validation Accuracy: 0.8039
Epoch 15/100, Loss: 0.3969, Validation Accuracy: 0.8050
Epoch 16/100, Loss: 0.3949, Validation Accuracy: 0.8042
Epoch 17/100, Loss: 0.3931, Validation Accuracy: 0.8031
Epoch 18/100, Loss: 0.3917, Validation Accuracy: 0.8069
E

In [117]:
# Persist the trained BiLSTM classifier to bilstm.pth in the working directory.
# NOTE(review): this saves the whole module object rather than its state_dict,
# so loading it presumably requires the same SentimentAnalysisModel class
# definition to be available — consider saving model.state_dict(); confirm.
torch.save(model, 'bilstm.pth')