# Nepali Sentiment analysis with transformer architecture


In [1]:
%pip install --quiet gensim nltk optuna seaborn torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
import nltk
import re
from nltk.corpus import stopwords
import math
import pickle
from collections import Counter

Check if CUDA is available


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [4]:
import os
os.chdir("../../..")

In [5]:
df = pd.read_csv('dataset/processed/combined_sentiment_dataset.csv')
df.sample(5)

Unnamed: 0,Sentences,Sentiment
8933,कोभिड का कारण नौ महिनापछि नेपाली राष्ट्रिय क्र...,1
34053,इन्डियामा कोभिड केस बढ्नु हामिलाई पनि चिन्ताको...,2
22013,कोभिड नियन्त्रणमा सरकार गम्भीर नभएको आरोप,2
27865,जापान कोभिड दैनिक अपडेट मे संक्रमितहरुको संख्य...,2
35258,कोभिड प्रभावित देशको भ्रमण नगर्न आफ्ना नागरिकल...,1


In [6]:
df['Sentiment'].value_counts()

Sentiment
1    15857
2    14393
0    10458
Name: count, dtype: int64

### Preprocessing


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('nepali'))
not_stop_words = {'छैन', 'थिएन', 'यद्यपि', 'यसबाहेक'}
stop_words -= not_stop_words

[nltk_data] Downloading package stopwords to C:\Users\Suyash
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def preprocess(text: str):
    # Regular expression to keep only Nepali characters (Devanagari script)
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)

    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Basic letter normalization
    letters_to_normalize = {
        "ी": "ि", "ू": "ु", "श": "स", "ष": "स", "व": "ब", "ङ": "न", "ञ": "न", "ण": "न", "ृ": "र",
        "ँ": "", "ं": "", "ः": "", "ं": ""
    }
    for l1, l2 in letters_to_normalize.items():
        text = text.replace(l1, l2)

    return text

In [9]:
from tqdm import tqdm
tqdm.pandas()
df['Sentences'] = df['Sentences'].progress_apply(preprocess)

100%|█████████████████████████████████████████████████████████████████████████| 40708/40708 [00:00<00:00, 66849.90it/s]


In [10]:
df.sample(5)

Unnamed: 0,Sentences,Sentiment
10478,कोभिड हाम्रो कहानि,0
24651,कोरोना भाइरस गुल्मिका पुरुसको कोभिड कारन बुटबल...,2
34373,बिज्नको सुझाबअनुसार कोभिड रोकथाम नियन्त्रन मन्...,1
14385,कोरोना भाइरस कोभिड सक्रमन रोकथाम नियन्त्रनका म...,2
11558,अभिभाबक त्रसित जनाउदै काकडभिट्टा माध्यमिक बिद्...,1


## Tokanization


In [11]:
sentences = [item for item in df['Sentences']]
sentiments = [item for item in df['Sentiment']]

tokenized_sentences = [sentence.split() for sentence in sentences]

In [12]:
word2vec_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=64, window=5, min_count=1, workers=4
)

In [13]:
# # Convert each token to its word2vec embedding
# max_length = 32
# embedded_reviews = []

# for tokens in tokenized_sentences:
#     review_vector = []
#     for token in tokens:
#         if token in word2vec_model.wv:
#             review_vector.append(word2vec_model.wv[token])
#         else:
#             # If token not found, use a zero vector
#             review_vector.append(np.zeros(word2vec_model.vector_size))
#     # Pad the review to max_length with zero vectors
#     if len(review_vector) > max_length:
#         review_vector = review_vector[:max_length]
#     else:
#         review_vector.extend([np.zeros(word2vec_model.vector_size)] * (max_length - len(review_vector)))


#     embedded_reviews.append(review_vector)

max_length = 32
embedded_reviews = []

for tokens in tokenized_sentences:
    review_vector = [word2vec_model.wv[token] if token in word2vec_model.wv else np.zeros(
        word2vec_model.vector_size) for token in tokens]
    review_vector = review_vector[:max_length] if len(review_vector) > max_length else review_vector + [
        np.zeros(word2vec_model.vector_size)] * (max_length - len(review_vector))
    embedded_reviews.append(review_vector)

embedded_reviews_array = np.array(embedded_reviews)
print("Embedded Reviews Shape:", embedded_reviews_array.shape)

Embedded Reviews Shape: (40708, 32, 64)


# Transformer


In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=32):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x * math.sqrt(self.d_model) + self.pe[:x.size(1), :]

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, v)

    def forward(self, q, k, v):
        bs = q.size(0)
        q = self.q_linear(q).view(bs, -1, self.num_heads,
                                  self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.num_heads,
                                  self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.num_heads,
                                  self.d_k).transpose(1, 2)
        scores = self.attention(q, k, v)
        concat = scores.transpose(1, 2).contiguous().view(
            bs, -1, self.num_heads * self.d_k)
        return self.out(concat)

In [16]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = nn.Sequential(nn.Linear(d_model, d_ff),
                                nn.ReLU(), nn.Linear(d_ff, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        attn_output = self.self_attn(x, x, x)
        x = self.norm1(x + self.dropout1(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout2(ff_output))

In [17]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, num_classes, d_ff, max_len, dropout):
        super(SentimentAnalysisModel, self).__init__()
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)
        return self.fc(x)

In [18]:
# Parameters
d_model = 64
num_heads = 4
num_layers = 2
num_classes = 2
d_ff = 128
max_len = max_length
dropout = 0.1

In [19]:
# Model, Criterion, Optimizer
model = SentimentAnalysisModel(
    d_model, num_heads, num_layers, num_classes, d_ff, max_len, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [20]:
device

device(type='cuda')

In [21]:
# Convert data to tensors and create DataLoader
sentiments_tensor = torch.tensor(sentiments, dtype=torch.long)
embedded_reviews_tensor = torch.tensor(
    embedded_reviews_array, dtype=torch.float32)

In [22]:
# Dataset and Split
dataset = TensorDataset(embedded_reviews_tensor, sentiments_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [23]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [24]:
num_epochs = 16

# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    # Training Phase
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() * inputs.size(0)

    # Validation Phase
    model.eval()
    total_val_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item() * inputs.size(0)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()

    train_loss = total_train_loss / len(train_loader.dataset)
    val_loss = total_val_loss / len(val_loader.dataset)
    val_accuracy = correct_predictions / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Model Evaluation on Test Set

predictions = []
actuals = []
model.eval()

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

f1 = f1_score(actuals, predictions)
precision = precision_score(actuals, predictions)
recall = recall_score(actuals, predictions)
conf_matrix = confusion_matrix(actuals, predictions)

In [None]:
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")

In [None]:
plt.figure(figsize=(3, 2))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Classification Report and Confusion Matrix
print("Classification Report:")
print(classification_report(actuals, predictions, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(actuals, predictions))

In [None]:
with open('src/models/transformer_model.pkl', 'wb') as f:
    pickle.dump(model, f)