In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense, Dropout

In [4]:
df = pd.read_csv('urdu-sentiment-corpus-v1.tsv', sep='\t', encoding='utf-8')
texts = df['Tweet'].astype(str).values
labels = df['Class'].astype(str).values

encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
target_names = ['P', 'N', 'O']
num_classes = len(np.unique(labels))
labels = to_categorical(labels, num_classes=num_classes)
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

In [None]:
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
rnn_model.add(SimpleRNN(64))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(num_classes, activation='softmax'))
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

rnn_history = rnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
rnn_y_pred = rnn_model.predict(X_test)
rnn_y_pred_classes = np.argmax(rnn_y_pred, axis=1)
rnn_y_true = np.argmax(y_test, axis=1)

rnn_report = classification_report(
    rnn_y_true,
    rnn_y_pred_classes,
    labels=[0, 1, 2],
    target_names=target_names,
    output_dict=True
)

rnn_df_report = pd.DataFrame(rnn_report).transpose()
print("\nPerformance Metrics (RNN):")
print(rnn_df_report[['precision', 'recall', 'f1-score']])

Epoch 1/5




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.4343 - loss: 1.1754 - val_accuracy: 0.4400 - val_loss: 0.8567
Epoch 2/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5543 - loss: 0.8808 - val_accuracy: 0.4333 - val_loss: 0.9112
Epoch 3/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.8404 - loss: 0.5912 - val_accuracy: 0.4800 - val_loss: 0.8636
Epoch 4/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9746 - loss: 0.2229 - val_accuracy: 0.4600 - val_loss: 1.0166
Epoch 5/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9824 - loss: 0.1028 - val_accuracy: 0.5133 - val_loss: 1.2543
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

Performance Metrics (RNN):
              precision    recall  f1-score
P              0.577236  0.541985  0.559055
N              0.00000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
gru_model.add(GRU(64))
gru_model.add(Dropout(0.5))
gru_model.add(Dense(num_classes, activation='softmax'))
gru_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

gru_history = gru_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
gru_y_pred = gru_model.predict(X_test)
gru_y_pred_classes = np.argmax(gru_y_pred, axis=1)
gru_y_true = np.argmax(y_test, axis=1)

gru_report = classification_report(
    gru_y_true,
    gru_y_pred_classes,
    labels=[0, 1, 2],
    target_names=target_names,
    output_dict=True
)

gru_df_report = pd.DataFrame(gru_report).transpose()
print("\nPerformance Metrics (GRU):")
print(gru_df_report[['precision', 'recall', 'f1-score']])

Epoch 1/5




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - accuracy: 0.4148 - loss: 1.3336 - val_accuracy: 0.5533 - val_loss: 1.0836
Epoch 2/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.5506 - loss: 0.9543 - val_accuracy: 0.4400 - val_loss: 0.7967
Epoch 3/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.5550 - loss: 0.7990 - val_accuracy: 0.4733 - val_loss: 0.7697
Epoch 4/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.6361 - loss: 0.6956 - val_accuracy: 0.4600 - val_loss: 0.7839
Epoch 5/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.7600 - loss: 0.5761 - val_accuracy: 0.5400 - val_loss: 0.7614
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Performance Metrics (GRU):
              precision    recall  f1-score
P              0.587786  0.587786  0.587786
N              0.00000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
lstm_model.add(LSTM(64))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(num_classes, activation='softmax'))
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

lstm_history = lstm_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
lstm_y_pred = lstm_model.predict(X_test)
lstm_y_pred_classes = np.argmax(lstm_y_pred, axis=1)
lstm_y_true = np.argmax(y_test, axis=1)

lstm_report = classification_report(
    lstm_y_true,
    lstm_y_pred_classes,
    labels=[0, 1, 2],
    target_names=target_names,
    output_dict=True
)

lstm_df_report = pd.DataFrame(lstm_report).transpose()
print("\nPerformance Metrics (LSTM):")
print(lstm_df_report[['precision', 'recall', 'f1-score']])

Epoch 1/5




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 0.4149 - loss: 1.3189 - val_accuracy: 0.4800 - val_loss: 0.8102
Epoch 2/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5178 - loss: 0.8545 - val_accuracy: 0.5200 - val_loss: 0.7601
Epoch 3/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5158 - loss: 0.8151 - val_accuracy: 0.5533 - val_loss: 0.7499
Epoch 4/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.6076 - loss: 0.7583 - val_accuracy: 0.5267 - val_loss: 0.7474
Epoch 5/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.8643 - loss: 0.4741 - val_accuracy: 0.6267 - val_loss: 0.7601
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Performance Metrics (LSTM):
              precision    recall  f1-score
P              0.614035  0.534351  0.571429
N              0.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
bilstm_model = Sequential()
bilstm_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
bilstm_model.add(Bidirectional(LSTM(64)))
bilstm_model.add(Dropout(0.5))
bilstm_model.add(Dense(num_classes, activation='softmax'))
bilstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

bilstm_history = bilstm_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
bilstm_y_pred = bilstm_model.predict(X_test)
bilstm_y_pred_classes = np.argmax(bilstm_y_pred, axis=1)
bilstm_y_true = np.argmax(y_test, axis=1)

bilstm_report = classification_report(
    bilstm_y_true,
    bilstm_y_pred_classes,
    labels=[0, 1, 2],
    target_names=target_names,
    output_dict=True
)

bilstm_df_report = pd.DataFrame(bilstm_report).transpose()
print("\nPerformance Metrics (BiLSTM):")
print(bilstm_df_report[['precision', 'recall', 'f1-score']])

Epoch 1/5




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 69ms/step - accuracy: 0.4618 - loss: 1.2049 - val_accuracy: 0.4400 - val_loss: 0.7852
Epoch 2/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.5199 - loss: 0.8077 - val_accuracy: 0.5467 - val_loss: 0.7609
Epoch 3/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.5097 - loss: 0.7972 - val_accuracy: 0.5667 - val_loss: 0.7573
Epoch 4/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - accuracy: 0.6640 - loss: 0.7047 - val_accuracy: 0.5267 - val_loss: 0.7549
Epoch 5/5
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.9006 - loss: 0.4335 - val_accuracy: 0.5733 - val_loss: 1.0041
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step

Performance Metrics (BiLSTM):
              precision    recall  f1-score
P              0.681818  0.229008  0.342857
N              0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

In [None]:
mbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
df = pd.read_csv("urdu-sentiment-corpus-v1.tsv", sep="\t")
df = df[df["Class"].isin(["P", "N", "O"])]
df = df.dropna(subset=["Tweet", "Class"])

label_map = {"P": 0, "N": 1, "O": 2}
df["Class"] = df["Class"].map(label_map)
texts = df["Tweet"].tolist()
labels = df["Class"].tolist()

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        if idx >= len(self.texts) or idx >= len(self.labels):
            raise IndexError(f"Index {idx} out of bounds for dataset of size {len(self.texts)}")
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

mbert_batch_size = 8
mbert_epochs = 3
mbert_dataset = TextDataset(texts, labels, tokenizer=mbert_tokenizer, max_len=max_len)
mbert_train_size = int(0.8 * len(mbert_dataset))
mbert_val_size = len(mbert_dataset) - mbert_train_size
train_dataset, val_dataset = torch.utils.data.random_split(mbert_dataset, [mbert_train_size, mbert_val_size])

train_loader = DataLoader(train_dataset, batch_size=mbert_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=mbert_batch_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mbert_model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_classes)
mbert_model = mbert_model.to(device)
mbert_optimizer = AdamW(mbert_model.parameters(), lr=2e-5)

for epoch in range(mbert_epochs):
    mbert_model.train()
    total_loss = 0
    for batch in train_loader:
        mbert_optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = mbert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        mbert_optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

mbert_model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = mbert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

mbert_report = classification_report(all_labels, all_preds, target_names=target_names, output_dict=True)
mbert_report_df = pd.DataFrame(mbert_report).transpose()
print(df[['precision', 'recall', 'f1-score']])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 81.4203


KeyboardInterrupt: 

In [None]:
tokenizer_xlm = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
max_len = 100
batch_size = 16
num_labels = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class XLMRDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

dataset = XLMRDataset(texts, labels, tokenizer_xlm, max_len)
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model_xlm = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)
model_xlm = model_xlm.to(device)
optimizer = AdamW(model_xlm.parameters(), lr=2e-5)

epochs = 3
for epoch in range(epochs):
    model_xlm.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_xlm(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

model_xlm.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_xlm(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

report = classification_report(all_labels, all_preds, target_names=target_names, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print("\nPerformance Metrics (XLM-RoBERTa):")
print(report_df[['precision', 'recall', 'f1-score']])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: index 964 is out of bounds for dimension 0 with size 8