<a href="https://colab.research.google.com/github/shahzadsiddiqi/BERT-LLM-tasks/blob/main/Urdu_text_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Inline Urdu review corpus. Each entry pairs a review with its sentiment
# label (1 = Positive, 0 = Negative) so text and label stay side by side.
_LABELLED_REVIEWS = [
    ("یہ چیز میرے توقعات پر پوری اتری۔", 1),
    ("یہ سب سے بری سروس ہے جو میں نے دیکھی۔", 0),
    ("کوالٹی بہت اعلیٰ تھی۔", 1),
    ("مصنوع بہت خراب تھی، مجھے پسند نہیں آئی۔", 0),
    ("فوری ڈیلیوری کی، میں متاثر ہوا۔", 1),
    ("ڈیلیوری وقت پر نہیں ہوئی۔", 0),
    ("قیمت بہت مناسب ہے۔", 1),
    ("پیکجنگ اچھی نہیں تھی۔", 0),
    ("یہ واقعی قابل تعریف پروڈکٹ ہے۔", 1),
    ("یہ خریداری میرے لئے وقت اور پیسہ کا ضیاع تھا۔", 0),
]

# Column-oriented view of the corpus: parallel "review" / "sentiment" lists.
data = {
    "review": [text for text, _ in _LABELLED_REVIEWS],
    "sentiment": [label for _, label in _LABELLED_REVIEWS],
}

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the
# split reproducible between runs.
_split = train_test_split(
    data["review"],
    data["sentiment"],
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = _split

# Wrap each split in a Hugging Face Dataset with "review"/"sentiment" columns.
train_dataset = Dataset.from_dict(dict(review=train_texts, sentiment=train_labels))
test_dataset = Dataset.from_dict(dict(review=test_texts, sentiment=test_labels))


# Checkpoint used for both the tokenizer and the classifier.
model_name = "bert-base-multilingual-cased"

# Tokenizer, plus a sequence-classification model configured for two labels.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Alternative checkpoint (left disabled by the author): an Urdu BERT model.
# model_name = "urduhack/bert-base-urdu-cased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Batch tokenizer used with Dataset.map
def tokenize_function(examples):
    """Tokenize the "review" entries of a batch with the module-level tokenizer.

    Every example is padded or truncated to exactly 128 tokens so that all
    rows have the same length.

    Args:
        examples: Batch mapping that contains a "review" entry holding the
            texts to encode.

    Returns:
        The tokenizer's output for the batch.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["review"], **encode_options)

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Trainer removes columns that the model's forward() does not accept and
# expects the targets under "labels". Left as "sentiment", the targets would
# be dropped, the model would return no loss, and training would fail.
train_dataset = train_dataset.rename_column("sentiment", "labels")
test_dataset = test_dataset.rename_column("sentiment", "labels")

# Format datasets for PyTorch: expose only the tensors the model consumes.
model_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=model_columns)
test_dataset.set_format(type="torch", columns=model_columns)

# Define training arguments
import inspect

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases; use whichever keyword the
# installed version accepts so the script works on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # Disable external experiment trackers: otherwise, when wandb is
    # installed, training stalls on an interactive API-key prompt.
    report_to="none",
    # Evaluate at the end of every epoch (matches save_strategy above).
    **{_eval_strategy_key: "epoch"},
)

# Build the Trainer from the model, the arguments and both dataset splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune, then score the held-out split and report the metrics.
trainer.train()
results = trainer.evaluate()
print("Evaluation Results:", results)

# Test the model with new sentences
test_sentences = [
    "یہ واقعی حیرت انگیز تھا۔",
    "مجھے یہ کبھی پسند نہیں آیا۔",
]

# Pick the device once so the inputs and the model always end up together.
device = "cuda" if torch.cuda.is_available() else "cpu"

test_inputs = tokenizer(
    test_sentences,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
test_inputs = {key: val.to(device) for key, val in test_inputs.items()}

model = model.to(device)
# Inference only: switch off dropout explicitly and skip autograd bookkeeping
# instead of relying on whatever mode the Trainer left the model in.
model.eval()
with torch.no_grad():
    outputs = model(**test_inputs)

# Index of the highest logit per sentence is the predicted class id.
predictions = torch.argmax(outputs.logits, dim=1)

# Print predictions
print("Predictions:", predictions.tolist())  # Convert Tensor to list for better readability


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: