<a href="https://colab.research.google.com/github/prajwolsubedi/ClzProject/blob/main/BalancedFinalTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate torch


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from datasets import Dataset
import evaluate

# Load and clean the dataset
data = pd.read_csv('/content/sample_data/balanced_data.csv')  # Replace with your dataset path
# Drop rows missing either the sentence or its label: an unlabeled row would
# otherwise survive as a NaN label and force the whole label column to float.
data_clean = data.dropna(subset=['Sentences', 'Sentiment']).reset_index(drop=True)

# Map the raw sentiment codes (-1 / 0 / 1) to the class ids 0, 1, 2
label_map = {-1.0: 0, 0.0: 1, 1.0: 2}
mapped_labels = data_clean['Sentiment'].map(label_map)

# Series.map() yields NaN for any value that is not a key of label_map.
# Fail here with the offending values instead of letting NaN labels reach the
# stratified split or the classification loss.
unmapped = data_clean.loc[mapped_labels.isna(), 'Sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected Sentiment values (expected -1, 0 or 1): {unmapped.tolist()}"
    )
# Integer class ids are what a single-label classification head expects.
data_clean['Sentiment'] = mapped_labels.astype(int)

# Function for stratified split
def stratified_split(data, label_col, test_size, val_size, random_state=42):
    """Split ``data`` into train / validation / test parts, stratified on ``label_col``.

    ``test_size`` and ``val_size`` are fractions of the full dataset. The data
    is first divided into a training part and a combined hold-out part, and
    the hold-out is then divided again into validation and test. Both steps
    stratify on ``label_col`` and use the same ``random_state``.

    Returns:
        A ``(train, val, test)`` tuple.
    """
    holdout_fraction = test_size + val_size
    train_part, holdout = train_test_split(
        data,
        test_size=holdout_fraction,
        stratify=data[label_col],
        random_state=random_state,
    )

    # Share of the hold-out (not of the full dataset) that becomes the test set.
    test_share = test_size / holdout_fraction
    val_part, test_part = train_test_split(
        holdout,
        test_size=test_share,
        stratify=holdout[label_col],
        random_state=random_state,
    )
    return train_part, val_part, test_part

# Split data: 10% test, 10% validation, the remaining 80% for training
train_data, val_data, test_data = stratified_split(
    data_clean,
    label_col='Sentiment',
    test_size=0.1,
    val_size=0.1,
)

# Prepare datasets for Hugging Face
def prepare_dataset(dataframe):
    """Convert a pandas frame into a Hugging Face ``Dataset`` with ``text`` / ``label`` columns.

    Only the ``Sentences`` and ``Sentiment`` columns are kept; they are renamed
    to ``text`` and ``label`` respectively.

    Args:
        dataframe: pandas DataFrame containing ``Sentences`` and ``Sentiment``.

    Returns:
        A ``Dataset`` holding exactly the two columns ``text`` and ``label``.
    """
    renamed = dataframe[['Sentences', 'Sentiment']].rename(
        columns={'Sentences': 'text', 'Sentiment': 'label'}
    )
    # A frame that has been shuffled/split no longer has a default RangeIndex,
    # and from_pandas() would store that index as an extra
    # '__index_level_0__' column. preserve_index=False keeps it out.
    return Dataset.from_pandas(renamed, preserve_index=False)

# Convert the three pandas splits, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    prepare_dataset(frame) for frame in (train_data, val_data, test_data)
)

# Initialize tokenizer and model
model_name = "xlm-roberta-base"  # or "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=3 matches the three class ids (0, 1, 2) produced by label_map
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# Tokenize datasets
def tokenize_function(examples):
    """Tokenize the ``text`` entries of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 128 tokens and padded up to that same
    length (``padding="max_length"``).
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize every split in batches (train, then validation, then test)
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Set format for PyTorch
# Only these three columns are returned (as torch tensors) from here on;
# other columns such as 'text' are no longer returned when indexing.
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'label'],
    )

# Define the accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for the ``(logits, labels)`` pair in ``eval_pred``.

    The predicted class of each example is the index of its largest logit
    along the last axis. The result is whatever ``accuracy_metric.compute``
    returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(
        references=references,
        predictions=predicted_ids,
    )

# Define training arguments
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # NOTE(review): no `metric_for_best_model` is set, so "best" presumably
    # means lowest evaluation loss rather than highest accuracy - confirm.
    load_best_model_at_end=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Initialize Trainer: train on the training split, validate on the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test split (not used during training)
print("Evaluation on Test Set")
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

# Display category-wise accuracy
def category_accuracy(dataset, model, tokenizer, category_label, batch_size=32, max_length=128):
    """Return the model's accuracy on the examples of ``dataset`` labelled ``category_label``.

    Args:
        dataset: Hugging Face ``Dataset`` with a ``text`` and a ``label`` column.
            It may already have had ``set_format`` applied with a restricted
            column list; the hidden ``text`` column is still found.
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: tokenizer matching ``model``.
        category_label: integer class id to evaluate.
        batch_size: number of sentences sent through the model at once.
        max_length: truncation length; the default matches the 128 tokens
            used by ``tokenize_function`` for training.

    Returns:
        The fraction of the selected examples that the model assigns to
        ``category_label``, or ``nan`` when no example carries that label.
    """
    # set_format(columns=[...]) hides every column that is not listed, so
    # reading item['text'] on such a dataset raises KeyError. with_format()
    # with no arguments returns a new view exposing all columns as plain
    # Python values; the caller's dataset keeps its own format.
    plain = dataset.with_format()
    filtered = plain.filter(lambda x: x['label'] == category_label)

    total = len(filtered)
    if total == 0:
        # No example of this class: avoid dividing by zero.
        return float("nan")

    texts = list(filtered['text'])
    # The model may live on an accelerator (e.g. after training), so the
    # tokenized inputs have to be moved to the same device.
    device = model.device
    model.eval()

    correct = 0
    for start in range(0, total, batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted = torch.argmax(logits, dim=-1)
        # Every selected example has the true label `category_label`.
        correct += int((predicted == category_label).sum().item())
    return correct / total

print("Category-Wise Accuracy:")
# Class ids in the same order as label_map: 0 = Negative, 1 = Neutral, 2 = Positive
for label, category in {0: "Negative", 1: "Neutral", 2: "Positive"}.items():
    acc = category_accuracy(test_dataset, model, tokenizer, category_label=label)
    print(f"{category}: {acc:.2%}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/17996 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7462,0.697273,0.701333
2,0.6451,0.674643,0.731111
3,0.5435,0.735594,0.728889


Evaluation on Test Set


{'eval_loss': 0.6780783534049988, 'eval_accuracy': 0.7404444444444445, 'eval_runtime': 14.3617, 'eval_samples_per_second': 156.667, 'eval_steps_per_second': 19.636, 'epoch': 3.0}
Category-Wise Accuracy:


Filter:   0%|          | 0/2250 [00:00<?, ? examples/s]

KeyError: 'text'

In [None]:
# Save the model and tokenizer
# Both go into the same folder so that a single path can later be handed to
# AutoTokenizer.from_pretrained() and AutoModelForSequenceClassification.from_pretrained().
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

# Compress the saved model folder into a zip file
# (IPython shell escape: runs the system `zip` recursively over the folder.)
!zip -r saved_model.zip ./saved_model


  adding: saved_model/ (stored 0%)
  adding: saved_model/config.json (deflated 52%)
  adding: saved_model/tokenizer.json (deflated 76%)
  adding: saved_model/model.safetensors (deflated 29%)
  adding: saved_model/tokenizer_config.json (deflated 77%)
  adding: saved_model/sentencepiece.bpe.model (deflated 49%)
  adding: saved_model/special_tokens_map.json (deflated 52%)


In [None]:
# Colab-only module; this cell will not import outside Google Colab.
from google.colab import files
# Hand the zipped model to the browser - presumably starts a download of
# saved_model.zip to the local machine; the archive must already exist.
files.download('saved_model.zip')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
import os

model_name = 'saved_model'  # Replace with the path to your saved model
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
except OSError as err:
    # When the folder does not exist, from_pretrained() reports the string as
    # an invalid Hub model identifier, which hides the real cause. Re-raise
    # with an actionable message; FileNotFoundError is still an OSError, so
    # existing handlers keep working. Any other OSError is passed through.
    if not os.path.isdir(model_name):
        raise FileNotFoundError(
            f"Model folder '{model_name}' was not found in {os.getcwd()}. "
            "Run the save cell in this session, or upload and unzip "
            "saved_model.zip before loading."
        ) from err
    raise

# Sentences to test the model: five hand-written Nepali examples per expected
# sentiment, keyed by that sentiment (English glosses in the trailing comments)
test_sentences = dict(
    Positive=[
        "यो मेरो जीवनको सबैभन्दा राम्रो दिन हो।",  # This is the best day of my life.
        "म तपाईलाई धेरै माया गर्छु।",  # I love you very much.
        "यहाँको मौसम आज अति राम्रो छ।",  # The weather here is very good today.
        "म आज निकै खुशी छु।",  # I am very happy today.
        "यो फिल्म अत्यन्त रमाइलो थियो।",  # This movie was extremely fun.
    ],
    Negative=[
        "म आज धेरै दुःखी छु।",  # I am very sad today.
        "यहाँको सेवा एकदम नराम्रो छ।",  # The service here is very bad.
        "यो पुस्तक अत्यन्त बोरिंग थियो।",  # This book was extremely boring.
        "म कसम खाएर यो कुरा नगर्ने छु।",  # I swear I won't do this.
        "मलाई यो समस्या निकै तनावपूर्ण लाग्छ।",  # This issue feels very stressful to me.
    ],
    Neutral=[
        "मैले आज बिहान १० बजे उठें।",  # I woke up at 10 AM today.
        "मेरो कक्षा सोमबार देखि शुक्रबार सम्म हुन्छ।",  # My class is from Monday to Friday.
        "मेरो नाम रामु हो।",  # My name is Ramu.
        "हामी एकै ठाउँमा बसिरहेका छौँ।",  # We are sitting in the same place.
        "म काठमाडौमा बस्छु।",  # I live in Kathmandu.
    ],
)

# Function to predict sentiment
def predict_sentiment(sentences):
    """Classify ``sentences`` with the module-level ``tokenizer`` and ``model``.

    The sentences are tokenized together as one padded batch (truncated to
    128 tokens), run through the model without gradient tracking, and the
    most probable class of each is translated to its name.

    Returns:
        A list with one of "Negative", "Neutral" or "Positive" per input
        sentence, in input order.
    """
    # Class ids used by the model: 0 = Negative, 1 = Neutral, 2 = Positive
    id_to_sentiment = {0: "Negative", 1: "Neutral", 2: "Positive"}

    encoded = tokenizer(
        sentences,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**encoded)

    # Softmax over the class axis, then pick the most probable class per sentence
    class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    class_ids = torch.argmax(class_probs, dim=-1)
    return [id_to_sentiment[class_id] for class_id in class_ids.tolist()]

# Test the model on each category of sentences, one batch per expected sentiment
for sentiment, sentences in test_sentences.items():
    print(f"Predictions for {sentiment} Sentences:")
    predictions = predict_sentiment(sentences)
    # Pair every sentence with the label predicted at the same position
    for position, sentence in enumerate(sentences):
        prediction = predictions[position]
        print(f"Sentence: {sentence} -> Predicted Sentiment: {prediction}")
    print("\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: saved_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`