<a href="https://colab.research.google.com/github/sahar-mariam/kannada-sentiment-analysis/blob/main/IndicBERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Import required libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Read the labelled dataset into a DataFrame.
# The path points at the Colab runtime's /content directory; change it when
# the CSV lives somewhere else.
file_path = "/content/balanced_dataset.csv"
df = pd.read_csv(file_path)


Using device: cuda


In [26]:
# Show which sentiment labels the raw dataset contains before any clean-up.
print(
    "Unique sentiment labels in the dataset before preprocessing:",
    df["sentiment"].unique(),
    sep="\n",
)

# Normalisation table for the sentiment column: numeric codes and the
# alternative spelling of "negative" are all rewritten to one canonical
# Kannada label per class. Values not listed here are left unchanged by
# Series.replace.
label_mapping_raw = dict([
    (0, "ತಟಸ್ಥ"),              # neutral
    (1, "ಧನಾತ್ಮಕ"),            # positive
    (2, "ಋಣಾತ್ಮಕ"),            # negative
    ("ನಕಾರಾತ್ಮಕ", "ಋಣಾತ್ಮಕ"),  # alternative "negative" label -> canonical form
])

Unique sentiment labels in the dataset before preprocessing:
['ತಟಸ್ಥ' 'ಧನಾತ್ಮಕ' 'ಋಣಾತ್ಮಕ']


In [27]:
# Rewrite numeric codes and alternative spellings to the canonical Kannada labels.
df["sentiment"] = df["sentiment"].replace(label_mapping_raw)

# Print the distinct labels again to confirm the normalisation result.
print(
    "Unique sentiment labels in the dataset after preprocessing:",
    df["sentiment"].unique(),
    sep="\n",
)

# Canonical Kannada label -> English sentiment name
# (neutral, positive, negative, in that order).
sentiment_mapping = dict(zip(
    ("ತಟಸ್ಥ", "ಧನಾತ್ಮಕ", "ಋಣಾತ್ಮಕ"),
    ("neutral", "positive", "negative"),
))

Unique sentiment labels in the dataset after preprocessing:
['ತಟಸ್ಥ' 'ಧನಾತ್ಮಕ' 'ಋಣಾತ್ಮಕ']


In [28]:
# Build the class-index -> English-label mapping from the labels present in the
# dataset. Converting to a categorical yields the distinct labels in sorted
# order, so class index i corresponds to the i-th label in that order.
# NOTE(review): this index order must match the label encoding used when the
# classifier head was trained — confirm against the training notebook.
try:
    label_mapping = {
        idx: sentiment_mapping[label]
        for idx, label in enumerate(df['sentiment'].astype('category').cat.categories)
    }
except KeyError as e:
    # Chain the original KeyError so the traceback still shows the failing lookup.
    raise ValueError(f"Error: Sentiment label '{e.args[0]}' is not in sentiment_mapping. Please update the mapping.") from e

# Load the IndicBERT tokenizer and a sequence-classification model.
# NOTE: "ai4bharat/indic-bert" is the base pretrained checkpoint, not a
# fine-tuned classifier. Transformers therefore initialises the classification
# head with fresh weights (it emits a warning saying so), and predictions are
# not meaningful until the model is trained or `model_name` is pointed at a
# fine-tuned checkpoint.
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    # Store the human-readable class names in the model config as well.
    id2label=label_mapping,
    label2id={name: idx for idx, name in label_mapping.items()},
).to(device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Function to predict sentiment for user input
def predict_sentiment(user_input, model, tokenizer, label_mapping):
    """
    Predict the sentiment of a single Kannada text.

    Args:
        user_input (str): Kannada text to classify.
        model: Sequence-classification model; calling it with the tokenizer
            output must return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``model``; called with the text and
            expected to return an encoding that supports ``.to(device)``.
        label_mapping (dict): Mapping of class index (int) to sentiment name.

    Returns:
        tuple: ``(predicted_label, confidence_score)`` where
        ``predicted_label`` is the ``label_mapping`` entry for the
        highest-probability class and ``confidence_score`` is that class's
        softmax probability as a Python float.

    Raises:
        KeyError: If the predicted class index is missing from ``label_mapping``.
    """
    # Send the inputs to wherever the model's weights live, instead of relying
    # on a notebook-level `device` global that may not match the model passed in.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = None  # no parameters to inspect; leave inputs where they are

    # Tokenize the input (truncated to at most 128 tokens)
    inputs = tokenizer(user_input, truncation=True, padding=True, max_length=128, return_tensors="pt")
    if model_device is not None:
        inputs = inputs.to(model_device)

    # Run inference in eval mode so dropout is disabled, then restore the
    # caller's mode so a model that is mid-training is not left altered.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    # Class probabilities for the (single) input sequence
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

    # Take the argmax over the class dimension explicitly rather than over the
    # flattened tensor, so the index is always a valid class id.
    predicted_class = torch.argmax(probs, dim=-1).item()
    confidence_score = probs[predicted_class].item()

    # Map numeric label to sentiment name
    predicted_label = label_mapping[predicted_class]
    return predicted_label, confidence_score

In [31]:
# Prompt for a Kannada sentence on standard input.
print("Enter Kannada text for sentiment analysis:")
user_input = input()

# Classify the sentence; the result is a (label, probability) pair.
predicted_label, confidence_score = predict_sentiment(
    user_input,
    model,
    tokenizer,
    label_mapping,
)

# Report the predicted label and its probability, rounded to two decimals.
print(
    f"Predicted Sentiment: {predicted_label}",
    f"Confidence Score: {confidence_score:.2f}",
    sep="\n",
)

Enter Kannada text for sentiment analysis:
ನಾನು ಚೆನ್ನಾಗಿ ಮಾಡಲಿಲ್ಲ
Predicted Sentiment: negative
Confidence Score: 0.34
