In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install Unsloth; later cells import FastLanguageModel and the chat-template helpers from it.
!pip install unsloth

In [None]:
# Replace the installed PyTorch stack with builds from the CUDA 12.1 wheel index.
# pip3-autoremove is installed first because it supplies the `pip-autoremove` command used next.
!pip install pip3-autoremove
# Remove torch, torchvision and torchaudio without an interactive prompt (-y).
!pip-autoremove torch torchvision torchaudio -y
# Reinstall them, together with xformers, from the cu121 index.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121

In [None]:
from unsloth import FastLanguageModel
import torch

In [None]:
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments


In [None]:
import gc
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer

In [None]:
# Report GPU visibility before any model is loaded.
# torch.cuda.current_device() and get_device_name() raise when no CUDA device is
# usable, so they are only queried once availability has been confirmed; on a
# CPU-only session the cell now prints a clear message instead of a traceback.
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", cuda_available)
print("Number of GPUs: ", torch.cuda.device_count())
if cuda_available:
    current_device = torch.cuda.current_device()
    print("Current device: ", current_device)
    print("Device name: ", torch.cuda.get_device_name(current_device))
else:
    print("Current device:  none (CUDA is not available; enable a GPU accelerator before training)")


In [None]:
# Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache,
# before the model is loaded in the cells below.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Kaggle input directory of the base checkpoint; passed to FastLanguageModel.from_pretrained.
model_path = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"
# Sequence-length limit forwarded to both the model loader and SFTTrainer.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# Forwarded as `dtype` to the loader; presumably None lets Unsloth choose the precision — TODO confirm.
dtype = None

In [None]:
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Language code -> language name.
# NOTE(review): `map_dict` is assigned again in the data-formatting cell further
# down, and that later table omits "bd" (Bodo) and adds "sa" (Sanskrit). No cell
# visible here reads this first version before it is replaced — confirm which
# set of codes the dataset actually uses.
map_dict = {
    "as": "Assamese",
    "bd": "Bodo",
    "bn": "Bengali",
    "gu": "Gujarati",
    "hi": "Hindi",
    "kn": "Kannada",
    "ml": "Malayalam",
    "mr": "Marathi",
    "or": "Odia",
    "pa": "Punjabi",
    "ta": "Tamil",
    "te": "Telugu",
    "ur": "Urdu"
}

In [None]:
# Load the base model and its tokenizer through Unsloth, using the path,
# sequence length, dtype and 4-bit flag defined in the configuration cells above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
# CSV locations inside the multi-lingual-sentiment-analysis Kaggle input directory.
file_path = "/kaggle/input/multi-lingual-sentiment-analysis/train.csv"  # training rows
test_file_path = "/kaggle/input/multi-lingual-sentiment-analysis/test.csv"  # rows to predict

In [None]:
from datasets import load_dataset

# Read the training CSV at `file_path` as a single "train" split.
dataset = load_dataset("csv", data_files=file_path,split="train")

# Print the dataset summary as a quick sanity check.
print(dataset)

In [None]:
from datasets import load_dataset

# Load the training split from the local CSV file. The path comes from the
# shared `file_path` constant (defined with the other data paths) rather than a
# second hard-coded copy, so the two loading cells cannot drift apart.
dataset = load_dataset("csv", data_files=file_path, split="train")

# Language code -> full language name. Codes missing from this table are passed
# through unchanged by format_example.
map_dict = {
    "hi": "Hindi", "bn": "Bengali", "mr": "Marathi", "te": "Telugu",
    "ta": "Tamil", "gu": "Gujarati", "ur": "Urdu", "kn": "Kannada",
    "or": "Odia", "ml": "Malayalam", "pa": "Punjabi", "as": "Assamese",
    "sa": "Sanskrit"
}

# System prompt shared by every training conversation. It is built once here
# rather than on every format_example call.
# The trailing "Example" section previously relied on implicit string
# concatenation with no separators, which produced
# "Example:Input Text: ...Response: PositiveLanguages: ..."; explicit newlines
# now keep the example and the language list on their own lines.
SYSTEM_PROMPT = (
    "You are a highly accurate multilingual sentiment analysis expert specializing in 13 Indian languages. "
    "Your task is to classify the sentiment expressed in the given text as either 'Positive' or 'Negative'.\n\n"
    "Instructions:\n\n"
    "1. Strict Output Format: You MUST respond with only ONE word: 'Positive' or 'Negative'. Do not include any other text, explanations, or apologies.\n"
    "2. Linguistic Awareness: Deeply consider the context, tone, and linguistic nuances of the specific Indian language. "
    "Be aware of idioms, cultural expressions, and regional variations that influence sentiment.\n"
    "3. Subtlety and Nuance: Pay close attention to subtle cues that might indicate sentiment, including word choice, phrasing, and implied meaning.\n"
    "4. Consistency and Objectivity: Maintain consistent criteria for sentiment classification across all languages, avoiding personal biases or subjective interpretations.\n"
    "5. Error Handling: If the text is neutral, ambiguous, or devoid of sentiment, classify it based on your best judgment, erring on the side of 'Negative' if uncertain.\n"
    "6. Response Under All Circumstances: ALWAYS return either 'Positive' or 'Negative'. Never return an empty string or failure state.\n\n"
    "Example:\n"
    "Input Text: यह फिल्म बहुत अच्छी है।\n"
    "Response: Positive\n\n"
    "Languages: Hindi, Bengali, Marathi, Telugu, Tamil, Gujarati, Urdu, Kannada, Odia, Malayalam, Punjabi, Assamese, Sanskrit."
)


def format_example(example):
    """Convert one dataset row into a three-turn conversation record.

    Args:
        example: Mapping with "language", "sentence" and "label" entries.

    Returns:
        A dict with:
          * "conversations": a list of {"from", "value"} turns — the system
            prompt, the sentence as the human turn, and the label as the gpt
            turn.
          * "language": the full language name from ``map_dict``, or the
            original code when it is not in the table.

    Raises:
        KeyError: if ``example`` lacks any of the three expected keys.
    """
    language_code = example["language"]
    return {
        "conversations": [
            {"from": "system", "value": SYSTEM_PROMPT},
            {"from": "human", "value": example["sentence"]},
            {"from": "gpt", "value": example["label"]},
        ],
        # Kept alongside the conversation so the language stays available downstream.
        "language": map_dict.get(language_code, language_code),
    }

# Convert every row with format_example, dropping all of the original CSV
# columns so only the fields it returns remain.
source_columns = dataset.column_names
formatted_dataset = dataset.map(format_example, remove_columns=source_columns)

# Sanity check: show the first converted record.
first_record = formatted_dataset[0]
print(first_record)


In [None]:
def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: Batched mapping whose "conversations" entry is a list of
            conversations.

    Returns:
        ``{"text": [...]}`` with one rendered, untokenized string per
        conversation. No generation prompt is appended.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}

In [None]:
from unsloth.chat_templates import standardize_sharegpt
# Standardize the ShareGPT-style turns, then add a "text" column holding each
# conversation rendered through the tokenizer's chat template.
formatted_dataset = standardize_sharegpt(formatted_dataset).map(
    formatting_prompts_func,
    batched = True,
)

In [None]:
# Spot-check the rendered chat-template text of one sample (index 5).
formatted_dataset[5]['text']

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # Modules that receive adapters (attention and MLP projection layers by name).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,  # no dropout on the adapter path
    bias = "none", 
    
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,  # same seed value as TrainingArguments(seed=...) in the trainer cell
    use_rslora = False,
    loftq_config = None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Packing is disabled; enabling it can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,  # 4 x 4 = 16 sequences per optimizer step per device
        warmup_steps = 5,
        #num_train_epochs = 2, # Set this for 1 full training run.
        # NOTE(review): with max_steps = 60 training stops after 60 steps (about
        # 960 sequences at the batch settings above) — presumably far less than
        # one pass over the data; confirm this short run is intended.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that, per the helper's name, only the assistant
# responses are trained on. The two strings are the Llama 3 header markers that
# open a user turn and an assistant turn in the rendered text.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# Decode one tokenized training sample (index 5) back to text for a visual check.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

In [None]:
# Run fine-tuning; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach Unsloth's "llama-3.1" chat template to the tokenizer for inference.
# NOTE(review): the training text was rendered before this call, with the
# tokenizer's original template — confirm both templates produce the same prompt layout.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

In [None]:
# Predicted labels, one per test sentence, filled in by the inference loop.
labels = []
# Read the sentences to classify from the shared `test_file_path` constant
# (defined with the other data paths) instead of a second hard-coded copy of the path.
sentences = pd.read_csv(test_file_path)['sentence'].tolist()

In [None]:
import re

In [None]:
# Start from an empty list every time this cell runs. Without the reset,
# re-running the cell would append a second set of predictions and the later
# `submission['label'] = labels` assignment would fail on a length mismatch.
labels = []

# First sentiment word in a response, matched case-insensitively so stray
# punctuation or extra text around it (e.g. "Positive.") is dropped.
_LABEL_PATTERN = re.compile(r'\b(positive|negative)\b', re.IGNORECASE)

# NOTE(review): this system prompt differs from the one used to build the
# training conversations — confirm the mismatch is intentional.
for sen in sentences:
    messages = [
        {"role": "system", "content": "You are a multilingual sentiment analysis assistant trained for 13 Indian languages. Your role is to accurately classify sentiment as Positive or Negative based on textual input."},
        {"role": "user", "content": sen},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=0.1, top_p=0.9)
    output_text = tokenizer.decode(outputs[0])
    print(output_text)

    # generate() returns the prompt followed by the completion. Slicing off the
    # prompt tokens isolates the reply even when generation stops at
    # max_new_tokens without emitting <|eot_id|>, a case the previous
    # header-to-eot regex reported as "No response found.".
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()

    label_match = _LABEL_PATTERN.search(response)
    if label_match:
        # Keep the word exactly as the model wrote it, minus surrounding noise.
        labels.append(label_match.group(1))
    elif response:
        # No recognizable label: keep the raw reply, as the original cell did.
        labels.append(response)
    else:
        print("No response found for sentence:", sen)
        labels.append("No response found.")

In [None]:
# Use the provided sample submission as the template for the output file.
submission = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv")

In [None]:
# Overwrite the template's label column with the predictions, in order.
# `labels` must contain exactly one entry per submission row.
submission['label'] = labels

In [None]:
# Display the filled-in submission for a visual check.
submission

In [None]:
# Write the submission file to the current directory, without the index column.
submission.to_csv("submission.csv",index=False)