<a href="https://colab.research.google.com/github/subh-775/Academicia/blob/main/hinglish_finetuned_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `datasets` and `huggingface_hub` packages imported by the next cell.
# These are IPython shell magics; `-q` reduces pip's console output.
!pip install datasets
!pip install huggingface_hub -q

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
import wandb
from huggingface_hub import HfFolder, Repository

# Configuration
MODEL_NAME = "facebook/opt-350m"  # Base model to fine-tune (you can change this)
DATASET_NAME = "one-thing/chatbot_arena_conversations_hinglish"
OUTPUT_DIR = "./hinglish"  # Training output, final model and tokenizer are saved here
LORA_R = 8  # LoRA attention dimension
LORA_ALPHA = 16  # Alpha parameter for LoRA scaling
LORA_DROPOUT = 0.05
# Hugging Face token, read from the environment. `HF_TOKEN` is the variable
# name used elsewhere in this notebook; the older `HF_Tokens` name is still
# honoured as a fallback so existing setups keep working. None/empty means
# "no token", in which case push_to_hub() skips the upload.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HF_Tokens")
HF_MODEL_REPO = "Subh775/hinglish-finetuned-V2"  # Replace with your desired repo name

def setup_wandb():
    """Start a Weights & Biases run that records this fine-tuning setup."""
    # Hyperparameters attached to the run so separate experiments can be
    # told apart in the W&B dashboard.
    run_config = {
        "model": MODEL_NAME,
        "dataset": DATASET_NAME,
        "lora_r": LORA_R,
        "lora_alpha": LORA_ALPHA,
        "lora_dropout": LORA_DROPOUT,
    }
    wandb.init(project="hinglish-llm-finetuning", config=run_config)

In [3]:
def setup_tokenizer():
    """Load the tokenizer that belongs to ``MODEL_NAME``.

    Returns:
        The loaded tokenizer, guaranteed to have a padding token: when the
        checkpoint defines none, the end-of-sequence token is reused.
    """
    print(f"Loading tokenizer for model: {MODEL_NAME}")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Batched tokenization with padding needs a pad token; fall back to EOS.
    has_pad_token = tok.pad_token is not None
    if not has_pad_token:
        tok.pad_token = tok.eos_token

    return tok

def prepare_dataset(tokenizer, max_length=512, test_size=0.1, seed=42):
    """Load ``DATASET_NAME`` and tokenize it for causal-LM fine-tuning.

    Every row's ``user_hinglish`` / ``assistant_hinglish`` fields are rendered
    into one prompt string (fixed instruction header followed by a
    ``Human:`` / ``Assistant:`` exchange) and then tokenized.

    Args:
        tokenizer: Tokenizer called on batches of the rendered text.
        max_length: Sequences are truncated and padded to this many tokens.
        test_size: Fraction of the data held out for validation when the
            dataset does not provide its own "validation" split.
        seed: Seed for that held-out split, so the validation set (and thus
            the reported eval loss) is comparable between runs.

    Returns:
        A mapping from split name to tokenized dataset containing at least
        "train" and "validation". The original columns are dropped; only the
        tokenizer's outputs remain.

    Raises:
        ValueError: If the loaded dataset contains no splits at all.
    """
    print("Loading dataset...")
    dataset = load_dataset(DATASET_NAME)

    # Print dataset information
    print(f"Dataset structure: {dataset}")

    # Choose the split everything is derived from: "train" when present,
    # then "default", otherwise whatever split the dataset offers first.
    if "train" in dataset:
        source_split = "train"
    elif "default" in dataset:
        source_split = "default"
    else:
        source_split = next(iter(dataset), None)
        if source_split is None:
            raise ValueError(f"Dataset {DATASET_NAME!r} contains no splits")

    # Check one example to understand the structure
    print("Sample example:")
    print(dataset[source_split][0])

    # The dataset has 'user_hinglish' and 'assistant_hinglish' fields
    # Format the dataset for instruction fine-tuning
    def format_instruction(example):
        instruction = "Below is a conversation between a human and an AI assistant in Hinglish. The assistant is helpful, respectful, and honest."
        conversation = f"Human: {example['user_hinglish']}\nAssistant: {example['assistant_hinglish']}"
        return {"text": f"{instruction}\n\n{conversation}"}

    if source_split == "train" and "validation" in dataset:
        # The dataset ships its own validation data: keep every split as is.
        formatted_dataset = dataset.map(format_instruction)
    else:
        # Carve a validation set out of the single source split. The seed
        # makes the partition reproducible.
        formatted_source = dataset[source_split].map(format_instruction)
        splits = formatted_source.train_test_split(test_size=test_size, seed=seed)
        formatted_dataset = {
            "train": splits["train"],
            "validation": splits["test"],
        }

    # Define tokenizing function with the provided tokenizer
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    # Tokenize each split, dropping the raw text columns
    tokenized_dataset = {
        split: split_data.map(
            tokenize_function,
            batched=True,
            remove_columns=split_data.column_names,
        )
        for split, split_data in formatted_dataset.items()
    }

    print(f"Tokenized dataset: {tokenized_dataset}")
    return tokenized_dataset

def setup_model(tokenizer):
    """Load the base causal LM and wrap it with LoRA adapters.

    Args:
        tokenizer: Not used by this function; kept so existing callers that
            pass the tokenizer keep working.

    Returns:
        The PEFT-wrapped model in which only the LoRA adapter weights on the
        ``q_proj`` / ``v_proj`` modules are trainable.
    """
    print(f"Loading base model: {MODEL_NAME}")

    # Pick the weight precision from the available hardware:
    #   - no CUDA device        -> float32 (half precision is a GPU format)
    #   - compute capability 8+ -> bfloat16
    #   - any other CUDA device -> float16
    if not torch.cuda.is_available():
        model_dtype = torch.float32
    elif torch.cuda.get_device_capability()[0] >= 8:
        model_dtype = torch.bfloat16
    else:
        model_dtype = torch.float16

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=model_dtype,
        device_map="auto",
    )

    # LoRA configuration
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],  # Target attention modules
    )

    # Apply LoRA adapters and report how many parameters will be trained
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    return model

def train_model(model, tokenizer, tokenized_dataset):
    """Configure a ``Trainer``, run fine-tuning and save the result.

    Args:
        model: Model to train (as returned by ``setup_model``).
        tokenizer: Tokenizer used by the data collator; saved next to the model.
        tokenized_dataset: Mapping of split name to tokenized dataset. Must
            contain "train"; "validation" is optional.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    eval_dataset = tokenized_dataset.get("validation")

    # Mixed-precision mode mirrors the dtype choice made when the model is
    # loaded: bf16 on GPUs with compute capability 8+, fp16 on other GPUs,
    # and no mixed precision at all without CUDA.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8
    use_fp16 = has_cuda and not use_bf16

    # Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        # Periodic evaluation only makes sense when a validation split exists.
        eval_strategy="steps" if eval_dataset is not None else "no",
        eval_steps=500,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=100,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=10,
        logging_steps=100,
        save_steps=1000,
        save_total_limit=11,
        bf16=use_bf16,
        fp16=use_fp16,
        report_to="wandb",
        push_to_hub=False,  # Uploading is done separately after training
    )

    # Data collator for causal (non-masked) language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    # Save model and tokenizer
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    return trainer

def push_to_hub(trainer, tokenizer):
    """Upload the fine-tuned model and tokenizer to the Hugging Face Hub.

    The final model and tokenizer are written to ``OUTPUT_DIR`` and that
    folder is uploaded to ``HF_MODEL_REPO``. The repository is created first
    if it does not exist yet. Nothing is uploaded when ``HF_TOKEN`` is unset.

    Args:
        trainer: Trainer holding the fine-tuned model.
        tokenizer: Tokenizer to publish alongside the model.
    """
    if not HF_TOKEN:
        print("Warning: No HF_TOKEN found. Skipping push to Hub.")
        return

    # Imported here so this function does not depend on the file's import cell.
    from huggingface_hub import HfApi

    print(f"Pushing model to HF Hub: {HF_MODEL_REPO}")

    # Make sure the folder holds the final weights and tokenizer files.
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    api = HfApi(token=HF_TOKEN)

    # Create model repo if it doesn't exist
    api.create_repo(repo_id=HF_MODEL_REPO, repo_type="model", exist_ok=True)

    # Upload the output folder directly. Cloning the repo into OUTPUT_DIR is
    # not an option because that folder already contains training output.
    # Intermediate checkpoint-* directories are left out of the upload.
    api.upload_folder(
        folder_path=OUTPUT_DIR,
        repo_id=HF_MODEL_REPO,
        repo_type="model",
        ignore_patterns=["checkpoint-*"],
    )

    print(f"Model and tokenizer successfully pushed to {HF_MODEL_REPO}")

def main():
    """Run the whole pipeline: tracking, data, model, training, upload.

    The Weights & Biases run opened at the start is always closed, including
    when a later step raises; in that case the run is marked as failed and
    the original exception propagates to the caller.
    """
    # Set up Weights & Biases for tracking
    setup_wandb()

    try:
        # Initialize tokenizer first
        tokenizer = setup_tokenizer()

        # Then prepare the dataset with the initialized tokenizer
        tokenized_dataset = prepare_dataset(tokenizer)

        # Set up model
        model = setup_model(tokenizer)

        # Train model
        trainer = train_model(model, tokenizer, tokenized_dataset)

        # Push to Hub
        push_to_hub(trainer, tokenizer)
    except BaseException:
        # Also covers KeyboardInterrupt: close the run with a non-zero exit
        # code so it is not left open or reported as successful.
        wandb.finish(exit_code=1)
        raise
    else:
        # Close Weights & Biases
        wandb.finish()


if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msubh_775[0m ([33msubh_775-com[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading tokenizer for model: facebook/opt-350m


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Loading dataset...


README.md:   0%|          | 0.00/463 [00:00<?, ?B/s]

data.csv:   0%|          | 0.00/24.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['User_english', 'user_hinglish', 'assistant_english', 'assistant_hinglish'],
        num_rows: 11332
    })
})
Sample example:
{'User_english': 'What is the difference between OpenCL and CUDA?', 'user_hinglish': 'OpenCL aur CUDA mein kya anter hai', 'assistant_english': 'OpenCL and CUDA are both programming languages for parallel computing on GPUs, but they differ in several key ways:\n\n1.   Architecture: OpenCL is a general-purpose parallel computing language, while CUDA is designed specifically for GPU computing. OpenCL can run on various types of processors, including CPUs, GPUs, and FPGAs, whereas CUDA is only compatible with NVIDIA GPUs.\n2.   Language: OpenCL is a C++ language, while CUDA is a C++ language with additional features for GPU computing. CUDA provides additional libraries and tools for handling graphics and other tasks unique to GPUs.\n3.   Driver support: OpenCL has a wider range of device supp

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/10198 [00:00<?, ? examples/s]

Map:   0%|          | 0/1134 [00:00<?, ? examples/s]

Tokenized dataset: {'train': Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 10198
}), 'validation': Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1134
})}
Loading base model: facebook/opt-350m


pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 786,432 || all params: 331,982,848 || trainable%: 0.2369
Starting training...




Step,Training Loss,Validation Loss
500,2.8468,2.71917
1000,2.7663,2.651658
1500,2.7319,2.610802
2000,2.6871,2.581876


Step,Training Loss,Validation Loss
500,2.8468,2.71917
1000,2.7663,2.651658
1500,2.7319,2.610802
2000,2.6871,2.581876
2500,2.6689,2.560869
3000,2.66,2.547307
3500,2.6444,2.535878
4000,2.6425,2.528587
4500,2.6154,2.521893
5000,2.6201,2.517677




0,1
eval/loss,█▆▄▃▃▂▂▂▁▁▁▁
eval/runtime,▂▄█▁▅▃▅▃▁▂▆▇
eval/samples_per_second,▇▅▁█▄▆▄▆█▇▃▂
eval/steps_per_second,▇▅▁█▄▆▄▆█▇▃▃
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▁▂▁▁▂▂▃▃▄▂▃▄▄▄▄▄▄▆▅▅▅▅▅▅█▅▅▄▅▄▅▅▅▅▆▆▅▆▅▇
train/learning_rate,█████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
train/loss,█▆▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▁▂▁▂▂▂▁▁▁▁▁▁▂▁▁

0,1
eval/loss,2.51245
eval/runtime,33.8787
eval/samples_per_second,33.472
eval/steps_per_second,8.383
total_flos,9.514298149699584e+16
train/epoch,9.9851
train/global_step,6370.0
train/grad_norm,1.04176
train/learning_rate,0.0
train/loss,2.5857


In [11]:
# push to HF_Hub
import os
from huggingface_hub import HfApi

# Read the token from the environment instead of writing it into the notebook:
# a literal here would both leak the credential and overwrite a valid
# HF_TOKEN that is already set.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set; export your Hugging Face token before uploading."
    )

# Initialize API with token
api = HfApi(token=hf_token)

# Upload the local output folder to the model repo
api.upload_folder(
    folder_path="/content/hinglish",                 # Local folder to upload
    repo_id="Subh775/hinglish-finetuned-V2",             # Your model repo ID
    repo_type="model",
    # path_in_repo is left unset so the files land at the repo root; if used,
    # it must be a path relative to the repo root, not a URL.
)

CommitInfo(commit_url='https://huggingface.co/Subh775/hinglish-finetuned-V2/commit/39aae780e99bd6b0f5a519a3df60c66bb969a4f2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='39aae780e99bd6b0f5a519a3df60c66bb969a4f2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Subh775/hinglish-finetuned-V2', endpoint='https://huggingface.co', repo_type='model', repo_id='Subh775/hinglish-finetuned-V2'), pr_revision=None, pr_num=None)