Dataset Generation


In [18]:
import json
import csv

def process_jsonl_to_csv(input_file_path, output_csv_path):
    """
    Process WVQ_Arabic_150.jsonl to create CSV with concatenated inputs and outputs.
    
    Args:
        input_file_path (str): Path to the WVQ_Arabic_150.jsonl file
        output_csv_path (str): Path for the output CSV file
    """
    
    csv_data = []
    
    # Read the JSONL file line by line
    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            try:
                # Parse each line as JSON
                item = json.loads(line.strip())
                
                # Extract messages
                messages = item.get("messages", [])
                
                # Initialize content variables
                system_content = ""
                user_content = ""
                assistant_content = ""
                
                # Extract content from each message role
                for message in messages:
                    role = message.get("role", "")
                    content = message.get("content", "")
                    
                    if role == "system":
                        system_content = "System : " + content
                    elif role == "user":
                        user_content = "User : " + content
                    elif role == "assistant":
                        assistant_content = "Response : " + content
                
                # Concatenate system and user content with a space
                combined_input = system_content + " "  + user_content + " " + assistant_content
                
                # Add to CSV data
                csv_data.append({
                    "input": combined_input,
                    #"output": assistant_content
                })
                
            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_num}: {e}")
                continue
            except Exception as e:
                print(f"Error processing line {line_num}: {e}")
                continue
    
    # Write to CSV file
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['input']
        #fieldnames = ['input', 'output']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write header
        writer.writeheader()
        
        # Write data rows
        for row in csv_data:
            writer.writerow(row)
    
    print(f"Successfully processed {len(csv_data)} items")
    print(f"CSV file created: {output_csv_path}")
    
    return csv_data
# Usage:
csv_data = process_jsonl_to_csv("/home/devbox/test/Diversifying_uttr/Thesis/data/WVQ_Germany_150.jsonl", "/home/devbox/test/Diversifying_uttr/Thesis/data/WVQ_Germany_150.csv")


Successfully processed 300 items
CSV file created: /home/devbox/test/Diversifying_uttr/Thesis/data/WVQ_Germany_150.csv


In [19]:
import os
import pandas as pd

def stackcsv(content_folder):
    global combined_csv
    combined_csv= []
    entries = os.listdir(content_folder)
    for i in entries:
        csv_path = os.path.join(content_folder, i)
        solo_csv = pd.read_csv(csv_path,index_col=None,header = None)
        combined_csv.append(solo_csv)
    csv_final = pd.concat(combined_csv,axis = 0,sort = False)
    return csv_final.to_csv("Ger_Spa_Arb.csv", header = None)
stackcsv("/home/devbox/test/Diversifying_uttr/Thesis/data/")

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaTokenizer,
    LlamaConfig,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

ds = load_dataset("ritwik-ghosh/CulturalDataset", split="train")
ds


Ger_Spa_Arb_single_col.csv:   0%|          | 0.00/300k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/902 [00:00<?, ? examples/s]

Dataset({
    features: ['0', 'input'],
    num_rows: 902
})

In [21]:
import wandb

wandb.init(project="CulturalLLM_training")


In [None]:
import os, fire
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaTokenizer,
    LlamaConfig,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from accelerate import init_empty_weights,infer_auto_device_map,load_checkpoint_in_model,dispatch_model

# Model from Hugging Face hub or Model path
base_model = 'Qwen/Qwen2.5-0.5B'
new_model = 'Diversifying_uttr/Thesis/SavedTensors/'

SRC_COL      = "input_text"             # 2nd column in that dataset
TGT_COL      = "output_text"

def format_example(example):
    """
    Turn one row into a single training string.
    Many instruction datasets use an 'Instruction ➜ Response' pattern
    so we do similar here.
    """
    input_prompt = (
      "### Instruction:\n"
      f"{example['Instruction']}\n\n"
      f"### Response:\n"
      f"{example['Response']}")
    
    return {"text" : input_prompt}

#formatted_ds = ds.map(format_example)


def run(base_model, new_model, data_files=None):

    #dataset = load_dataset('ritwik-ghosh/CulturalAlignment', split='train')

    compute_dtype = getattr(torch, "float16")

    '''quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )'''

    max_memory = {i: '46000MB' for i in range(torch.cuda.device_count())}
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        #quantization_config=quant_config,
        # device_map={"": 0}
        device_map="auto",
        max_memory=max_memory
    )
    #model.quantization_config = quant_config
    model.config.use_cache = False

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    peft_params = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )
    training_params = SFTConfig(
        output_dir="/home/devbox/test/Diversifying_uttr/Thesis/data/results-finetune",
        num_train_epochs=6,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        optim="paged_adamw_32bit",
        save_steps=25,
        logging_steps=25,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=False,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        report_to="wandb",
        dataset_text_field="input"
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=ds,
        peft_config=peft_params,
        #formatting_func=format_example,
        #max_seq_length=None,
        #dataset_text_field="input",
        processing_class=tokenizer,
        args=training_params,
        #packing=False,
    )
    trainer.train()

    trainer.model.save_pretrained(new_model)
    trainer.tokenizer.save_pretrained(new_model)

def eval():
    prompt = "Who is Leonardo Da Vinci?"
    tokenizer = LlamaTokenizer.from_pretrained("")
    model = LlamaForCausalLM.from_pretrained("", device_map="auto")
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    print(result[0]['generated_text'])


if __name__ == '__main__':

    fire.Fire(run(base_model, new_model))  




You are using a model of type qwen2 to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


PermissionError: [Errno 13] Permission denied: '/data'