In [None]:
%pip install datasets scikit-learn pandas

In [None]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from sklearn.model_selection import train_test_split
import pandas as pd
# Function to convert the first dataset format
def convert_dataset1(original_dataset, add_id_if_missing=False):
    """Convert conversation-list samples into ``id`` / ``messages`` rows.

    Every sample must provide a ``"conversations"`` sequence whose items have
    ``"from"`` and ``"value"`` keys. A turn whose ``"from"`` equals
    ``"human"`` becomes a ``"user"`` message; any other speaker is mapped to
    ``"assistant"``.

    Args:
        original_dataset: Iterable of dict-like samples.
        add_id_if_missing: When true, a sample lacking an ``"id"`` key is
            given ``"generated_id_<n>"``, where ``n`` is the sample's
            zero-based position in ``original_dataset``. When false, a
            missing ``"id"`` raises ``KeyError``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` and ``messages``, one
        row per input sample. The columns are present even for empty input.
    """
    converted_samples = []
    for idx, sample in enumerate(original_dataset):
        if add_id_if_missing:
            sample_id = sample.get("id", f"generated_id_{idx}")
        else:
            sample_id = sample["id"]
        new_messages = [
            {
                "content": turn["value"],
                "role": "user" if turn["from"] == "human" else "assistant",
            }
            for turn in sample["conversations"]
        ]
        converted_samples.append({"id": sample_id, "messages": new_messages})
    # Naming the columns keeps the schema stable when no samples were given;
    # without it an empty input produces a DataFrame with no columns at all.
    return pd.DataFrame(converted_samples, columns=["id", "messages"])

# Function to convert the second dataset format
def convert_dataset2(original_dataset):
    """Convert paired Hindi/Hinglish samples into ``id`` / ``messages`` rows.

    Every sample must provide the keys ``"id"``, ``"input"``, ``"output"``,
    ``"input_hinglish"`` and ``"output_hinglish"``. Each sample yields two
    single-turn conversations, in this order:

    1. ``input`` / ``output`` under the original id.
    2. ``input_hinglish`` / ``output_hinglish`` under ``"<id>_hinglish"``.

    Args:
        original_dataset: Iterable of dict-like samples.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` and ``messages``,
        holding two rows per input sample. The columns are present even for
        empty input.
    """
    converted_samples = []
    for sample in original_dataset:
        # (row id, key of the user text, key of the assistant text)
        variants = (
            (sample["id"], "input", "output"),
            (f"{sample['id']}_hinglish", "input_hinglish", "output_hinglish"),
        )
        for row_id, user_key, assistant_key in variants:
            converted_samples.append({
                "id": row_id,
                "messages": [
                    {"content": sample[user_key], "role": "user"},
                    {"content": sample[assistant_key], "role": "assistant"},
                ],
            })
    # Naming the columns keeps the schema stable when no samples were given.
    return pd.DataFrame(converted_samples, columns=["id", "messages"])

# Load datasets from Hugging Face
dataset1 = load_dataset("FreedomIntelligence/evol-instruct-hindi", split="train")
dataset2 = load_dataset("NebulaByte/alpaca-gpt4-hindi-hinglish", split="train")
# NOTE(review): dataset3 loads the same repository and split as dataset1, so
# each of those rows enters the merge twice and identical samples can end up on
# both sides of the train/test split. Confirm whether a different dataset was
# intended here.
dataset3 = load_dataset("FreedomIntelligence/evol-instruct-hindi", split="train")
dataset4 = load_dataset("smangrul/hindi_instruct_v1", split="train")
dataset5 = load_dataset("smangrul/hindi_instruct_v1", split="test")
dataset6 = load_dataset("SherryT997/HelpSteer-hindi", split="train")

# Convert datasets
converted_df1 = convert_dataset1(dataset1)
converted_df2 = convert_dataset2(dataset2)
converted_df3 = convert_dataset1(dataset3)
# NOTE(review): assumes dataset6 has a "conversations" column in the same
# from/value layout as dataset1 -- confirm against the dataset card.
converted_df6 = convert_dataset1(dataset6, add_id_if_missing=True)
# dataset4/dataset5 are merged without reshaping, but pd.concat only accepts
# pandas objects, so materialise their rows as DataFrames first.
# NOTE(review): presumably they already carry "id"/"messages" columns -- verify.
converted_df4 = pd.DataFrame(list(dataset4))
converted_df5 = pd.DataFrame(list(dataset5))

# Merge datasets
merged_df = pd.concat(
    [
        converted_df1,
        converted_df2,
        converted_df3,
        converted_df4,
        converted_df5,
        converted_df6,
    ],
    ignore_index=True,
)
merged_dataset = Dataset.from_pandas(merged_df)

train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Create Hugging Face datasets
# The split frames keep their shuffled original index; preserve_index=False
# stops it from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

dataset_dict = DatasetDict({
    "train_sft": train_dataset,
    "test_sft": test_dataset,
})

dataset_name = "BB_HindiHinglishV2"  # Replace with your desired dataset name
dataset_dict.push_to_hub(dataset_name)
