In [16]:
import pandas as pd
import json
import glob

# --- CONFIGURATION ---
# Update this path to where your CSV files are stored
csv_files = [
    "csv files/menu_Sheet(speaker_02).csv",
    "csv files/menu_Sheet(speaker_03).csv",
    "csv files/menu_Sheet(speaker_04).csv",
    "csv files/menu_Sheet(speaker_09).csv",
    # add remaining CSV files here if you have more
]

output_file = "csv files/combined_dataset.jsonl"  # output jsonl file

# --- STEP 1: READ AND COMBINE ALL CSV FILES ---
df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# --- STEP 2: FILTER NECESSARY COLUMNS ---
filtered_df = combined_df[["Sentence ID", "Sentence"]].copy()

# --- STEP 3: CONVERT TO JSONL FORMAT ---
with open(output_file, "w", encoding="utf-8") as f:
    for _, row in filtered_df.iterrows():
        json_line = {
            "audio_filepath": f"data/{row['Sentence ID']}.wav",
            "human_transcript": row["Sentence"]
        }
        f.write(json.dumps(json_line, ensure_ascii=False) + "\n")

print(f"✅ JSONL file created successfully: {output_file}")


✅ JSONL file created successfully: csv files/combined_dataset.jsonl


In [17]:
# import json

# jsonl_file = "combined_dataset.jsonl"  # change if file is in another path

# with open(jsonl_file, "r", encoding="utf-8") as f:
#     for line in f:
#         data = json.loads(line)
#         print(data)
import json
import random

jsonl_file = "csv files/combined_dataset.jsonl"  # change if file is in another path

# Read all lines into memory
with open(jsonl_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Choose 20 random samples (or less if file has fewer lines)
sample_size = min(20, len(data))
random_samples = random.sample(data, sample_size)

# Pretty-print results
for sample in random_samples:
    print(json.dumps(sample, indent=2, ensure_ascii=False))


{
  "audio_filepath": "data/SENT_0909.wav",
  "human_transcript": "two Carrot Halwa, four Phirni, five Arrabbiata Pasta, one Beetroot Halwa, three Mango Mousse Pastry"
}
{
  "audio_filepath": "data/SENT_1852.wav",
  "human_transcript": "ek Vegetable Biryani, do Appam with Coconut Milk"
}
{
  "audio_filepath": "data/SENT_0410.wav",
  "human_transcript": "two Corn Dog, three Chili Con Carne, four Apple Pie, one Dr Pepper"
}
{
  "audio_filepath": "data/SENT_0874.wav",
  "human_transcript": "naalu Miso Soup, moonu Fries, anju Sake, rendu Dorayaki, oru Chicken Dominator"
}
{
  "audio_filepath": "data/SENT_0811.wav",
  "human_transcript": "oru Tandoori Roti, rendu Khichdi"
}
{
  "audio_filepath": "data/SENT_0239.wav",
  "human_transcript": "oru Nadan Kozhi Curry, rendu Duck Roast, moonu Sambar, naalu Rasam Sadam, anju Paniyaram, aaru Vegetable Biryani"
}
{
  "audio_filepath": "data/SENT_0800.wav",
  "human_transcript": "four Jeera Rice lah, one Fish Curry, two Keema Matar, three Kadai Chicke

In [None]:
import zipfile
import os

# --- CONFIGURATION ---
zip_path = "data.zip"  # <-- Change this to your zip file name
extract_dir = "data"           # Folder where files will be extracted

# --- STEP 1: CREATE FOLDER IF NOT EXISTS ---
if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)

# --- STEP 2: UNZIP FILE ---
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"✅ Successfully unzipped '{zip_path}' into '{extract_dir}/'")


In [19]:
import json
import os

# --- CONFIGURATION ---
input_jsonl = "csv files/combined_dataset.jsonl"   # your existing JSONL file
output_jsonl = "csv files/combined_dataset_cleaned.jsonl"  # cleaned output file

# --- STEP 1: LOAD JSONL ---
valid_entries = []
missing_files = []

with open(input_jsonl, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line.strip())
        audio_path = entry["audio_filepath"]

        if os.path.exists(audio_path):
            valid_entries.append(entry)
        else:
            missing_files.append(audio_path)

# --- STEP 2: WRITE CLEANED JSONL ---
with open(output_jsonl, "w", encoding="utf-8") as f:
    for entry in valid_entries:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"✅ Cleaned JSONL file created: {output_jsonl}")
print(f"✅ Total valid entries: {len(valid_entries)}")

if missing_files:
    print("⚠️ Missing audio files:")
    for missing in missing_files:
        print(f"  - {missing}")
else:
    print("🎉 All audio files were found.")


✅ Cleaned JSONL file created: csv files/combined_dataset_cleaned.jsonl
✅ Total valid entries: 885
⚠️ Missing audio files:
  - data/nan.wav
  - data/SENT_0621.wav
  - data/SENT_0622.wav
  - data/SENT_0623.wav
  - data/SENT_0624.wav
  - data/SENT_0625.wav
  - data/SENT_0626.wav
  - data/SENT_0627.wav
  - data/SENT_0628.wav
  - data/SENT_0629.wav
  - data/SENT_0630.wav
  - data/SENT_0631.wav
  - data/SENT_0632.wav
  - data/SENT_0633.wav
  - data/SENT_0634.wav
  - data/SENT_0635.wav
  - data/SENT_0636.wav
  - data/SENT_0637.wav
  - data/SENT_0638.wav
  - data/SENT_0647.wav
  - data/SENT_0648.wav
  - data/SENT_0649.wav
  - data/SENT_0654.wav
  - data/SENT_0655.wav
  - data/nan.wav
  - data/SENT_0748.wav
  - data/SENT_0781.wav
  - data/SENT_0890.wav
  - data/SENT_0891.wav
  - data/SENT_0892.wav
  - data/SENT_0893.wav
  - data/SENT_0894.wav
  - data/SENT_0895.wav
  - data/SENT_0896.wav
  - data/SENT_0897.wav
  - data/SENT_0898.wav
  - data/SENT_0899.wav
  - data/nan.wav
  - data/nan.wav


In [20]:
import os
import json
import random

# ---- Step 2: Load JSONL (line by line) ----
json_file = "csv files/combined_dataset_cleaned.jsonl"  # change this if your JSON has another name

data = []
with open(json_file, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# ---- Step 3: Shuffle & Split (80/10/10) ----
random.shuffle(data)
n = len(data)
train_end = int(0.8 * n)
valid_end = int(0.9 * n)

train_data = data[:train_end]
valid_data = data[train_end:valid_end]
test_data = data[valid_end:]

# ---- Step 4: Save in JSONL format ----
def save_jsonl(path, records):
    os.makedirs(os.path.dirname(path), exist_ok=True)  # ensure folder exists
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

save_jsonl("csv files/train.jsonl", train_data)
save_jsonl("csv files/valid.jsonl", valid_data)
save_jsonl("csv files/test.jsonl", test_data)

print(f"✅ Saved train.jsonl ({len(train_data)} records), "
      f"valid.jsonl ({len(valid_data)} records), "
      f"test.jsonl ({len(test_data)} records))")


✅ Saved train.jsonl (708 records), valid.jsonl (88 records), test.jsonl (89 records))


In [1]:
import shutil

folders = [
    "1epoch_largev2lora_adapter",
    "1epoch_largev2lora_fine_tuned_whisper",
    "1epochND_largev2lora_adapter",
    "1epochND_largev2lora_fine_tuned_whisper"
]

for folder in folders:
    shutil.make_archive(folder, 'zip', folder)
    print(f"✅ Zipped {folder} → {folder}.zip")


✅ Zipped 1epoch_largev2lora_adapter → 1epoch_largev2lora_adapter.zip
✅ Zipped 1epoch_largev2lora_fine_tuned_whisper → 1epoch_largev2lora_fine_tuned_whisper.zip
✅ Zipped 1epochND_largev2lora_adapter → 1epochND_largev2lora_adapter.zip
✅ Zipped 1epochND_largev2lora_fine_tuned_whisper → 1epochND_largev2lora_fine_tuned_whisper.zip
