In [1]:
import os
import pandas as pd
import urllib.request
import zipfile
import glob
import shutil
import librosa

In [2]:
# Show the kernel's working directory; the relative paths used in this notebook resolve against it.
os.getcwd()

'/home/jovyan/work/Datasets'

In [3]:
# The folder name is relative, so it is created under the current working directory.
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs("SLR66_Telugu", exist_ok=True) # Create a folder to save the files

In [4]:
# Download manifest: local file name -> source URL.
# Every file lives directly under the same OpenSLR resource folder, so the
# mapping is derived from one base URL and the list of file names.
_SLR66_BASE_URL = "https://www.openslr.org/resources/66"
_SLR66_FILENAMES = (
    "line_index_female.tsv",
    "line_index_male.tsv",
    "te_in_female.zip",
    "te_in_male.zip",
)
urls = {name: f"{_SLR66_BASE_URL}/{name}" for name in _SLR66_FILENAMES}

In [6]:
# Download every file listed in `urls` into SLR66_Telugu/, skipping files that
# are already present.
# Each download goes to a temporary "<name>.part" file and is renamed to its
# final name only on success. Without this, an interrupted transfer would leave
# a truncated file at the final path, and the "already exists" check would skip
# it on every later run.
for filename, url in urls.items():
    files_path = os.path.join("SLR66_Telugu", filename)
    if os.path.exists(files_path):
        print(f"{filename} already exists, skipping.")
        continue

    print(f"Downloading {filename}...")
    partial_path = files_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same directory, so the rename replaces the target in one step.
        os.replace(partial_path, files_path)
    finally:
        # Only true when the download or the rename failed: remove the leftover.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Saved to {files_path}")

Downloading line_index_female.tsv...
Saved to SLR66_Telugu/line_index_female.tsv
Downloading line_index_male.tsv...
Saved to SLR66_Telugu/line_index_male.tsv
Downloading te_in_female.zip...
Saved to SLR66_Telugu/te_in_female.zip
Downloading te_in_male.zip...
Saved to SLR66_Telugu/te_in_male.zip


In [7]:
# Unpack each downloaded archive into a folder named after the archive
# (the archive file name with ".zip" removed), inside SLR66_Telugu/.
zip_files = ["te_in_female.zip", "te_in_male.zip"]
for zfile in zip_files:
    zip_path = os.path.join("SLR66_Telugu", zfile)
    extract_dir = os.path.join("SLR66_Telugu", zfile.replace(".zip", ""))
    # The archive is opened before the folder is created, so a missing or
    # unreadable archive does not leave an empty folder behind.
    with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
        os.makedirs(extract_dir, exist_ok=True)
        zip_ref.extractall(path=extract_dir)
    print(f"Extracted {zfile} to {extract_dir}")

Extracted te_in_female.zip to SLR66_Telugu/te_in_female
Extracted te_in_male.zip to SLR66_Telugu/te_in_male


**Counting the audio files in the te_in_female and te_in_male folders**

In [28]:
def count_wav_files(folder_path, label):
    """Collect the distinct ``.wav`` file names found anywhere under a folder.

    Walks ``folder_path`` recursively, prints how many distinct names were
    found (tagged with ``label``), and returns them.

    Parameters:
        folder_path (str): Root directory to search.
        label (str): Tag used in the printed summary, e.g. 'male' or 'female'.

    Returns:
        set[str]: Base names (without directory part) of the ``.wav`` files.
        Files sharing a name in different subfolders are counted once, and the
        extension match is case-sensitive.
    """
    wav_files = {
        name
        for _root, _dirs, names in os.walk(folder_path)
        for name in names
        if name.endswith(".wav")
    }
    print(f"Number of files available in {label} folder: {len(wav_files)}")
    return wav_files
# Count the extracted recordings for each speaker group and keep the returned
# sets of file names.
# NOTE(review): these absolute paths only work when the dataset lives under
# /home/jovyan/work/Datasets (the working directory shown earlier) - confirm
# before running elsewhere.
female_folder_p = "/home/jovyan/work/Datasets/SLR66_Telugu/te_in_female" # Usage
male_folder_p = "/home/jovyan/work/Datasets/SLR66_Telugu/te_in_male"
available_files_f = count_wav_files(female_folder_p, "female")
available_files_m = count_wav_files(male_folder_p, "male")

Number of files available in female folder: 2294
Number of files available in male folder: 2154


**Converting the TSV files into CSV files**

In [29]:
def load_tsv(tsv_path, label, output_csv=True):
    """
    Load a two-column TSV line index, print a summary, and optionally save it as CSV.

    The first line of the file is inspected: if it contains both "path" and
    "sentence" it is treated as a header row; otherwise the file is read
    without a header and the columns are named "path" and "sentence".
    All values are read as strings.

    Parameters:
        tsv_path (str): Path to the .tsv file
        label (str): A label like 'male' or 'female'; used in the printed
            messages and in the output CSV file name
        output_csv (bool): Whether to save the loaded DataFrame to
            "line_index_telugu_<label>.csv" (relative to the working directory)

    Returns:
        pd.DataFrame: Loaded DataFrame with proper columns
    """
    print(f"\n--- Loading {label} dataset ---")

    # Peek at the first row only, to decide whether the file has a header.
    with open(tsv_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip().split('\t')

    has_header = "path" in first_line and "sentence" in first_line

    if has_header:
        df = pd.read_csv(tsv_path, sep='\t', dtype=str)
    else:
        df = pd.read_csv(tsv_path, sep='\t', header=None, names=["path", "sentence"], dtype=str)

    print(f"Rows loaded in {label}: {len(df)}")

    if output_csv:
        csv_name = f"line_index_telugu_{label}.csv"
        df.to_csv(csv_name, index=False)
        print(f"Saved to CSV: {csv_name}")

    # Show data insights.
    print("\nHead:\n", df.head())
    print("\nTail:\n", df.tail())
    print("\nInfo:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nDescribe:")
    print(df.describe())
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"Shape: {df.shape}")

    return df

# Load both line-index files; each call also writes
# "line_index_telugu_<label>.csv" into the current working directory
# (output_csv defaults to True).
# NOTE(review): the absolute paths assume the dataset lives under
# /home/jovyan/work/Datasets - confirm before running elsewhere.
male_tsv_path = "/home/jovyan/work/Datasets/SLR66_Telugu/line_index_male.tsv"
female_tsv_path = "/home/jovyan/work/Datasets/SLR66_Telugu/line_index_female.tsv"

limp_df = load_tsv(male_tsv_path, "male")
lifp_df = load_tsv(female_tsv_path, "female")


--- Loading male dataset ---
Rows loaded in male: 2154
Saved to CSV: line_index_telugu_male.csv

Head:
                     path                                           sentence
0  tem_07220_01981175708                          దీనిని తటస్థీకరణము అందురు
1  tem_06359_01516502961                       దీనిని స్కోరుగా అనువదిస్తారు
2  tem_02812_01868000938           రెండంతస్తుల భవనాలు పూర్తిగా నీట మునిగాయి
3  tem_08680_00300488472  పిన్ కోడ్ అయిదు లక్షలు అయిదు వేలు అయిదు వందలు ...
4  tem_02769_01899577322                    ద్రవరూపంలోని మలినాలు తేమ వంటివి

Tail:
                        path                                           sentence
2149  tem_09222_01274346342                         దాని వలన ప్రయోజనం శూన్యం\n
2150  tem_00682_01449465265              బొమ్మకు కొంచెం పొడవు వ్యాఖ్య వ్రాశాను
2151  tem_04272_01299232078  ఉదాహరణకు ఏనుగు అనే ఇంటిపేరు ఏనుగు అని కనిపిస్త...
2152  tem_03338_01904171899             రవీంద్రుని రచనలలో గీతాంజలి చాల గొప్పది
2153  tem_03338_00242203933         

**Merging the female and male audio files into a single folder**

**Merging both female and male csv files**

In [39]:
def copy_wav_files(source_dirs, target_dir):
    """Copy the ``.wav`` files found directly inside each source directory into one target directory.

    Only the top level of every source directory is listed; subfolders are not
    searched. A file whose name already exists in ``target_dir`` is overwritten
    and still counted. The target directory is created when missing.

    Parameters:
        source_dirs (iterable of str): Directories to copy from, in order.
        target_dir (str): Directory that receives the copies.

    Returns:
        None. Progress and the total number of copy operations are printed.
    """
    os.makedirs(target_dir, exist_ok=True)
    copied = 0
    for source_dir in source_dirs:
        print(f"Copying from {source_dir}...")
        wav_names = [name for name in os.listdir(source_dir) if name.endswith(".wav")]
        for name in wav_names:
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(os.path.join(source_dir, name), os.path.join(target_dir, name))
        copied += len(wav_names)
    print(f"All .wav files have been merged into: {target_dir}")
    print(f"Total files copied: {copied}")


def count_audio_files(folder_path):
    """Return the distinct ``.wav`` file names under a directory, searching subdirectories too.

    Prints the number of distinct names found. Names are base names only, so
    files with the same name in different subfolders are counted once.
    """
    wav_files = set()
    for _root, _dirs, names in os.walk(folder_path):
        wav_files.update(name for name in names if name.endswith(".wav"))
    print(f"Number of .wav files in '{folder_path}': {len(wav_files)}")
    return wav_files


def inspect_csv_files(paths):
    """Print summary information for each CSV file in ``paths``.

    For every path a banner is printed, followed by the head, tail, info,
    describe output, column list and shape of the loaded file. A missing file
    or a read error is reported as a message and the loop moves on to the next
    path, so one bad file does not stop the inspection.

    Parameters:
        paths (iterable of str): CSV file paths to inspect.

    Returns:
        None.
    """
    for path in paths:
        print(f"\n--- Inspecting: {path} ---")
        if not os.path.exists(path):
            print("File not found:", path)
            continue
        try:
            df = pd.read_csv(path)
            print("\nHead:\n", df.head())
            print("\nTail:\n", df.tail())
            print("\nInfo:")
            # DataFrame.info() prints its report itself and returns None;
            # wrapping it in print() would append a stray "None" line.
            df.info()
            print("\nDescribe:\n", df.describe())
            print(f"\nColumns: {df.columns.tolist()}")
            print(f"Shape: {df.shape}")
        except Exception as e:
            # Deliberately broad: this is a best-effort inspection helper.
            print(f"Error reading {path}: {e}")


def merge_csv_files(csv_paths, output_path, drop_duplicates=False):
    """Concatenate several CSV files row-wise, save the result, and return it.

    Parameters:
        csv_paths (iterable of str): CSV files to read, in order.
        output_path (str): Where the merged CSV is written (without the index).
        drop_duplicates (bool): When True, rows repeating an earlier
            ('path', 'sentence') pair are removed before saving.

    Returns:
        pd.DataFrame or None: The merged frame, or None when any step fails
        (the error is printed instead of raised).
    """
    try:
        frames = []
        for csv_path in csv_paths:
            frames.append(pd.read_csv(csv_path))
        merged_df = pd.concat(frames, ignore_index=True)
        if drop_duplicates:
            # The index is not renumbered, so dropped rows leave gaps in it.
            merged_df = merged_df.drop_duplicates(subset=['path', 'sentence'])
        merged_df.to_csv(output_path, index=False)
        print(f"Merged CSV saved to: {output_path}")
        print(f"Final shape: {merged_df.shape}")
    except Exception as e:
        print(f"Error while merging CSVs: {e}")
        return None
    return merged_df


def inspect_merged_file(file_path):
    """Load a merged CSV file and print its info, columns, describe output, shape, head and tail.

    A missing file or a read error is reported with a printed message instead
    of an exception.

    Parameters:
        file_path (str): Path of the merged CSV file.

    Returns:
        None.
    """
    if not os.path.exists(file_path):
        print(f"Merged file not found: {file_path}")
        return
    try:
        m_df = pd.read_csv(file_path)
        print(f"\n--- Inspection of merged file: {file_path} ---\n")
        print("Info:")
        # DataFrame.info() prints its report itself and returns None;
        # wrapping it in print() would append a stray "None" line.
        m_df.info()
        print("\nColumns:")
        print(m_df.columns)
        print("\nDescribe:\n", m_df.describe())
        print("\nShape:", m_df.shape)
        print("\nHead:\n", m_df.head())
        print("\nTail:\n", m_df.tail())
    except Exception as e:
        # Deliberately broad: this is a best-effort inspection helper.
        print(f"Error reading merged file: {e}")


In [40]:
# Driver cell: merge the two audio folders, then inspect and merge the two CSVs.
# NOTE(review): the audio paths are absolute while the CSV names are relative
# to the working directory; both only line up when the notebook runs from
# /home/jovyan/work/Datasets - confirm before running elsewhere.
male_dir = "/home/jovyan/work/Datasets/SLR66_Telugu/te_in_male" # Step 1: Merge audio files
female_dir = "/home/jovyan/work/Datasets/SLR66_Telugu/te_in_female"
merged_audio_dir = "/home/jovyan/work/Datasets/SLR66_Telugu/te_Telugu_merged"
copy_wav_files([male_dir, female_dir], merged_audio_dir)
# Sanity check: count what actually landed in the merged folder.
count_audio_files(merged_audio_dir)
csv_paths = [ # Step 2: Inspect the original CSVs
    "line_index_telugu_male.csv",
    "line_index_telugu_female.csv"
]
inspect_csv_files(csv_paths)
output_csv = "merged_csv_files_openslr_telugu.csv" # Step 3: Merge CSV files
# Duplicates are kept here (drop_duplicates=False).
merged_df = merge_csv_files(csv_paths, output_csv, drop_duplicates=False)
inspect_merged_file(output_csv) # Step 4: Inspect merged file

Copying from /home/jovyan/work/Datasets/SLR66_Telugu/te_in_male...
Copying from /home/jovyan/work/Datasets/SLR66_Telugu/te_in_female...
All .wav files have been merged into: /home/jovyan/work/Datasets/SLR66_Telugu/te_Telugu_merged
Total files copied: 4448
Number of .wav files in '/home/jovyan/work/Datasets/SLR66_Telugu/te_Telugu_merged': 4448

--- Inspecting: line_index_telugu_male.csv ---

Head:
                     path                                           sentence
0  tem_07220_01981175708                          దీనిని తటస్థీకరణము అందురు
1  tem_06359_01516502961                       దీనిని స్కోరుగా అనువదిస్తారు
2  tem_02812_01868000938           రెండంతస్తుల భవనాలు పూర్తిగా నీట మునిగాయి
3  tem_08680_00300488472  పిన్ కోడ్ అయిదు లక్షలు అయిదు వేలు అయిదు వందలు ...
4  tem_02769_01899577322                    ద్రవరూపంలోని మలినాలు తేమ వంటివి

Tail:
                        path                                           sentence
2149  tem_09222_01274346342                         దాని

**Finding unique audio files in the merged file**

In [41]:
def inspect_duplicates(df, path_column='path'):
    """Print and return how many distinct and repeated values ``path_column`` holds.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        path_column (str): Column holding the audio identifiers.

    Returns:
        tuple: ``(unique_paths, duplicate_count)``, where ``duplicate_count``
        is the row count minus the number of distinct values in the column.
    """
    unique_paths = df[path_column].nunique()
    total_rows = len(df)
    duplicate_count = total_rows - unique_paths

    report = (
        f"Total rows in CSV: {total_rows}",
        f"Unique audio paths: {unique_paths}",
        f"Duplicate rows (same audio used multiple times): {duplicate_count}",
    )
    for line in report:
        print(line)
    return unique_paths, duplicate_count


def filter_existing_audio(df, audio_dir, path_column='path'):
    """Return the rows of ``df`` whose audio file exists under ``audio_dir``.

    A row matches when "<path>.wav" equals the base name of some ``.wav`` file
    found anywhere below ``audio_dir``. The yes/no counts are printed.

    The caller's DataFrame is not modified: the 'found' flag is added to an
    internal copy only, so re-running a cell that calls this function does not
    accumulate helper columns on the input.

    Parameters:
        df (pd.DataFrame): Frame with one row per expected audio file.
        audio_dir (str): Directory searched recursively for ``.wav`` files.
        path_column (str): Column holding the file name without extension.

    Returns:
        pd.DataFrame: Copy of the matching rows with an extra 'found' column
        (always 'yes').
    """
    available_files = {
        file for _root, _, files in os.walk(audio_dir)
        for file in files if file.endswith(".wav")
    }

    expected_files = df[path_column].astype(str) + ".wav"
    is_found = expected_files.isin(available_files)

    # Work on a copy so the input frame keeps its original columns.
    annotated = df.copy()
    annotated['found'] = is_found.map({True: 'yes', False: 'no'})
    print(annotated['found'].value_counts())

    return annotated[annotated['found'] == 'yes'].copy()


def save_filtered_csv(df, output_file):
    """Write ``df`` to ``output_file`` as CSV (without the index) and print the exported row count."""
    row_count = len(df)
    df.to_csv(output_file, index=False)
    print(f"Exported {row_count} rows to '{output_file}'")


def add_audio_durations(df, audio_dir, path_column='path'):
    """Add 'duration_sec' and 'duration_ms' columns to ``df`` using librosa.

    For every value in ``path_column`` the file "<audio_dir>/<value>.wav" is
    loaded at its native sampling rate (``sr=None``) and its duration measured.
    When loading or measuring fails, the error is printed and both duration
    values for that row are None.

    Parameters:
        df (pd.DataFrame): Frame to annotate; it is modified in place.
        audio_dir (str): Directory containing the ``.wav`` files.
        path_column (str): Column holding the file name without extension.

    Returns:
        pd.DataFrame: The same ``df`` object, with the two new columns.
    """
    def _measure_seconds(file_name):
        # One file's duration in seconds, or None when it cannot be read.
        file_path = os.path.join(audio_dir, f"{file_name}.wav")
        try:
            samples, rate = librosa.load(file_path, sr=None)
            return librosa.get_duration(y=samples, sr=rate)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

    seconds = [_measure_seconds(file_name) for file_name in df[path_column]]

    df['duration_sec'] = seconds
    df['duration_ms'] = [None if value is None else value * 1000 for value in seconds]
    return df


def summarize_durations(df, columns=['duration_sec', 'duration_ms']):
    """Print the sum of each requested duration column, or a notice for columns missing from ``df``."""
    for name in columns:
        if name not in df.columns:
            print(f"Column '{name}' not found in the DataFrame.")
            continue
        print(f"Total {name}: {df[name].sum()}")

In [42]:
# Paths
# The CSV names are relative to the working directory; audio_dir is absolute.
merged_csv = "merged_csv_files_openslr_telugu.csv"
audio_dir = "/home/jovyan/work/Datasets/SLR66_Telugu/te_Telugu_merged"
filtered_csv = "filtered_found_audio_data_openslr.csv"
final_csv = "filtered_with_durations_openslr.csv"
merged_df = pd.read_csv(merged_csv) # Step 1: Load merged CSV and inspect for duplicates
inspect_duplicates(merged_df)
filtered_df = filter_existing_audio(merged_df, audio_dir) # Step 2: Filter only rows with existing audio files
save_filtered_csv(filtered_df, filtered_csv)
# The filtered rows are read back from disk, so the next step works on exactly
# what was saved.
filtered_df = pd.read_csv(filtered_csv) # Step 3: Add durations to the filtered DataFrame
filtered_with_durations_df = add_audio_durations(filtered_df, audio_dir)
filtered_with_durations_df.to_csv(final_csv, index=False)
print("Exact durations added without rounding. File saved.")
summarize_durations(filtered_with_durations_df) # Step 4: Summarize duration columns

Total rows in CSV: 4448
Unique audio paths: 4448
Duplicate rows (same audio used multiple times): 0
found
yes    4448
Name: count, dtype: int64
Exported 4448 rows to 'filtered_found_audio_data_openslr.csv'
Exact durations added without rounding. File saved.
Total duration_sec: 20551.936
Total duration_ms: 20551936.0


**Final dataset Info**

In [43]:
def load_dataset(csv_path):
    """Read the CSV file at ``csv_path`` and return it as a DataFrame."""
    return pd.read_csv(csv_path)

def preview_dataset(df, n=5):
    """Print the first and last ``n`` rows of ``df``, each under its own heading."""
    sections = (
        ("Head of the DataFrame:", df.head(n)),
        ("\nTail of the DataFrame:", df.tail(n)),
    )
    for heading, rows in sections:
        print(heading)
        print(rows)

def display_info(df):
    """Print the structure of ``df`` and its descriptive statistics.

    Prints, in order: the ``DataFrame.info()`` report, ``describe()`` with its
    default column selection, and ``describe(include=['object'])``. When the
    frame has no object-typed column the last step prints a short notice
    instead of failing.

    Parameters:
        df (pd.DataFrame): Frame to summarise.

    Returns:
        None.
    """
    print("\nDataFrame Info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would append a stray "None" line.
    df.info()
    print("\nDescriptive Statistics (Numerical Columns):")
    print(df.describe())
    print("\nDescriptive Statistics (Categorical Columns):")
    try:
        categorical_stats = df.describe(include=['object'])
    except ValueError:
        # describe() raises ValueError when no column matches the requested dtype.
        print("No object-typed columns to describe.")
    else:
        print(categorical_stats)

def handle_missing_sentences(df):
    """Report null counts, show the rows lacking a 'sentence', and return ``df`` without them.

    The input frame is not modified; a new frame is returned.

    Parameters:
        df (pd.DataFrame): Frame with a 'sentence' column.

    Returns:
        pd.DataFrame: Rows of ``df`` whose 'sentence' is not null.
    """
    print("\nNull Values Count:")
    print(df.isnull().sum())

    missing_mask = df['sentence'].isnull()
    print("\nRows with missing 'sentence':")
    print(df[missing_mask])

    cleaned = df.dropna(subset=['sentence'])

    print("Rows with missing sentences dropped.")
    print("New shape:", cleaned.shape)

    print("\nUpdated Null Values Count:")
    print(cleaned.isnull().sum())

    return cleaned

def column_uniques_and_modes(df):
    """Print, for every column, its number of distinct values and then its most frequent value.

    When a column's mode cannot be determined, "No mode available" is printed
    for that column instead.
    """
    column_names = list(df.columns)

    print("\nUnique Values per Column:")
    for name in column_names:
        distinct = df[name].nunique()
        print(f"{name}: {distinct}")

    print("\nMost Frequent Values (Mode):")
    for name in column_names:
        try:
            # mode() may return several values; only the first one is shown.
            top_value = df[name].mode().values[0]
        except Exception:
            print(f"{name}: No mode available")
        else:
            print(f"{name}: {top_value}")

def summarize_durations(df):
    """Print the summed 'duration_sec' and 'duration_ms' columns of ``df``; absent columns are skipped silently."""
    # NOTE(review): a function with this name and an extra `columns` parameter
    # is defined earlier in the notebook; running this cell replaces it.
    print("\nTotal Durations Summary:")
    present = df.columns
    if 'duration_sec' in present:
        total_seconds = df['duration_sec'].sum()
        print(f"Total duration (seconds): {total_seconds}")
    if 'duration_ms' in present:
        total_millis = df['duration_ms'].sum()
        print(f"Total duration (milliseconds): {total_millis}")

def save_cleaned_dataset(df, output_path):
    """Write ``df`` to ``output_path`` as CSV (without the index) and print a confirmation."""
    df.to_csv(output_path, index=False)
    confirmation = f"\nCleaned dataset saved to '{output_path}'"
    print(confirmation)

In [44]:
# Final dataset path
# NOTE(review): input_path is absolute, while an earlier cell wrote
# "filtered_with_durations_openslr.csv" via a relative name; the two refer to
# the same file only when the working directory is /home/jovyan/work/Datasets
# - confirm before running elsewhere. output_path is relative to the working
# directory.
input_path = "/home/jovyan/work/Datasets/filtered_with_durations_openslr.csv"
output_path = "filtered_with_durations_openslr_telugu_cleaned.csv"
df = load_dataset(input_path) # Step-by-step analysis and cleanup
preview_dataset(df)
display_info(df)
# From here on `df` refers to the frame without missing sentences.
df = handle_missing_sentences(df)
column_uniques_and_modes(df)
summarize_durations(df)
save_cleaned_dataset(df, output_path)

Head of the DataFrame:
                    path                                           sentence  \
0  tem_07220_01981175708                          దీనిని తటస్థీకరణము అందురు   
1  tem_06359_01516502961                       దీనిని స్కోరుగా అనువదిస్తారు   
2  tem_02812_01868000938           రెండంతస్తుల భవనాలు పూర్తిగా నీట మునిగాయి   
3  tem_08680_00300488472  పిన్ కోడ్ అయిదు లక్షలు అయిదు వేలు అయిదు వందలు ...   
4  tem_02769_01899577322                    ద్రవరూపంలోని మలినాలు తేమ వంటివి   

  found  duration_sec  duration_ms  
0   yes      3.413333  3413.333333  
1   yes      4.266667  4266.666667  
2   yes      3.669333  3669.333333  
3   yes      4.181333  4181.333333  
4   yes      6.997333  6997.333333  

Tail of the DataFrame:
                       path  \
4443  tef_06566_01479491396   
4444  tef_05691_01910883963   
4445  tef_08664_00831343057   
4446  tef_03689_01754896634   
4447  tef_05705_01399415008   

                                               sentence found  durati