In [None]:
import os

def count_lines_in_leaf_files(folder_path):
    total_lines = 0

    for root, dirs, files in os.walk(folder_path):
        # leaf directory = no subdirectories under it
        if not dirs:
            for file in files:
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        total_lines += sum(1 for _ in f)
                except Exception as e:
                    print(f"Skipping {file_path} due to error: {e}")

    return total_lines

# Example usage
folder = "/home/soham37/COIL_D_institutes/IITD/Monolingual/Hindi/Validated"
print("Total lines in leaf files:", count_lines_in_leaf_files(folder))


Total lines in leaf files: 989


In [2]:
folder = "/home/soham37/Desktop/COIL-D/COIL_D_Repo/Monolingual/Hindi/Validated/GOV"
print("Total lines in leaf files:", count_lines_in_leaf_files(folder))

Total lines in leaf files: 57376


In [5]:
folder = "/home/soham37/COIL_D_institutes/MIT/Monolingual/Tamil/Validated"
print("Total lines in leaf files:", count_lines_in_leaf_files(folder))

Total lines in leaf files: 17325


In [6]:
folder = "/home/soham37/COIL_D_institutes/MIT/Monolingual/Tamil/Raw Data"
print("Total lines in leaf files:", count_lines_in_leaf_files(folder))

Total lines in leaf files: 99884


# Find the Monolingual Distribution of the Data

In [1]:
import os
import csv

def count_lines_in_file(filepath):
    """Count number of lines in a text file."""
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return sum(1 for _ in f)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return 0

def collect_stats(parent_folder, output_csv="/home/soham37/python/Stats/monolingual_stats.csv"):
    stats = []

    for language in os.listdir(parent_folder):
        lang_path = os.path.join(parent_folder, language)
        if not os.path.isdir(lang_path):
            continue

        for category in ["Raw", "Validated"]:  # loop over Raw & Validated
            category_path = os.path.join(lang_path, category)
            if not os.path.isdir(category_path):
                continue

            for primary_domain in os.listdir(category_path):
                primary_path = os.path.join(category_path, primary_domain)
                if not os.path.isdir(primary_path):
                    continue

                num_files = 0
                total_lines = 0

                # Traverse all sub-domains recursively
                for root, _, files in os.walk(primary_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        num_files += 1
                        total_lines += count_lines_in_file(file_path)

                stats.append({
                    "Language": language,
                    "Category": category,
                    "Primary_Domain": primary_domain,
                    "Num_Files": num_files,
                    "Num_Lines": total_lines
                })

    # Save to CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["Language", "Category", "Primary_Domain", "Num_Files", "Num_Lines"])
        writer.writeheader()
        writer.writerows(stats)

    print(f"Statistics saved to {output_csv}")
    return stats


# Example usage:
parent_folder = "/home/soham37/python/Monolingual"  # replace with actual path
stats = collect_stats(parent_folder)

# Pretty print
for row in stats:
    print(row)

Statistics saved to /home/soham37/python/Stats/monolingual_stats.csv
{'Language': 'Hindi', 'Category': 'Raw', 'Primary_Domain': 'Healthcare_raw', 'Num_Files': 6, 'Num_Lines': 6656}
{'Language': 'Hindi', 'Category': 'Raw', 'Primary_Domain': 'Judiciary', 'Num_Files': 1, 'Num_Lines': 10103}
{'Language': 'Hindi', 'Category': 'Raw', 'Primary_Domain': 'Education', 'Num_Files': 3, 'Num_Lines': 8364}
{'Language': 'Hindi', 'Category': 'Raw', 'Primary_Domain': 'Science and Technology', 'Num_Files': 11, 'Num_Lines': 4165}
{'Language': 'Hindi', 'Category': 'Raw', 'Primary_Domain': 'Agriculture', 'Num_Files': 1, 'Num_Lines': 5099}
{'Language': 'Hindi', 'Category': 'Validated', 'Primary_Domain': 'GOV', 'Num_Files': 45, 'Num_Lines': 112789}
{'Language': 'Hindi', 'Category': 'Validated', 'Primary_Domain': 'Tourism', 'Num_Files': 1, 'Num_Lines': 4288}
{'Language': 'Hindi', 'Category': 'Validated', 'Primary_Domain': 'Health', 'Num_Files': 11, 'Num_Lines': 6552}
{'Language': 'Hindi', 'Category': 'Validat