In [1]:
import pandas as pd

# Load the merged TSV (adjust the file path for your machine).
tsv_path = "/home/soham37/python/POS_data_create/EDU/merged_edu_2.tsv"
# Read every column with pandas' nullable string dtype so that:
#   * all cells are text (or <NA>) and can be stripped uniformly, and
#   * the later replace() stays within one dtype instead of relying on the
#     deprecated silent downcast of object columns.
df = pd.read_csv(tsv_path, sep="\t", dtype="string")

# Normalize empties: strip surrounding whitespace from every column, then turn
# cells that are now empty strings into missing values.
df = df.apply(lambda col: col.str.strip())
df = df.replace({"": pd.NA})

# Count non-empty cells per column (the source column is included).
non_empty_counts = df.notna().sum()

print("Non-empty rows per column:")
print(non_empty_counts)

Non-empty rows per column:
source_HIN    294
HIN-ASM       294
HIN-BAN       294
HIN-BRX         0
HIN-DOI       294
HIN-GOM         0
HIN-GUJ       294
HIN-KAS       294
HIN-MAI       294
HIN-MAR       294
HIN-MNI       294
HIN-NPI       294
HIN-ODI       294
HIN-PAN       294
HIN-SAT         1
HIN-SND         0
HIN-TEL       294
HIN-URD         1
TAM-KAN         1
TAM-MAL         1
TAM-TEL         1
dtype: int64


  df = df.replace({"": pd.NA})


In [3]:
# Greedily pick `target_size` rows from `df`, visiting the translation columns
# from scarcest to most populated so that rare translations are kept first.
target_size = 290
source_col = df.columns[0]
candidate_cols = [c for c in df.columns if c != source_col]

# Availability mask (True where the cell is non-empty) and its count, per candidate column
non_empty_masks = {c: df[c].notna() for c in candidate_cols}
counts = {c: int(mask.sum()) for c, mask in non_empty_masks.items()}

# 1) Order columns by scarcity (ascending number of non-empty rows)
ordered_cols = sorted(candidate_cols, key=lambda c: counts[c])

# 2) Seed selection with rows that are non-empty for the scarcest column(s).
# Start from an empty slice of df's own index (not pd.Index([]), which is
# object-dtype) so appending row labels keeps the index dtype and does not rely
# on pandas' deprecated handling of empty entries during concatenation.
selected_idx = df.index[:0]
used_mask = pd.Series(False, index=df.index)

for col in ordered_cols:
    if len(selected_idx) >= target_size:
        break
    # rows available for this col that aren't already picked
    avail = non_empty_masks[col] & (~used_mask)
    needed = target_size - len(selected_idx)
    pick = df.index[avail][:needed]  # first `needed` rows in original order
    selected_idx = selected_idx.append(pick)
    used_mask.loc[pick] = True

# 3) If still short of target_size, fill with the rows that have the most
#    non-empty candidate entries
if len(selected_idx) < target_size:
    remaining = df.index[~used_mask]
    # Score rows by how many non-empty candidate entries they have
    row_scores = df.loc[remaining, candidate_cols].notna().sum(axis=1)
    # Choose highest-scoring rows until we hit target_size
    needed = target_size - len(selected_idx)
    filler = row_scores.sort_values(ascending=False).index[:needed]
    selected_idx = selected_idx.append(filler)

subset = df.loc[selected_idx].copy()

# Ensure uniqueness by source sentence, keeping the first occurrence
subset = subset.drop_duplicates(subset=[source_col], keep="first")

# Trim if the subset is larger than target_size; if de-duplication made it
# smaller, top it up again.
if len(subset) > target_size:
    subset = subset.iloc[:target_size]
elif len(subset) < target_size:
    # Top up with the not-yet-selected rows, highest non-empty count first;
    # rows whose source duplicates an already selected one are dropped again.
    remaining2 = df.index.difference(subset.index)
    row_scores2 = df.loc[remaining2, candidate_cols].notna().sum(axis=1)
    topup = row_scores2.sort_values(ascending=False).index[:(target_size - len(subset))]
    subset = pd.concat([subset, df.loc[topup]]).drop_duplicates(subset=[source_col], keep="first").iloc[:target_size]

print(f"Final subset size: {len(subset)}")

# Check non-empty counts in the subset
subset_counts = subset.notna().sum()
print("Non-empty rows per column in subset:")
print(subset_counts)

Final subset size: 290
Non-empty rows per column in subset:
source_HIN    290
HIN-ASM       290
HIN-BAN       290
HIN-BRX         0
HIN-DOI       290
HIN-GOM         0
HIN-GUJ       290
HIN-KAS       290
HIN-MAI       290
HIN-MAR       290
HIN-MNI       290
HIN-NPI       290
HIN-ODI       290
HIN-PAN       290
HIN-SAT         1
HIN-SND         0
HIN-TEL       290
HIN-URD         1
TAM-KAN         1
TAM-MAL         1
TAM-TEL         1
dtype: int64


  selected_idx = selected_idx.append(pick)


In [5]:
# Tally non-empty cells per column, once for the full table and once for the
# sampled subset, and store both tallies side by side.
column_names = df.columns
full_tally = df.notna().sum().astype(int).tolist()
subset_tally = subset.loc[:, column_names].notna().sum().astype(int).tolist()

counts_df = pd.DataFrame(
    {
        "column": column_names,
        "non_empty_full": full_tally,
        "non_empty_subset": subset_tally,
    }
)
counts_df.to_csv(
    "/home/soham37/python/POS_data_create/EDU/sent_count_in_290_sample.csv",
    index=False,
)

# Write the sampled rows themselves as a tab-separated file (no index column).
subset.to_csv(
    "/home/soham37/python/POS_data_create/EDU/subset_290.tsv",
    sep="\t",
    index=False,
)


# Create Files

In [8]:
# Path of the TSV export that the file-creation cells below read from.
file_path = "/home/soham37/Downloads/Create_data_pos - subset_edu_1000.tsv"

In [9]:
import pandas as pd

In [10]:
# Load the tab-separated file at `file_path`.
# NOTE: this rebinds `df`, so any frame previously held under that name is replaced.
df = pd.read_csv(file_path, sep="\t")

In [11]:
# Display the column labels of the loaded frame.
df.columns

Index(['source_HIN', 'HIN-ASM', 'HIN-BAN', 'HIN-DOI', 'HIN-GOM', 'HIN-GUJ',
       'HIN-KAS', 'HIN-MAI', 'HIN-MAR', 'HIN-MNI', 'HIN-NPI', 'HIN-ODI',
       'HIN-PAN', 'HIN-SAT', 'HIN-SND', 'HIN-TEL', 'HIN-URD', 'TAM-KAN',
       'TAM-MAL', 'TAM-TEL'],
      dtype='object')

In [16]:
# Keep only the source column and the four translation columns listed here.
# NOTE: rebinds `df`; the other columns are no longer available afterwards.
df = df[["source_HIN", "HIN-BAN", "HIN-MAI", "HIN-ODI", "HIN-SAT"]] # df for IITP language pairs

In [19]:
import pandas as pd
import os
import math

# Export one folder per language-pair column of `df`, containing:
#   <lp>_empty.txt            source_id + source text for rows without a translation
#   <lp>_non_empty.txt        source_id, source text, translation_id, translation
#   <lp>_non_empty_part_N.txt translations only, `chunk_size` per file
df = df.fillna("")  # replace NaN with empty strings

# Directory to save output files
out_dir = "/home/soham37/python/POS_data_create/EDU/IITP"
os.makedirs(out_dir, exist_ok=True)

# Number of translations written to each "*_non_empty_part_N.txt" file
chunk_size = 30

# Assign unique IDs for source sentences (SRC_1 ... SRC_n in row order)
df = df.reset_index(drop=True)
df["source_id"] = [f"SRC_{i+1}" for i in range(len(df))]

# Language pair columns: everything except the source text, the source ID and
# the "<column>_id" columns this cell itself adds. Excluding the latter keeps
# the cell idempotent: re-running it does not treat ID columns as languages.
generated_id_cols = {f"{col}_id" for col in df.columns}
lang_pairs = [
    col for col in df.columns
    if col not in ["source_HIN", "source_id"] and col not in generated_id_cols
]

for lp in lang_pairs:
    # Create per-language subfolder
    lp_dir = os.path.join(out_dir, lp)
    os.makedirs(lp_dir, exist_ok=True)

    # True where the translation has visible text. Computed once and reused;
    # astype(str) guards against non-string cells (e.g. purely numeric values).
    has_text = df[lp].astype(str).str.strip() != ""

    # Unique translation IDs; the number follows the row position, so it
    # matches the number in the row's source_id. Empty translations get "".
    df[f"{lp}_id"] = [
        f"{lp}_{i+1}" if filled else ""
        for i, filled in enumerate(has_text)
    ]

    non_empty_df = df[has_text]
    empty_df = df[~has_text]

    # Save empty rows: only source_id + source
    empty_file = os.path.join(lp_dir, f"{lp}_empty.txt")
    empty_df[["source_id", "source_HIN"]].to_csv(empty_file, index=False, sep="\t", header=False)

    # Save non-empty rows: source_id, source, translation_id, translation
    non_empty_file = os.path.join(lp_dir, f"{lp}_non_empty.txt")
    non_empty_df[["source_id", "source_HIN", f"{lp}_id", lp]].to_csv(non_empty_file, index=False, sep="\t", header=False)

    # Split non-empty translations into files of `chunk_size` lines each.
    # These are written as plain text rather than via to_csv: the CSV writer
    # would wrap any sentence containing a comma or a double quote in quotes
    # (and double the inner quotes), corrupting the raw text.
    for part_no, start in enumerate(range(0, len(non_empty_df), chunk_size), start=1):
        chunk = non_empty_df.iloc[start:start + chunk_size]
        chunk_file = os.path.join(lp_dir, f"{lp}_non_empty_part_{part_no}.txt")
        with open(chunk_file, "w", encoding="utf-8") as fh:
            for text in chunk[lp]:
                # Collapse embedded line breaks so each translation stays on one line
                fh.write(" ".join(str(text).splitlines()) + "\n")

print("✅ Files created successfully in:", out_dir)


✅ Files created successfully in: /home/soham37/python/POS_data_create/EDU/IITP
