In [None]:
import pandas as pd
import uuid
import json
import os
import ast

# --- Config ---
# Source CSV. The default is the original author's local path; set the
# CASTE_INPUT_FILE environment variable to run the notebook on another machine
# without editing this cell.
INPUT_FILE = os.environ.get(
    "CASTE_INPUT_FILE",
    "/Users/shashiranjan/Desktop/shuru/location_heirarchy_list/caste_data/phase_2/json_output_formate/json_output_final/output_try/final_caste_data - Sheet1 (1).csv",
)
# Output directory (relative to the current working directory) and the JSON
# file written into it.
OUTPUT_DIR = "output"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "caste_structure_final_with_uid.json")
# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Helper Functions ---
def normalize(text):
    """Return a lowercase, dash-separated slug for ``text``.

    Args:
        text: Any value; it is converted with ``str()`` before slugging.

    Returns:
        The slug, or ``""`` when ``text`` is missing according to ``pd.notna``
        (``None``, ``NaN``, ...).
    """
    if not pd.notna(text):
        return ""
    # split() with no argument drops leading/trailing whitespace and treats any
    # run of whitespace as a single separator, so "a  b" becomes "a-b" rather
    # than "a--b".
    return "-".join(str(text).split()).lower()

def clean_name(text):
    """Return ``str(text)`` with surrounding whitespace and quote marks removed.

    Whitespace, single quotes and double quotes are peeled off both ends
    repeatedly until nothing more can be removed, so the result does not depend
    on the order in which they are nested (e.g. ``"'Iyer'"`` or ``' Nair '``).
    Characters inside the name are left untouched.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = str(text).strip()
    while True:
        # One pass removes a layer of quotes and any whitespace it exposes.
        peeled = cleaned.strip("'\"").strip()
        if peeled == cleaned:
            return cleaned
        cleaned = peeled

def safe_eval(val):
    """Parse a cell value into a list of cleaned, non-empty names.

    Two input shapes are understood:

    * a bracketed Python-style list literal such as ``"['A', 'B']"``;
    * a plain comma-separated string such as ``"A, B"``.

    A bracketed value that is not a valid literal (for example unquoted items,
    ``"[A, B]"``) is split on commas instead of being discarded.

    Args:
        val: Any value; it is converted with ``str()`` and stripped first.

    Returns:
        A list of names passed through ``clean_name``, with blank entries
        removed. An empty list when nothing usable is found.
    """
    val = str(val).strip()
    if val.startswith("[") and val.endswith("]"):
        try:
            items = ast.literal_eval(val)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid literal: salvage the names between the brackets
            # rather than silently dropping the whole cell.
            items = val[1:-1].split(",")
    else:
        items = val.split(",")
    cleaned = (clean_name(item) for item in items)
    # Filter blanks in every branch so "['A', '']" and "A, " behave alike.
    return [name for name in cleaned if name]

def generate_display_name(name):
    """Build the displayName mapping for ``name``.

    Every language key (the capitalised form of each code below, e.g. ``"En"``,
    ``"Hi"``, ``"Ori"``) is mapped to the same cleaned name.
    """
    label = clean_name(name)
    language_codes = (
        "en", "hi", "as", "bn", "gu", "kn", "ml", "mr",
        "or", "pa", "ta", "te", "ori",
    )
    display_name = {}
    for code in language_codes:
        display_name[code.capitalize()] = label
    return display_name

# --- Load Data ---
df = pd.read_csv(INPUT_FILE)
# Normalise header spelling (surrounding whitespace, letter case) so that a
# header such as "Caste" still resolves to the lowercase lookups below.
df.columns = df.columns.astype(str).str.strip().str.lower()
required_columns = ("caste", "subcaste", "source")
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Fail early with the full list instead of a bare KeyError on first use.
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")
# Drop rows without a caste *before* casting to str; otherwise NaN becomes the
# literal string "nan" and would be emitted as a caste of its own.
df = df.dropna(subset=['caste']).copy()
df['caste'] = df['caste'].astype(str).apply(clean_name)
# Names that are empty after cleaning carry no information either.
df = df[df['caste'] != ""].copy()
# Missing subcaste cells are treated as an empty list.
df['subcaste'] = df['subcaste'].fillna("[]").apply(safe_eval)
df['source'] = df['source'].fillna('').astype(str)

# --- UUID Mapping ---
# One freshly generated random UUID per distinct caste name; the same id is
# then attached to every row of that caste.
caste_uid_map = dict(
    (caste_name, str(uuid.uuid4())) for caste_name in df['caste'].unique()
)
df['casteUID'] = df['caste'].map(caste_uid_map)

# --- Caste Records ---
# One record per distinct caste; the source is taken from the first row in
# which that caste appears.
first_rows_per_caste = df.drop_duplicates(subset='caste')
caste_records = []
for caste, caste_source in zip(first_rows_per_caste['caste'], first_rows_per_caste['source']):
    caste_key = normalize(caste)  # lowercased, spaces replaced with dashes
    record = {
        "id": caste_uid_map[caste],
        "name": caste_key,
        "displayName": generate_display_name(caste),
        "casteType": "CASTE",
        "key": caste_key,
        # Top-level castes have no parent and no state.
        "parentKey": None,
        "parentId": None,
        "stateId": None,
        "source": caste_source,
    }
    caste_records.append(record)

# --- Subcaste Records ---
# One record per distinct (caste, subcaste) key. Castes are already
# de-duplicated when their records are built; doing the same here keeps "key"
# unique when a caste spans several rows with overlapping subcaste lists. The
# first occurrence (and its source) wins.
subcaste_records = []
seen_subcaste_keys = set()
for caste, parent_id, source, subcastes in zip(
    df['caste'], df['casteUID'], df['source'], df['subcaste']
):
    parent_key = normalize(caste)
    for sub in subcastes:
        sub_name = normalize(sub)  # lowercased, spaces replaced with dashes
        if not sub_name:
            # A blank entry would otherwise yield a dangling "caste#" key.
            continue
        sub_key = f"{parent_key}#{sub_name}"
        if sub_key in seen_subcaste_keys:
            continue
        seen_subcaste_keys.add(sub_key)
        subcaste_records.append({
            "id": str(uuid.uuid4()),
            "name": sub_name,
            "displayName": generate_display_name(sub),
            "casteType": "SUB_CASTE",
            "key": sub_key,
            "parentKey": parent_key,
            "parentId": parent_id,
            "stateId": None,
            "source": source
        })

# --- Save Final JSON ---
# Caste records first, subcaste records after them, in one flat list.
final_json = [*caste_records, *subcaste_records]
with open(OUTPUT_FILE, mode="w", encoding="utf-8") as json_out:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    json.dump(final_json, json_out, ensure_ascii=False, indent=2)

print(f"✅ Final JSON with UID, name formatting, and source saved to: {OUTPUT_FILE}")

KeyError: 'Caste'

## After executing `gemini_transliteration.py`

In [5]:
# Load the transliterated caste structure JSON into a DataFrame.
transliterated_json_path = "/Users/shashiranjan/Desktop/shuru/location_heirarchy_list/caste_data/phase_2/json_output_formate/json_output_final/output_try/output/caste_structure_final_transliterated_new.json"
df1 = pd.read_json(transliterated_json_path)

In [6]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
df1

Unnamed: 0,id,name,displayName,casteType,key,parentKey,parentId,stateId,source
0,a6775d76-b537-4272-9d25-d7865a250150,lohana,"{'En': 'LOHANA', 'Hi': 'लोहना', 'As': 'লোহানা'...",CASTE,lohana,,,,GPT
1,f22c5356-19be-4deb-81fb-5413fdc6e173,aarakh,"{'En': 'AARAKH', 'Hi': 'आरख', 'As': 'আৰাখ', 'B...",CASTE,aarakh,,,,MF
2,a2d59a6d-28da-411b-88de-67e7752562de,modh,"{'En': 'MODH', 'Hi': 'मोढ', 'As': 'মোঢ', 'Bn':...",CASTE,modh,,,,GPT
3,e90c9aab-6c5b-4ad4-88f5-f9c84618c14f,abbasi,"{'En': 'ABBASI', 'Hi': 'अब्बासी', 'As': 'আব্বা...",CASTE,abbasi,,,,GOV
4,1fbcb9f6-c310-4b4a-b901-ea2b11556699,ablakaror,"{'En': 'ABLAKAROR', 'Hi': 'अबलकरोर', 'As': 'অব...",CASTE,ablakaror,,,,MF
5,ec9dddf7-ff52-4ea2-9d89-802802990a40,abdal,"{'En': 'ABDAL', 'Hi': 'अब्दाल', 'As': 'আবদাল',...",CASTE,abdal,,,,GOV
6,36c8903c-f896-4620-a200-189235498c4e,porwal,"{'En': 'PORWAL', 'Hi': 'पोरवाल', 'As': 'পোৰৱাল...",CASTE,porwal,,,,GPT
7,14fc05ec-3c1c-4202-a071-11cdf3b4d97f,vaniyar,"{'En': 'VANIYAR', 'Hi': 'वनियार', 'As': 'বণিয়...",CASTE,vaniyar,,,,GPT
8,34a31260-d4d0-4b73-9c5c-2a7ea62702dc,khandelwal,"{'En': 'KHANDELWAL', 'Hi': 'खण्डेलवाल', 'As': ...",CASTE,khandelwal,,,,GPT
9,9f961044-9fc7-4256-9d32-720a52042997,vaishya-vani,"{'En': 'VAISHYA VANI', 'Hi': 'वैश्य-वाणी', 'As...",CASTE,vaishya-vani,,,,GPT


In [10]:
def fix_display_name_en_case(d):
    """Title-case the English (``"En"``) entry of a displayName mapping.

    The mapping is modified in place and also returned, so the function can be
    used with ``Series.apply``.

    Args:
        d: A displayName dict, or any other value.

    Returns:
        ``d`` itself. Values that are not dicts, dicts without an ``"En"`` key,
        and dicts whose ``"En"`` value is not a string are returned unchanged.
    """
    if not isinstance(d, dict):
        return d
    english = d.get("En")
    # Only strings have .title(); a None/NaN entry must not abort the whole
    # column-wide apply with an AttributeError.
    if isinstance(english, str):
        d["En"] = english.title()
    return d

# Run the English-label case fix over every displayName value.
display_name_column = df1['displayName']
df1['displayName'] = display_name_column.apply(fix_display_name_en_case)

In [19]:
import ast

# Step 1: Convert displayName column from string to dict (if needed)
def _parse_display_name(raw):
    """Evaluate ``raw`` as a Python literal when it is a string; otherwise return it unchanged."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw

df1['displayName'] = df1['displayName'].apply(_parse_display_name)

# Step 2: Copy 'Or' to 'Ori'
def copy_or_to_ori(d):
    """Mirror the ``'Or'`` entry of a displayName mapping into ``'Ori'``.

    The dict is updated in place and returned. Non-dict values and dicts
    without an ``'Or'`` key are returned untouched.
    """
    if not isinstance(d, dict):
        return d
    if 'Or' not in d:
        return d
    d.update({'Ori': d['Or']})
    return d

# Mirror 'Or' into 'Ori' for every displayName value.
display_name_column = df1['displayName']
df1['displayName'] = display_name_column.apply(copy_or_to_ori)

In [20]:
# Write the frame as an indented list of records; force_ascii=False keeps
# non-ASCII text as-is instead of \u escapes.
df1.to_json(
    "output/caste_structure_final_transliterated_final.json",
    orient="records",
    indent=2,
    force_ascii=False,
)