# Data Cleaning & Normalization

This notebook cleans and normalizes the EV India 13 dataset, creating controlled vocabularies and enhancing embedding text.


In [None]:
import json
import sys
from pathlib import Path

# Make the project's src/ directory importable. The project root is taken to
# be the parent of the current working directory.
project_root = Path().resolve().parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Must come after the sys.path tweak above, otherwise the module is not found.
from data_cleaning import clean_dataset

# Input, output and vocabulary locations, all under <project_root>/data
data_dir = project_root / "data"
data_path = data_dir / "raw" / "data.json"
output_path = data_dir / "processed" / "cleaned_data.json"
vocab_dir = data_dir / "vocabularies"

print(f"Loading data from: {data_path}")
print(f"Output will be saved to: {output_path}")
print(f"Vocabularies will be saved to: {vocab_dir}")


In [None]:
# Load raw data
# Decode explicitly as UTF-8: without an encoding argument open() falls back to
# the platform locale (e.g. cp1252 on Windows), which can corrupt or reject
# non-ASCII text in the JSON file.
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

print(f"Loaded {len(raw_data)} entries")

# Only preview an entry when there is one, so an empty dataset is reported
# cleanly instead of raising IndexError.
if raw_data:
    print("\nExample raw entry:")
    print(json.dumps(raw_data[0], indent=2))
else:
    print("\nNo entries to preview (dataset is empty).")


In [None]:
# Run the cleaning step. clean_dataset receives the raw entries and the
# vocabulary directory and returns a (cleaned entries, vocabularies) pair.
# NOTE(review): presumably it also writes the vocabularies into vocab_dir --
# confirm in data_cleaning.
cleaned_data, vocabularies = clean_dataset(raw_data, vocab_dir)

num_cleaned = len(cleaned_data)
print(f"Cleaned {num_cleaned} entries")

# Summarize the size of each vocabulary, one lookup per line
print(f"\nCreated vocabularies:")
domain_vocab = vocabularies['domains']
print(f"  - {len(domain_vocab)} canonical domains")
category_vocab = vocabularies['categories']
print(f"  - {len(category_vocab)} categories")
location_vocab = vocabularies['locations']
print(f"  - {len(location_vocab)} locations")


In [None]:
# Preview the first cleaned entry, pretty-printed as indented JSON
print("Example cleaned entry:")
first_cleaned = cleaned_data[0]
first_cleaned_json = json.dumps(first_cleaned, indent=2)
print(first_cleaned_json)


In [None]:
# Print the vocabularies: domains are capped at the first 20 entries, while
# categories and locations are listed in full.
print("=== DOMAINS ===")
all_domains = vocabularies['domains']
for domain_name in all_domains[:20]:
    print(f"  - {domain_name}")
# Number of domains beyond the 20 shown above (positive only when truncated)
hidden_count = len(all_domains) - 20
if hidden_count > 0:
    print(f"  ... and {hidden_count} more")

print(f"\n=== CATEGORIES ===")
all_categories = vocabularies['categories']
for category_name in all_categories:
    print(f"  - {category_name}")

print(f"\n=== LOCATIONS ===")
all_locations = vocabularies['locations']
for location_name in all_locations:
    print(f"  - {location_name}")


In [None]:
# Side-by-side look at raw vs. normalized domains for the first five entries
import pandas as pd

df = pd.DataFrame(cleaned_data)
preview_rows = df.head(5)

print("Domain normalization examples:")
for _, entry in preview_rows.iterrows():
    print(f"\n{entry['name']}:")
    # .get() falls back to an empty list when the column is absent
    original_domains = entry.get('domains', [])
    print(f"  Original: {original_domains}")
    normalized_domains = entry.get('domains_normalized', [])
    print(f"  Normalized: {normalized_domains}")
    # Only the first 100 characters of the embedding text are shown
    text_snippet = entry['embedding_text'][:100]
    print(f"  Enhanced embedding_text: {text_snippet}...")


In [None]:
# Save cleaned data
# Create the output directory (and any missing parents) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write with an explicit UTF-8 encoding so the file does not depend on the
# platform's locale default; the JSON content itself is unchanged.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2)

print(f"Saved cleaned data to: {output_path}")
print(f"Saved vocabularies to: {vocab_dir}")
