# Data Cleaning & Normalization

This notebook cleans and normalizes the EV India 13 dataset: it builds controlled vocabularies (domains, categories, locations) and rewrites each entry's `embedding_text` into an enriched form.


In [1]:
import json
import sys
from pathlib import Path

# Make the project's src/ package directory importable.
# The project root is taken to be the parent of the current working directory
# (assumes the kernel runs from a direct child of the root, e.g. notebooks/ — TODO confirm).
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root.joinpath("src")))

from data_cleaning import clean_dataset

# Input, output, and vocabulary locations, all rooted at <project_root>/data.
data_root = project_root / "data"
data_path = data_root / "raw" / "data.json"
output_path = data_root / "processed" / "cleaned_data.json"
vocab_dir = data_root / "vocabularies"

# Echo the resolved locations so a misconfigured working directory is obvious.
for label, location in (
    ("Loading data from", data_path),
    ("Output will be saved to", output_path),
    ("Vocabularies will be saved to", vocab_dir),
):
    print(f"{label}: {location}")


Loading data from: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/raw/data.json
Output will be saved to: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/processed/cleaned_data.json
Vocabularies will be saved to: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/vocabularies


In [2]:
# Load raw data
# Decode explicitly as UTF-8: JSON files are UTF-8 by convention, and relying on
# the platform's default encoding can mis-decode or fail on non-ASCII names.
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

print(f"Loaded {len(raw_data)} entries")

# Preview the first record only when there is one, so an empty dataset
# reports "Loaded 0 entries" instead of raising IndexError.
# (assumes the file's top-level value is a list of entry dicts — TODO confirm)
if raw_data:
    print(f"\nExample raw entry:")
    print(json.dumps(raw_data[0], indent=2))


Loaded 18 entries

Example raw entry:
{
  "name": "Khyathi Komalan",
  "age": 19,
  "education": "Sophomore at Caltech, majoring in Mathematics",
  "location": "India/USA",
  "project_name": "Category Theory Research",
  "project_description": "Research applying category theory to domains ranging from quantum physics to social relationships.",
  "domains": [
    "mathematics",
    "category theory",
    "theoretical physics",
    "complex systems",
    "social modeling"
  ],
  "category": "research",
  "funding_type": "career development",
  "cohort": "EV India 13",
  "links": [],
  "embedding_text": "Category theory research applied to quantum physics and social relationships by Caltech math student."
}


In [3]:
# Run the project's cleaning step. It returns the cleaned entries together with
# a mapping of vocabularies keyed by 'domains', 'categories', and 'locations'.
# NOTE(review): clean_dataset presumably also writes the vocabularies under
# vocab_dir — confirm against src/data_cleaning.
cleaned_data, vocabularies = clean_dataset(raw_data, vocab_dir)

print(f"Cleaned {len(cleaned_data)} entries")
print("\nCreated vocabularies:")
# Report the size of each vocabulary in a fixed order.
for vocab_key, description in (
    ("domains", "canonical domains"),
    ("categories", "categories"),
    ("locations", "locations"),
):
    print(f"  - {len(vocabularies[vocab_key])} {description}")


Cleaned 18 entries

Created vocabularies:
  - 29 canonical domains
  - 7 categories
  - 1 locations


In [4]:
# Pretty-print the first cleaned record for a visual check against the raw
# example shown earlier.
print(
    "Example cleaned entry:",
    json.dumps(cleaned_data[0], indent=2),
    sep="\n",
)


Example cleaned entry:
{
  "name": "Khyathi Komalan",
  "age": 19,
  "education": "Sophomore at Caltech, majoring in Mathematics",
  "location": "India/USA",
  "project_name": "Category Theory Research",
  "project_description": "Research applying category theory to domains ranging from quantum physics to social relationships.",
  "domains": [
    "mathematics",
    "category theory",
    "theoretical physics",
    "complex systems",
    "social modeling"
  ],
  "category": "research",
  "funding_type": "career development",
  "cohort": "EV India 13",
  "links": [],
  "embedding_text": "Khyathi Komalan. works on Category Theory Research. : Research applying category theory to domains ranging from quantum physics to social relationships.. Domains: mathematics, category theory, theoretical physics, complex systems, social modeling. Category: research.",
  "domains_normalized": [
    "mathematics",
    "physics",
    "complex systems",
    "social modeling"
  ],
  "location_normalized": "

In [5]:
# Print each vocabulary. The domain list is capped at a preview limit, with a
# trailing count of the entries that were not shown.
domain_preview_limit = 20
domain_terms = vocabularies['domains']
hidden_domains = len(domain_terms) - domain_preview_limit

print("=== DOMAINS ===")
for term in domain_terms[:domain_preview_limit]:
    print(f"  - {term}")
if hidden_domains > 0:
    print(f"  ... and {hidden_domains} more")

# Categories and locations are printed in full.
for heading, vocab_key in (("CATEGORIES", "categories"), ("LOCATIONS", "locations")):
    print(f"\n=== {heading} ===")
    for term in vocabularies[vocab_key]:
        print(f"  - {term}")


=== DOMAINS ===
  - aerospace
  - agriculture
  - artificial intelligence
  - automation
  - biotechnology
  - business
  - career development
  - civic technology
  - climate adaptation
  - community building
  - complex systems
  - education
  - energy
  - engineering
  - financial technology
  - hardware
  - healthcare
  - human resources
  - legal technology
  - materials science
  ... and 9 more

=== CATEGORIES ===
  - career
  - education
  - hardware
  - organization
  - research
  - software
  - startup

=== LOCATIONS ===
  - India


In [6]:
# Side-by-side view of raw vs. normalized domains for the first five entries,
# plus the first 100 characters of each rebuilt embedding_text.
import pandas as pd

df = pd.DataFrame(cleaned_data)
preview_rows = df.head(5)

print("Domain normalization examples:")
for row_label, entry in preview_rows.iterrows():
    print(f"\n{entry['name']}:")
    original_domains = entry.get('domains', [])
    print(f"  Original: {original_domains}")
    normalized_domains = entry.get('domains_normalized', [])
    print(f"  Normalized: {normalized_domains}")
    text_preview = entry['embedding_text'][:100]
    print(f"  Enhanced embedding_text: {text_preview}...")


Domain normalization examples:

Khyathi Komalan:
  Original: ['mathematics', 'category theory', 'theoretical physics', 'complex systems', 'social modeling']
  Normalized: ['mathematics', 'physics', 'complex systems', 'social modeling']
  Enhanced embedding_text: Khyathi Komalan. works on Category Theory Research. : Research applying category theory to domains r...

Soumil Nema:
  Original: ['biotech', 'regenerative medicine', 'stem cells', 'neurology']
  Normalized: ['biotechnology', 'healthcare']
  Enhanced embedding_text: Soumil Nema. works on Stem Cell Therapies for Neurological Disorders. : Developing stem cell therapi...

Anushka Punukollu:
  Original: ['agriculture', 'sustainability', 'materials science', 'climate adaptation']
  Normalized: ['agriculture', 'sustainability', 'materials science', 'climate adaptation']
  Enhanced embedding_text: Anushka Punukollu. works on SucroSoil. : Repurposing sugarcane waste into hydrogels to combat soil e...

Deev Mehta:
  Original: ['robotics

In [7]:
# Save cleaned data
# Create data/processed/ on first run; exist_ok keeps re-runs idempotent.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write with an explicit UTF-8 encoding so the artifact does not depend on the
# platform's default locale encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2)

print(f"Saved cleaned data to: {output_path}")
# NOTE(review): this cell does not write the vocabularies itself; they are
# presumably saved by clean_dataset(raw_data, vocab_dir) earlier — confirm.
print(f"Saved vocabularies to: {vocab_dir}")


Saved cleaned data to: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/processed/cleaned_data.json
Saved vocabularies to: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/vocabularies
