# Data Exploration - EV India 13 Cohort

This notebook explores the initial dataset to understand data quality, identify missing fields, and determine normalization needs.


In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Locate the raw dataset relative to the directory above the current working directory
notebook_dir = Path().resolve()
project_root = notebook_dir.parent
data_path = project_root.joinpath("data", "raw", "data.json")

print(f"Loading data from: {data_path}")


Loading data from: /Users/kevinmcpherson/github-projects/emergent-ventures-semantic-map/data/raw/data.json


In [2]:
# Load the raw records from the JSON file.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on the platform's default text encoding, which is not UTF-8 everywhere.
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Total entries: {len(data)}")
print("\nFirst entry example:")
# Guard the preview so an empty dataset is reported cleanly rather than
# failing with an IndexError on data[0].
if data:
    print(json.dumps(data[0], indent=2))
else:
    print("(no entries to preview)")


Total entries: 18

First entry example:
{
  "name": "Khyathi Komalan",
  "age": 19,
  "education": "Sophomore at Caltech, majoring in Mathematics",
  "location": "India/USA",
  "project_name": "Category Theory Research",
  "project_description": "Research applying category theory to domains ranging from quantum physics to social relationships.",
  "domains": [
    "mathematics",
    "category theory",
    "theoretical physics",
    "complex systems",
    "social modeling"
  ],
  "category": "research",
  "funding_type": "career development",
  "cohort": "EV India 13",
  "links": [],
  "embedding_text": "Category theory research applied to quantum physics and social relationships by Caltech math student."
}


In [3]:
# Tabulate the loaded records so they can be inspected with pandas
df = pd.DataFrame(data)

column_names = list(df.columns)
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {column_names}")

# Per-column dtypes as inferred by pandas
print("\nData types:")
print(df.dtypes)


DataFrame shape: (18, 12)

Columns: ['name', 'age', 'education', 'location', 'project_name', 'project_description', 'domains', 'category', 'funding_type', 'cohort', 'links', 'embedding_text']

Data types:
name                    object
age                    float64
education               object
location                object
project_name            object
project_description     object
domains                 object
category                object
funding_type            object
cohort                  object
links                   object
embedding_text          object
dtype: object


In [4]:
# Null counts per column, computed once and reused for the percentage view
print("Missing values per column:")
missing_counts = df.isnull().sum()
print(missing_counts)

print("\nPercentage of missing values:")
missing_pct = (missing_counts / len(df) * 100).round(2)
print(missing_pct)


Missing values per column:
name                   0
age                    7
education              0
location               0
project_name           0
project_description    0
domains                0
category               0
funding_type           0
cohort                 0
links                  0
embedding_text         0
dtype: int64

Percentage of missing values:
name                    0.00
age                    38.89
education               0.00
location                0.00
project_name            0.00
project_description     0.00
domains                 0.00
category                0.00
funding_type            0.00
cohort                  0.00
links                   0.00
embedding_text          0.00
dtype: float64


In [5]:
# Flatten every entry's domain tags into a single list, lowercased and trimmed.
# Cells that are not lists are skipped.
all_domains = [
    tag.lower().strip()
    for tags in df['domains'].dropna()
    if isinstance(tags, list)
    for tag in tags
]

# Frequency table of the normalized tags
domain_counts = Counter(all_domains)
print(f"Total unique domains: {len(domain_counts)}")
print("\nDomain frequency:")
for tag_name, tag_freq in domain_counts.most_common():
    print(f"  {tag_name}: {tag_freq}")


Total unique domains: 46

Domain frequency:
  hardware: 5
  education: 3
  automation: 2
  ai: 2
  career development: 2
  energy: 2
  healthtech: 2
  mathematics: 1
  category theory: 1
  theoretical physics: 1
  complex systems: 1
  social modeling: 1
  biotech: 1
  regenerative medicine: 1
  stem cells: 1
  neurology: 1
  agriculture: 1
  sustainability: 1
  materials science: 1
  climate adaptation: 1
  robotics: 1
  agritech: 1
  hr tech: 1
  engineering: 1
  evs: 1
  battery technology: 1
  fintech: 1
  civic technology: 1
  legal tech: 1
  healthcare: 1
  medical devices: 1
  wearables: 1
  labor safety: 1
  transportation: 1
  open-source: 1
  platforms: 1
  physics: 1
  talent development: 1
  aerospace: 1
  community building: 1
  stem: 1
  assistive technology: 1
  digital access: 1
  smes: 1
  business education: 1
  learning platforms: 1


In [6]:
# How many entries fall under each category label, and how many labels exist
category_col = df['category']
print("Category distribution:")
print(category_col.value_counts())
print(f"\nUnique categories: {category_col.nunique()}")


Category distribution:
category
hardware        6
software        4
startup         2
career          2
organization    2
research        1
education       1
Name: count, dtype: int64

Unique categories: 7


In [7]:
# Location frequencies, plus the subset of entries whose location contains a '/'
location_col = df['location']
print("Location distribution:")
print(location_col.value_counts())
print(f"\nUnique locations: {location_col.nunique()}")

print("\nLocations with '/' (multi-location):")
# Missing locations are treated as "no slash" via na=False
has_slash = location_col.str.contains('/', na=False)
multi_location = df[has_slash]
print(multi_location[['name', 'location']])


Location distribution:
location
India           16
India/USA        1
India/Canada     1
Name: count, dtype: int64

Unique locations: 3

Locations with '/' (multi-location):
                name      location
0    Khyathi Komalan     India/USA
2  Anushka Punukollu  India/Canada


In [8]:
# Number of entries per funding type
print("Funding type distribution:")
funding_counts = df['funding_type'].value_counts()
print(funding_counts)


Funding type distribution:
funding_type
project               10
career development     4
startup                4
Name: count, dtype: int64


In [9]:
# Summary statistics for age, and how many entries actually report one
age_col = df['age']
print("Age statistics:")
print(age_col.describe())

n_with_age = age_col.notna().sum()
print(f"\nEntries with age: {n_with_age} / {len(df)}")


Age statistics:
count    11.000000
mean     18.454545
std       3.205110
min      13.000000
25%      17.000000
50%      19.000000
75%      20.000000
max      25.000000
Name: age, dtype: float64

Entries with age: 11 / 18


In [10]:
# Inspect how long each embedding_text is (in characters)
print("Embedding text length statistics:")
# The length is stored as a helper column on df so rows can be ranked by it below
df['embedding_text_len'] = df['embedding_text'].str.len()
text_lengths = df['embedding_text_len']
print(text_lengths.describe())

print("\nShortest embedding texts:")
shortest_three = df.nsmallest(3, 'embedding_text_len')
print(shortest_three[['name', 'embedding_text']])


Embedding text length statistics:
count     18.000000
mean      58.388889
std       14.801188
min       35.000000
25%       50.250000
50%       55.500000
75%       65.750000
max      101.000000
Name: embedding_text_len, dtype: float64

Shortest embedding texts:
             name                                embedding_text
15  Krupal Virani           General career development support.
3      Deev Mehta  Autonomous rover enabling automated farming.
13     Yash Darji  Building an experimental rocketry community.


In [11]:
# An entry has "empty links" when the value is a zero-length list or is not a list at all
no_links_mask = df['links'].apply(
    lambda links: not isinstance(links, list) or len(links) == 0
)
empty_links = df[no_links_mask]
print(f"Entries with empty links: {len(empty_links)} / {len(df)}")


Entries with empty links: 18 / 18


In [12]:
# Recap of the findings from the cells above, grouped by field.
# Values are read from df, domain_counts, multi_location and empty_links.
print("=== NORMALIZATION NEEDS ===")

# --- 1. Domains ---
print("\n1. Domains:")
n_unique_domains = len(domain_counts)
print(f"   - {n_unique_domains} unique domain values need standardization")
print("   - Some may need merging (e.g., 'AI' vs 'artificial intelligence')")
print("   - Need to handle case variations")

# --- 2. Categories ---
print("\n2. Categories:")
category_col = df['category']
print(f"   - {category_col.nunique()} categories appear consistent")
print(f"   - Values: {list(category_col.unique())}")

# --- 3. Locations ---
print("\n3. Locations:")
n_unique_locations = df['location'].nunique()
print(f"   - {n_unique_locations} unique locations")
print(f"   - {len(multi_location)} entries have multiple locations (separated by '/')")
print("   - Need to standardize format")

# --- 4. Embedding text ---
print("\n4. Embedding text:")
print("   - Some entries have very short embedding_text")
print("   - Should enhance with more context (name + project + domains + category)")

# --- 5. Missing data ---
print("\n5. Missing data:")
n_missing_age = df['age'].isna().sum()
print(f"   - Age: {n_missing_age} missing")
n_missing_education = df['education'].isna().sum()
print(f"   - Education: {n_missing_education} missing")
print(f"   - Links: {len(empty_links)} empty")


=== NORMALIZATION NEEDS ===

1. Domains:
   - 46 unique domain values need standardization
   - Some may need merging (e.g., 'AI' vs 'artificial intelligence')
   - Need to handle case variations

2. Categories:
   - 7 categories appear consistent
   - Values: ['research', 'startup', 'hardware', 'software', 'career', 'education', 'organization']

3. Locations:
   - 3 unique locations
   - 2 entries have multiple locations (separated by '/')
   - Need to standardize format

4. Embedding text:
   - Some entries have very short embedding_text
   - Should enhance with more context (name + project + domains + category)

5. Missing data:
   - Age: 7 missing
   - Education: 0 missing
   - Links: 18 empty
