# Data Exploration - EV India 13 Cohort

This notebook explores the initial dataset to understand data quality, identify missing fields, and determine normalization needs.


In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Locate the raw JSON export relative to the parent of the current working
# directory (presumably the notebook's own folder - confirm when running
# from elsewhere).
project_root = Path(".").resolve().parent
data_path = project_root.joinpath("data", "raw", "data.json")

print(f"Loading data from: {data_path}")


In [None]:
# Load data
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open(data_path, encoding="utf-8") as f:
    data = json.load(f)

print(f"Total entries: {len(data)}")
if data:
    print("\nFirst entry example:")
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
    print(json.dumps(data[0], indent=2, ensure_ascii=False))
else:
    # An empty export has no first entry; indexing data[0] would raise.
    print("\nNo entries found - nothing to preview.")


In [None]:
# Put the loaded records into a DataFrame and report its shape, column names
# and per-column dtypes.
df = pd.DataFrame(data)

frame_shape = df.shape
column_names = list(df.columns)
print(f"DataFrame shape: {frame_shape}")
print(f"\nColumns: {column_names}")
print("\nData types:")
print(df.dtypes)


In [None]:
# Count nulls per column once, then show the same counts as a percentage of
# the total number of rows (rounded to two decimals).
print("Missing values per column:")
missing_counts = df.isna().sum()
print(missing_counts)
print("\nPercentage of missing values:")
missing_pct = (missing_counts / len(df) * 100).round(2)
print(missing_pct)


In [None]:
# Analyze domains - collect all unique domains
# Each entry's `domains` value is handled as a list of string labels. Anything
# that does not fit that shape (a non-list value, a non-string item, a blank
# label) is counted and reported instead of crashing the cell or being dropped
# without notice.
all_domains = []
skipped_domain_values = 0
for domains_list in df['domains'].dropna():
    if not isinstance(domains_list, list):
        skipped_domain_values += 1
        continue
    for d in domains_list:
        # Lower-case and trim so case/whitespace variants are counted together.
        normalized = d.lower().strip() if isinstance(d, str) else ""
        if normalized:
            all_domains.append(normalized)
        else:
            # Non-string or blank items carry no usable domain label.
            skipped_domain_values += 1

domain_counts = Counter(all_domains)
print(f"Total unique domains: {len(domain_counts)}")
print(f"\nDomain frequency:")
for domain, count in domain_counts.most_common():
    print(f"  {domain}: {count}")
if skipped_domain_values:
    print(f"\nSkipped malformed domain values: {skipped_domain_values}")


In [None]:
# Show how many entries fall into each `category` value and how many distinct
# values there are.
print("Category distribution:")
category_col = df['category']
print(category_col.value_counts())
n_categories = category_col.nunique()
print(f"\nUnique categories: {n_categories}")


In [None]:
# Show the spread of `location` values, then pull out the rows whose location
# contains a '/' (treated here as a multi-location marker).
print("Location distribution:")
location_col = df['location']
print(location_col.value_counts())
print(f"\nUnique locations: {location_col.nunique()}")
print("\nLocations with '/' (multi-location):")
# na=False makes missing locations count as "no slash" rather than NaN.
has_slash = location_col.str.contains('/', na=False)
multi_location = df.loc[has_slash]
print(multi_location.loc[:, ['name', 'location']])


In [None]:
# Tally how often each `funding_type` value occurs.
print("Funding type distribution:")
funding_counts = df['funding_type'].value_counts()
print(funding_counts)


In [None]:
# Describe the `age` column and report how many rows actually have a value.
print("Age statistics:")
age_col = df['age']
print(age_col.describe())
n_with_age = age_col.notna().sum()
n_total = len(df)
print(f"\nEntries with age: {n_with_age} / {n_total}")


In [None]:
# Measure the length of each `embedding_text`, store it as a new column on df,
# and list the three shortest texts.
print("Embedding text length statistics:")
text_lengths = df['embedding_text'].str.len()
df['embedding_text_len'] = text_lengths
print(df['embedding_text_len'].describe())
print("\nShortest embedding texts:")
shortest = df.nsmallest(n=3, columns='embedding_text_len')
print(shortest[['name', 'embedding_text']])


In [None]:
# Select the rows that have no usable links: either the `links` value is not a
# list at all, or it is a list with nothing in it.
def _has_no_links(value):
    """Return True when *value* is not a list or is an empty list."""
    return not (isinstance(value, list) and len(value) > 0)


no_links_mask = df['links'].apply(_has_no_links)
empty_links = df[no_links_mask]
print(f"Entries with empty links: {len(empty_links)} / {len(df)}")


In [None]:
# Print a recap of the clean-up work, one numbered section per field, using
# the values computed in the earlier exploration cells (domain_counts,
# multi_location, empty_links) together with fresh counts taken from df.
print("=== NORMALIZATION NEEDS ===")

print("\n1. Domains:")
n_domain_values = len(domain_counts)
print(f"   - {n_domain_values} unique domain values need standardization")
print("   - Some may need merging (e.g., 'AI' vs 'artificial intelligence')")
print("   - Need to handle case variations")

print("\n2. Categories:")
category_col = df['category']
print(f"   - {category_col.nunique()} categories appear consistent")
print(f"   - Values: {list(category_col.unique())}")

print("\n3. Locations:")
n_locations = df['location'].nunique()
print(f"   - {n_locations} unique locations")
n_multi_location = len(multi_location)
print(f"   - {n_multi_location} entries have multiple locations (separated by '/')")
print("   - Need to standardize format")

print("\n4. Embedding text:")
print("   - Some entries have very short embedding_text")
print("   - Should enhance with more context (name + project + domains + category)")

print("\n5. Missing data:")
age_missing = df['age'].isna().sum()
print(f"   - Age: {age_missing} missing")
education_missing = df['education'].isna().sum()
print(f"   - Education: {education_missing} missing")
n_empty_links = len(empty_links)
print(f"   - Links: {n_empty_links} empty")
