In [6]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import kagglehub
from kagglehub import KaggleDatasetAdapter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the raw startup dataset from KaggleHub into a pandas DataFrame (`df_raw`).
try:
    print("Loading original dataset from KaggleHub...")
    file_path = "global_startup_success_dataset.csv"
    # `kagglehub.load_dataset` is deprecated (it emits a warning on every call);
    # `kagglehub.dataset_load` is its replacement and takes the same arguments.
    df_raw = kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        "hamnakaleemds/global-startup-success-dataset",
        file_path,
    )
    print(f"Successfully loaded raw dataset with shape {df_raw.shape}")
except Exception as e:
    print(f"Error loading dataset from KaggleHub: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() ends the
    # session without a traceback, whereas a raised exception stops "Run All"
    # at this cell and keeps the real cause visible.
    raise

# How many samples are there for the different "Industry" categories?
print("\n" + "=" * 50 + "\n")
print("--- Checking Samples for Different Categories ---")
print("Number of startups per industry:")
print(df_raw["Industry"].value_counts())
print("\n" + "=" * 50 + "\n")

Loading original dataset from KaggleHub...


  df_raw = kagglehub.load_dataset(


Successfully loaded raw dataset with shape (5000, 15)


--- Checking Samples for Different Categories ---
Number of startups per industry:
Industry
Tech          524
Gaming        520
EdTech        518
Energy        510
Logistics     509
AI            501
FinTech       496
E-commerce    483
Healthcare    474
FoodTech      465
Name: count, dtype: int64




In [None]:
def _prepare_subgroup_features(df_subgroup):
    """
    Clean one subgroup and derive the numeric modelling frame from it.

    The input frame is not modified. The returned frame still contains the
    "StartupID" and "Success Score" columns; the caller removes them when
    building the feature matrix.

    NOTE(review): the country frequencies and the one-hot encoder below are
    fitted on the whole subgroup, i.e. before the train/test split done by the
    caller. Only feature columns are involved (never the target), but the test
    rows do influence these encodings.
    """
    # drop_duplicates() returns a new frame; the extra copy guarantees the
    # column assignments below never touch the caller's data.
    df = df_subgroup.drop_duplicates().copy()

    # Numeric id parsed from the digits following "_" in the startup name.
    df["StartupID"] = (
        df["Startup Name"].str.extract(r"_(\d+)", expand=False).astype("Int64")
    )

    # "Yes"/"No" flags become 1/0 (any other value maps to NaN).
    binary_map = {"Yes": 1, "No": 0}
    df["Acquired?"] = df["Acquired?"].map(binary_map)
    df["IPO?"] = df["IPO?"].map(binary_map)

    # Age is relative to the year in which the notebook is executed.
    current_year = datetime.datetime.now().year
    df["Startup Age"] = current_year - df["Founded Year"]

    # Guard the per-employee ratios against division by zero with a NaN-masked
    # denominator. The "Number of Employees" column itself keeps its zeros:
    # overwriting them with NaN would leave NaN in a model feature, which
    # GradientBoostingRegressor does not accept.
    employees = df["Number of Employees"].replace(0, np.nan)
    df["Funding per Employee"] = (df["Total Funding ($M)"] / employees).fillna(0)
    df["Revenue per Employee"] = (df["Annual Revenue ($M)"] / employees).fillna(0)
    df = df.replace([np.inf, -np.inf], 0)

    # Frequency-encode the country: each row gets its country's share of rows.
    country_freq = df["Country"].value_counts(normalize=True)
    df["Country_Encoded"] = df["Country"].map(country_freq)

    # OneHotEncode only 'Funding Stage' since 'Industry' is constant for the subgroup
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_data = ohe.fit_transform(df[["Funding Stage"]])
    encoded_df = pd.DataFrame(
        encoded_data, columns=ohe.get_feature_names_out(["Funding Stage"])
    )
    # Reset the index so the rows line up with the freshly built encoded frame.
    df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)

    # Technology flags: case-insensitive *literal* substring matches.
    # regex=False matters for "Node.js", whose "." would otherwise match any
    # character. Being substring matches, "Java" also hits e.g. "JavaScript".
    tech_keywords = {
        "Uses_Python": "Python",
        "Uses_Java": "Java",
        "Uses_Nodejs": "Node.js",
        "Uses_AI_in_Stack": "AI",
    }
    for flag_column, keyword in tech_keywords.items():
        df[flag_column] = (
            df["Tech Stack"]
            .str.contains(keyword, case=False, na=False, regex=False)
            .astype(int)
        )

    # Raw text columns are no longer needed once encoded.
    df = df.drop(
        columns=["Startup Name", "Country", "Funding Stage", "Industry", "Tech Stack"],
        errors="ignore",
    )

    # Log Transformation: log1p each skewed column and prefix its name with "Log_".
    skewed_cols = [
        "Total Funding ($M)",
        "Number of Employees",
        "Annual Revenue ($M)",
        "Valuation ($B)",
        "Customer Base (Millions)",
        "Social Media Followers",
        "Funding per Employee",
        "Revenue per Employee",
    ]
    renamed = {}
    for col in skewed_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col])
            renamed[col] = f"Log_{col}"
    return df.rename(columns=renamed)


def run_pipeline_for_subgroup(df_subgroup, subgroup_name):
    """
    Run data preparation, model training and evaluation for one subgroup.

    Parameters
    ----------
    df_subgroup : pandas.DataFrame
        Raw rows of a single subgroup. It is not modified.
    subgroup_name : str
        Label used in the printed log lines and in the returned record.

    Returns
    -------
    dict
        Keys "Subgroup", "MAE", "MSE", "R-squared" (all measured on the 20%
        hold-out split) and "SampleSize" (rows left after de-duplication).
    """
    print(f"--- Running pipeline for '{subgroup_name}' subgroup ---")
    print(f"Initial subgroup shape: {df_subgroup.shape}")

    # data cleaning and feature engineering
    df = _prepare_subgroup_features(df_subgroup)

    # --- 2. Define Features and Target ---
    X = df.drop(columns=["StartupID", "Success Score"])
    y = df["Success Score"]

    # train/test split (fixed seed so reruns produce the same partition)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # check representativeness
    # For regression, we can check if the mean of the target is similar in both sets.
    print(f"Mean 'Success Score' in full subgroup: {y.mean():.2f}")
    print(f"Mean 'Success Score' in training set:   {y_train.mean():.2f}")
    print(f"Mean 'Success Score' in testing set:    {y_test.mean():.2f}")

    # The scaler is fitted on the training rows only, so test-set statistics
    # never enter the scaling parameters.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # model training and evaluation
    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Model performance for '{subgroup_name}': R-squared = {r2:.4f}")
    print("\n" + "=" * 50 + "\n")

    return {
        "Subgroup": subgroup_name,
        "MAE": mae,
        "MSE": mse,
        "R-squared": r2,
        "SampleSize": len(df),
    }


# Industries to evaluate one at a time; picked because each has a decent number of rows.
industries_to_test = ["Healthcare", "Tech", "AI", "Gaming", "FinTech"]
all_results = []

for industry in industries_to_test:
    # Keep only the raw rows that belong to the current industry.
    industry_mask = df_raw["Industry"] == industry
    df_industry_subgroup = df_raw[industry_mask]

    # Guard clause: with 50 rows or fewer a train/test split is not meaningful,
    # so report the skip and move on to the next industry.
    if len(df_industry_subgroup) <= 50:
        print(
            f"Skipping '{industry}' due to insufficient sample size ({len(df_industry_subgroup)} rows)."
        )
        print("\n" + "=" * 50 + "\n")
        continue

    # Enough data: run the full pipeline and collect its metrics record.
    industry_results = run_pipeline_for_subgroup(df_industry_subgroup, industry)
    all_results.append(industry_results)

# One row per evaluated industry, printed in full as plain text.
print("--- Final Subgroup Analysis Results ---")
results_df = pd.DataFrame(all_results)
print(results_df.to_string())

--- Running pipeline for 'Healthcare' subgroup ---
Initial subgroup shape: (474, 15)
Mean 'Success Score' in full subgroup: 4.82
Mean 'Success Score' in training set:   4.87
Mean 'Success Score' in testing set:    4.62
The means are very close, indicating a representative split.
Model performance for 'Healthcare': R-squared = -0.1890


--- Running pipeline for 'Tech' subgroup ---
Initial subgroup shape: (524, 15)
Mean 'Success Score' in full subgroup: 5.20
Mean 'Success Score' in training set:   5.19
Mean 'Success Score' in testing set:    5.25
The means are very close, indicating a representative split.
Model performance for 'Tech': R-squared = -0.1936


--- Running pipeline for 'AI' subgroup ---
Initial subgroup shape: (501, 15)
Mean 'Success Score' in full subgroup: 5.12
Mean 'Success Score' in training set:   5.05
Mean 'Success Score' in testing set:    5.37
The means are very close, indicating a representative split.
Model performance for 'AI': R-squared = -0.1474


--- Running pi