In [11]:
try:
    import xgboost
    import lightgbm
except ImportError:
    print("Installing missing libraries: xgboost and lightgbm...")
    import sys
    !{sys.executable} -m pip install xgboost lightgbm
    print("Installation complete. Please restart the kernel and run all cells again.")

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import mutual_info_regression
import xgboost as xgb
import lightgbm as lgb

In [13]:
# change for clean or unclean
file_name = "../data/data_without_outliers.csv"
df = pd.read_csv(file_name)
print(df.columns.tolist())

['Founded Year', 'Log_Total Funding ($M)', 'Log_Number of Employees', 'Log_Annual Revenue ($M)', 'Log_Valuation ($B)', 'Success Score', 'Acquired?', 'IPO?', 'Log_Customer Base (Millions)', 'Log_Social Media Followers', 'StartupID', 'Startup Age', 'Log_Funding per Employee', 'Log_Revenue per Employee', 'Country_Encoded', 'Funding Stage_IPO', 'Funding Stage_Seed', 'Funding Stage_Series A', 'Funding Stage_Series B', 'Funding Stage_Series C', 'Industry_AI', 'Industry_E-commerce', 'Industry_EdTech', 'Industry_Energy', 'Industry_FinTech', 'Industry_FoodTech', 'Industry_Gaming', 'Industry_Healthcare', 'Industry_Logistics', 'Industry_Tech', 'Uses_Python', 'Uses_Java', 'Uses_Nodejs', 'Uses_AI']


Building our own Success Score based on Features

In [14]:
# Drop the old Success Score column
df.drop(columns=['Success Score'], inplace=True)

# Approach: use outcome variables as measures of success: IPO, Aquired, Log_Valuation

# Define the weights for our new score. IPO is the biggest success,
# followed by acquisition, and then a high valuation.
# These weights are based on domain knowledge.
ipo_weight = 0.50
acquired_weight = 0.25
valuation_weight = 0.25

# Create the new score.
# We use the raw values (before scaling) for IPO and Acquired status.
# The valuation is already scaled, so it will contribute proportionally.
df["New_Success_Score"] = (
    df["IPO?"] * ipo_weight
    + df["Acquired?"] * acquired_weight
    + df["Log_Valuation ($B)"] * valuation_weight
)

print("Description of the new score: ", df['New_Success_Score'].describe())


Description of the new score:  count    3.471000e+03
mean     7.369501e-17
std      6.135522e-01
min     -1.464480e+00
25%     -5.102392e-01
50%     -1.559064e-02
75%      4.988839e-01
max      1.060610e+00
Name: New_Success_Score, dtype: float64


In [15]:
# exclude features that were used to generate success score (prevent data leakage)
y = df['New_Success_Score']
features_to_exclude = [
    "New_Success_Score",
    "IPO?",
    "Acquired?",
    "Log_Valuation ($B)",
    "StartupID",
]
X_all = df.drop(columns=features_to_exclude)

In [16]:
# select top 10 features by mutual information with success score
# (can play around with this and try selecting top 15, etc to see best outcome)
print("Performing feature selection with the new Success Score")
mi_scores = mutual_info_regression(X_all, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X_all.columns)
top_features = mi_scores.sort_values(ascending=False).head(15).index.tolist()

print("Top 10 features selected based on Mutual Information:")
print(top_features)

# The final feature set (X) contains only the top predictors
X = X_all[top_features]

Performing feature selection with the new Success Score
Top 10 features selected based on Mutual Information:
['Industry_Healthcare', 'Funding Stage_Seed', 'Uses_Python', 'Uses_AI', 'Industry_Gaming', 'Funding Stage_Series A', 'Industry_Logistics', 'Funding Stage_Series B', 'Funding Stage_Series C', 'Industry_FoodTech', 'Log_Annual Revenue ($M)', 'Industry_Tech', 'Log_Customer Base (Millions)', 'Startup Age', 'Funding Stage_IPO']


In [17]:
print(f"Shape of our final features (X): {X.shape}")
print(f"Shape of our target (y): {y.shape}")
print("\n" + "=" * 50 + "\n")

Shape of our final features (X): (3471, 15)
Shape of our target (y): (3471,)




This is where it becomes different based on model

In [None]:
print("--- Training and Evaluating Multiple Models ---")

# Define the models to test
models = {
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
    "XGBoost": xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        objective="reg:squarederror",
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
}

results = []

# Loop through each model and perform cross-validation
for name, model in models.items():
    print(f"Training {name}...")
    r2_scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    mae_scores = -cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")

    results.append(
        {
            "Model": name,
            "Avg R-squared": np.mean(r2_scores),
            "Avg MAE": np.mean(mae_scores),
        }
    )

print("\nCross-validation complete for all models.")
print("\n" + "=" * 50 + "\n")


print("--- Model Performance Comparison ---")
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))
print("\n" + "=" * 50 + "\n")
print("Script finished.")

--- Training and Evaluating Multiple Models ---
Training Gradient Boosting...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 411
[LightGBM] [Info] Number of data points in the train set: 2776, number of used features: 15
[LightGBM] [Info] Start training from score -0.010618
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 410
[LightGBM] [Info] Number of data points in the train set: 2777, number of used features: 15
[LightGBM] [Info] Start training from score 0.002237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 411
[Ligh

Results: