In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, root_mean_squared_error

# Import the models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [12]:
# change for clean or unclean
file_name = "../data/data_with_outliers.csv"
df = pd.read_csv(file_name)

In [13]:
# Define features (X) and target (y)
# We drop StartupID as it's an identifier, not a predictive feature.
X = df.drop(columns=["StartupID", "Success Score"])
y = df["Success Score"]

# Calculate Mutual Information scores to find the most predictive features
mi_scores = mutual_info_regression(X, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)

# Select the top 15 features
top_features = mi_scores.sort_values(ascending=False).head(15).index.tolist()

print("Top 15 features selected based on Mutual Information:")
print(top_features)

# Create the final feature set with only the top predictors
X = X[top_features]

print(f"\nFinal feature set shape: {X.shape}")

Top 15 features selected based on Mutual Information:
['Industry_EdTech', 'Log_Customer Base (Millions)', 'Uses_Python', 'Industry_Healthcare', 'Industry_Logistics', 'Industry_AI', 'Funding Stage_IPO', 'Industry_FinTech', 'Uses_AI', 'Industry_Tech', 'Industry_E-commerce', 'Uses_Java', 'Log_Funding per Employee', 'Country_Encoded', 'Acquired?']

Final feature set shape: (5000, 15)


In [14]:
# split into 80/20 train and test split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("--- Data Split Complete ---")
print(f"Training set size: {len(X_train)} rows")
print(f"Testing set size: {len(X_test)} rows")
print("\n" + "=" * 50 + "\n")

--- Data Split Complete ---
Training set size: 4000 rows
Testing set size: 1000 rows




This is where it becomes different based on model

In [15]:
gbr = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)

# Train the model
gbr.fit(X_train, y_train)

# Make predictions on the test set
gbr_predictions = gbr.predict(X_test)

# Evaluate the model
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
gbr_r2 = r2_score(y_test, gbr_predictions)
gbr_mse = mean_squared_error(y_test, gbr_predictions)
gbr_rmse = root_mean_squared_error(y_test, gbr_predictions)
print(f"Gradient Boosting MAE: {gbr_mae:.4f}")
print(f"Gradient Boosting MSE: {gbr_mse:.4f}")
print(f"Gradient Boosting R-squared: {gbr_r2:.4f}")
print(f"Gradient Boosting RMSE: {gbr_rmse:.4f}")

Gradient Boosting MAE: 2.2655
Gradient Boosting MSE: 6.7148
Gradient Boosting R-squared: -0.0127
Gradient Boosting RMSE: 2.5913


Results:

Gradient Boosting MAE: 2.2655

Gradient Boosting MSE: 6.7148

Gradient Boosting R-squared: -0.0127

Gradient Boosting RMSE: 2.5913