In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Import the models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR


In [7]:
# change for clean or unclean
file_name = "../data/data_without_outliers.csv"
df = pd.read_csv(file_name)

In [4]:
# Define features (X) and target (y)
# We drop StartupID as it's an identifier, not a predictive feature.
X = df.drop(columns=["StartupID", "Success Score"])
y = df["Success Score"]

# Calculate Mutual Information scores to find the most predictive features
mi_scores = mutual_info_regression(X, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)

# Select the top 15 features
top_features = mi_scores.sort_values(ascending=False).head(15).index.tolist()

print("Top 15 features selected based on Mutual Information:")
print(top_features)

# Create the final feature set with only the top predictors
X = X[top_features]

print(f"\nFinal feature set shape: {X.shape}")

Top 15 features selected based on Mutual Information:
['Founded Year', 'IPO?', 'Log_Annual Revenue ($M)', 'Log_Number of Employees', 'Industry_Energy', 'Industry_E-commerce', 'Uses_Java', 'Industry_AI', 'Uses_AI', 'Log_Valuation ($B)', 'Industry_Logistics', 'Log_Social Media Followers', 'Industry_FinTech', 'Log_Revenue per Employee', 'Funding Stage_Seed']

Final feature set shape: (3471, 15)


In [8]:
# split into 80/20 train and test split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("--- Data Split Complete ---")
print(f"Training set size: {len(X_train)} rows")
print(f"Testing set size: {len(X_test)} rows")
print("\n" + "=" * 50 + "\n")

--- Data Split Complete ---
Training set size: 2776 rows
Testing set size: 695 rows




This is where it becomes different based on model

In [14]:
rfr = RandomForestRegressor(
    n_estimators=100, max_depth=3, random_state=42
)

# Train the model
rfr.fit(X_train, y_train)

# Make predictions on the test set
rfr_predictions = rfr.predict(X_test)

# Evaluate the model
rfr_mae = mean_absolute_error(y_test, rfr_predictions)
rfr_r2 = r2_score(y_test, rfr_predictions)
rfr_mse = mean_squared_error(y_test, rfr_predictions)
print(f"Random Forest Reg MAE: {rfr_mae:.4f}")
print(f"Random Forest Reg MSE: {rfr_mse:.4f}")
print(f"Random Forest Reg R-squared: {rfr_r2:.4f}")

Random Forest Reg MAE: 2.2845
Random Forest Reg MSE: 6.8466
Random Forest Reg R-squared: -0.0087


Results:

Gradient Boosting MAE: 2.3237
Gradient Boosting R-squared: -0.0460