In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier


# Read the competition's train and test tables.
train_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/test.csv")

# Split each table into identifier, target and feature parts.
y = train_df["retention_status"]
X = train_df.drop(columns=["retention_status", "founder_id"])
founder_ids = test_df["founder_id"]
X_test = test_df.drop(columns="founder_id")

# Encode the string target as integers and show which label got which code.
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
class_codes = target_le.transform(target_le.classes_)
print(f"Target Mapping: {dict(zip(target_le.classes_, class_codes))}")


def engineer_features(df):
    """Return a copy of ``df`` with derived numeric feature columns appended.

    The caller's frame is not modified. Text ratings are converted to numeric
    scores through fixed lookup tables; any value that is absent from a table
    (including missing values) receives that feature's default score.

    Columns appended, in this order: ``wlb_score``, ``ot_score``,
    ``burnout_index``, ``team_size_est``, ``rev_per_employee``,
    ``life_investment_ratio``, ``satisfaction_score``, ``performance_score``,
    ``satisfaction_performance_product``, ``late_career_founder``,
    ``leadership_numeric``, ``high_risk_profile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``work_life_balance_rating``, ``working_overtime``,
        ``team_size_category``, ``monthly_revenue_generated``,
        ``years_with_startup``, ``founder_age``, ``venture_satisfaction``,
        ``startup_performance_rating`` and ``leadership_scope``.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the engineered columns added.
    """
    out = df.copy()

    def score(column, lookup, default):
        # Translate labels to numbers; labels not in the lookup get the default.
        return out[column].map(lookup).fillna(default)

    # The same three-level scale is shared by several rating columns.
    low_to_high = {'Low': 1, 'Medium': 2, 'High': 3}

    # Overtime flag relative to work-life balance; +0.1 keeps the divisor non-zero.
    out['wlb_score'] = score('work_life_balance_rating', low_to_high, 2)
    out['ot_score'] = score('working_overtime', {'Yes': 1, 'No': 0}, 0)
    out['burnout_index'] = out['ot_score'] / (out['wlb_score'] + 0.1)

    # Rough head-count per size bucket, then log-revenue per (head-count + 1).
    out['team_size_est'] = score(
        'team_size_category', {'Small': 10, 'Medium': 50, 'Large': 200}, 10
    )
    non_negative_revenue = out['monthly_revenue_generated'].clip(lower=0)
    out['rev_per_employee'] = np.log1p(non_negative_revenue) / (out['team_size_est'] + 1)

    # Tenure at the startup relative to (age + 1).
    out['life_investment_ratio'] = out['years_with_startup'] / (out['founder_age'] + 1)

    # Satisfaction and performance scores plus their product.
    out['satisfaction_score'] = score('venture_satisfaction', low_to_high, 2)
    out['performance_score'] = score('startup_performance_rating', low_to_high, 2)
    out['satisfaction_performance_product'] = (
        out['satisfaction_score'] * out['performance_score']
    )

    # Flag: older than 40 with fewer than 3 years at the startup.
    older_than_40 = out['founder_age'] > 40
    under_3_years = out['years_with_startup'] < 3
    out['late_career_founder'] = (older_than_40 & under_3_years).astype(int)

    out['leadership_numeric'] = score(
        'leadership_scope', {'Local': 1, 'Regional': 2, 'National': 3, 'Global': 4}, 2
    )

    # Flag: burnout index above 0.5 combined with satisfaction score of 2 or less.
    elevated_burnout = out['burnout_index'] > 0.5
    not_highly_satisfied = out['satisfaction_score'] <= 2
    out['high_risk_profile'] = (elevated_burnout & not_highly_satisfied).astype(int)

    return out

# Apply the same feature engineering to the train and test feature frames.
X_eng, X_test_eng = (engineer_features(frame) for frame in (X, X_test))

# Columns that are one-hot encoded by the preprocessing step.
cat_cols = [
    'founder_gender',
    'founder_role',
    'work_life_balance_rating',
    'venture_satisfaction',
    'startup_performance_rating',
    'working_overtime',
    'education_background',
    'personal_status',
    'startup_stage',
    'team_size_category',
    'remote_operations',
    'leadership_scope',
    'innovation_support',
    'startup_reputation',
    'founder_visibility',
]

# Every remaining column (original or engineered) is treated as numeric,
# keeping the column order of the engineered training frame.
categorical_lookup = set(cat_cols)
num_cols = [col for col in X_eng.columns if col not in categorical_lookup]

# Numeric branch: fill gaps with the column median, then standardise.
median_imputer = SimpleImputer(strategy='median')
num_transformer = Pipeline([
    ('imputer', median_imputer),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'Missing', then one-hot encode.
# Categories not seen while fitting are ignored at transform time.
missing_label_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
dense_one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_transformer = Pipeline([
    ('imputer', missing_label_imputer),
    ('onehot', dense_one_hot),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])





# Hyper-parameters that are actually varied: 3 * 2 * 3 * 2 = 36 candidates.
searched_params = {
    'classifier__hidden_layer_sizes': [(100, 50), (150, 75), (100, 50, 25)],
    'classifier__activation': ['relu', 'tanh'],
    'classifier__alpha': [0.01, 0.05, 0.1],
    'classifier__learning_rate_init': [0.001, 0.01],
}

# Settings held at a single value for every candidate.
# NOTE: scikit-learn documents `learning_rate` as only used when solver='sgd',
# so with 'adam' that entry has no effect; it is kept so best_params_ is unchanged.
pinned_params = {
    'classifier__solver': ['adam'],
    'classifier__learning_rate': ['adaptive'],
    'classifier__max_iter': [1000],
    'classifier__early_stopping': [True],
    'classifier__validation_fraction': [0.1],
    'classifier__n_iter_no_change': [15],
}

param_grid_mlp = {**searched_params, **pinned_params}

# Preprocessing and classifier are bundled so each CV split fits its own preprocessing.
mlp = MLPClassifier(random_state=42)
mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', mlp),
])

# Reduced CV folds for speed
grid_search = GridSearchCV(
    estimator=mlp_pipeline,
    param_grid=param_grid_mlp,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_eng, y_encoded)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Pipeline refit on the full training data with the winning parameters.
best_model = grid_search.best_estimator_


from sklearn.base import clone

# Single source of truth for the fold count: it drives the splitter, the
# progress messages and the test-prediction averaging, so changing it here
# keeps all three in sync.
N_SPLITS = 5

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold probability for each training row (each row is predicted by the
# one fold model that did not train on it) and the fold-averaged probability
# for each test row.
oof_preds = np.zeros(len(X_eng))
test_preds = np.zeros(len(X_test_eng))

print("\n Generating OOF predictions with Best Model")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_eng, y_encoded), start=1):
    print(f"Fold {fold}/{N_SPLITS}")
    X_tr, X_val = X_eng.iloc[train_idx], X_eng.iloc[val_idx]
    y_tr = y_encoded[train_idx]

    # Unfitted copy with the tuned hyper-parameters, so no fold reuses
    # state fitted on another fold or on the full training set.
    fold_model = clone(best_model)
    fold_model.fit(X_tr, y_tr)

    # Column 1 of predict_proba is the probability of the class encoded as 1.
    val_prob = fold_model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_prob

    # Accumulate an equal-weight average of the fold models on the test set.
    test_prob = fold_model.predict_proba(X_test_eng)[:, 1]
    test_preds += test_prob / N_SPLITS

    fold_acc = accuracy_score(y_encoded[val_idx], (val_prob > 0.5).astype(int))
    print(f"  Fold {fold} Accuracy: {fold_acc:.4f}")

# Optimising Threshold
# Candidate cut-offs 0.35, 0.36, ..., 0.65 (both ends included). np.linspace
# is used instead of a float-step np.arange because arange's length depends on
# floating-point rounding of (stop - start) / step, which makes it unclear
# whether the stop value is part of the search.
thresholds = np.linspace(0.35, 0.65, 31)
best_acc_thresh = 0
best_thresh = 0.5  # fallback if no candidate scores above zero accuracy

for t in thresholds:
    pred_labels = (oof_preds > t).astype(int)
    acc = accuracy_score(y_encoded, pred_labels)
    # Strict '>' keeps the lowest threshold when several tie on accuracy.
    if acc > best_acc_thresh:
        best_acc_thresh, best_thresh = acc, t

print(f"\nBest Threshold: {best_thresh:.2f} -> OOF Accuracy: {best_acc_thresh:.4f}")


# Turn the averaged test probabilities into class codes with the tuned
# threshold, then map the codes back to the original string labels.
above_threshold = test_preds > best_thresh
final_preds_encoded = above_threshold.astype(int)
final_preds_labels = target_le.inverse_transform(final_preds_encoded)

# Two-column submission: founder id followed by the predicted label.
submission = pd.DataFrame({'founder_id': founder_ids}).assign(
    retention_status=final_preds_labels
)
submission.to_csv('submission.csv', index=False)
print("Submission file created")

Target Mapping: {'Left': 0, 'Stayed': 1}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Best Parameters: {'classifier__activation': 'relu', 'classifier__alpha': 0.1, 'classifier__early_stopping': True, 'classifier__hidden_layer_sizes': (150, 75), 'classifier__learning_rate': 'adaptive', 'classifier__learning_rate_init': 0.01, 'classifier__max_iter': 1000, 'classifier__n_iter_no_change': 15, 'classifier__solver': 'adam', 'classifier__validation_fraction': 0.1}
Best CV Accuracy: 0.7493

 Generating OOF predictions with Best Model
Fold 1/5
  Fold 1 Accuracy: 0.7569
Fold 2/5
  Fold 2 Accuracy: 0.7484
Fold 3/5
  Fold 3 Accuracy: 0.7454
Fold 4/5
  Fold 4 Accuracy: 0.7521
Fold 5/5
  Fold 5 Accuracy: 0.7435

Best Threshold: 0.51 -> OOF Accuracy: 0.7501
Submission file created
