In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


train_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/test.csv")


founder_ids = test_df["founder_id"]
X = train_df.drop(["retention_status", "founder_id"], axis=1)
X_test = test_df.drop("founder_id", axis=1)
y = train_df["retention_status"]


target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
print(f"Target Mapping: {dict(zip(target_le.classes_, target_le.transform(target_le.classes_)))}")

def engineer_features(df):
    df = df.copy()
    wlb_map = {'Low': 1, 'Medium': 2, 'High': 3}
    ot_map = {'Yes': 1, 'No': 0}
    
    df['wlb_score'] = df['work_life_balance_rating'].map(wlb_map).fillna(2)
    df['ot_score'] = df['working_overtime'].map(ot_map).fillna(0)
    df['burnout_index'] = df['ot_score'] / df['wlb_score']
    size_map = {'Small': 10, 'Medium': 50, 'Large': 200}
    df['team_size_est'] = df['team_size_category'].map(size_map).fillna(10)
    
    
    df['rev_per_employee'] = np.log1p(df['monthly_revenue_generated']) / df['team_size_est']
    df['life_investment_ratio'] = df['years_with_startup'] / df['founder_age']
    
    df['is_remote_from_hub'] = (df['distance_from_investor_hub'] > 100).astype(int)
    return df

X_eng = engineer_features(X)
X_test_eng = engineer_features(X_test)

cat_features = [
    'founder_gender', 'founder_role', 'work_life_balance_rating', 
    'venture_satisfaction', 'startup_performance_rating', 'working_overtime',
    'education_background', 'personal_status', 'startup_stage', 
    'team_size_category', 'remote_operations', 'leadership_scope', 
    'innovation_support', 'startup_reputation', 'founder_visibility'
]


num_cols = [c for c in X_eng.columns if c not in cat_features]
for col in num_cols:
    X_eng[col] = X_eng[col].fillna(X_eng[col].median())
    X_test_eng[col] = X_test_eng[col].fillna(X_eng[col].median())


for col in cat_features:
    X_eng[col] = X_eng[col].fillna("Missing").astype(str)
    X_test_eng[col] = X_test_eng[col].fillna("Missing").astype(str)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_eng, y_encoded)):
    X_tr, X_val = X_eng.iloc[train_idx], X_eng.iloc[val_idx]
    y_tr, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.03,
        depth=6,
        cat_features=cat_features,
        l2_leaf_reg=3,
        eval_metric='Accuracy',
        early_stopping_rounds=100,
        verbose=0,
        random_state=42
    )
    
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))
    
    
    val_prob = model.predict_proba(X_val)[:, 1]
    test_prob = model.predict_proba(X_test_eng)[:, 1]
    
    oof_preds[val_idx] = val_prob
    test_preds += test_prob / kf.get_n_splits()
    
    
    acc = accuracy_score(y_val, (val_prob > 0.5).astype(int))
    scores.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")

print(f"Mean Cross Validation Accuracy: {np.mean(scores):.4f}")


thresholds = np.arange(0.3, 0.7, 0.01)
best_acc = 0
best_thresh = 0.5

for t in thresholds:
    pred_labels = (oof_preds > t).astype(int)
    acc = accuracy_score(y_encoded, pred_labels)
    if acc > best_acc:
        best_acc = acc
        best_thresh = t

print(f"Best Threshold: {best_thresh:.2f} -> CV Accuracy: {best_acc:.4f}")

final_preds_encoded = (test_preds > best_thresh).astype(int)
final_preds_labels = target_le.inverse_transform(final_preds_encoded)

submission = pd.DataFrame({
    'founder_id': founder_ids, 
    'retention_status': final_preds_labels
})
submission.to_csv('submission.csv', index=False)
print("Submission saved with optimized threshold")

Target Mapping: {'Left': 0, 'Stayed': 1}


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Fold 1 Accuracy: 0.7546
Fold 2 Accuracy: 0.7678
Fold 3 Accuracy: 0.7609
Fold 4 Accuracy: 0.7536
Fold 5 Accuracy: 0.7566
Fold 6 Accuracy: 0.7559
Fold 7 Accuracy: 0.7556
Fold 8 Accuracy: 0.7574
Fold 9 Accuracy: 0.7571
Fold 10 Accuracy: 0.7507
Mean Cross Validation Accuracy: 0.7570
Best Threshold: 0.50 -> CV Accuracy: 0.7570
Submission saved with optimized threshold
