In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


train_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/train.csv") 
test_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/test.csv") 


founder_ids = test_df["founder_id"]


X = train_df.drop(["retention_status", "founder_id"], axis=1)
X_test = test_df.drop("founder_id", axis=1)
y = train_df["retention_status"]


target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
print(f"Target Mapping: {dict(zip(target_le.classes_, target_le.transform(target_le.classes_)))}")


def engineer_features(df):
    df = df.copy()
    wlb_map = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3} 
    ot_map = {'Yes': 1, 'No': 0}
    df['wlb_score'] = df['work_life_balance_rating'].map(wlb_map)
    df['wlb_score'] = df['wlb_score'].fillna(df['wlb_score'].median()) 
    df['ot_score'] = df['working_overtime'].map(ot_map).fillna(0) 
    df['burnout_index'] = df['ot_score'] / (df['wlb_score'] + 1e-6) 
    
    
    size_map = {'Small': 1, 'Medium': 2, 'Large': 3} 
    df['team_size_est'] = df['team_size_category'].map(size_map)
    df['team_size_est'] = df['team_size_est'].fillna(df['team_size_est'].median())
    
    
    df['monthly_revenue_generated'] = pd.to_numeric(df['monthly_revenue_generated'], errors='coerce')
    df['monthly_revenue_generated_log'] = np.log1p(df['monthly_revenue_generated']) 
    
    
    df['life_investment_ratio'] = df['years_with_startup'] / (df['founder_age'] + 1e-6)
    
    
    df['is_remote_from_hub'] = (df['distance_from_investor_hub'] > 70).astype(int) 

    
    df = df.drop(columns=['work_life_balance_rating', 'working_overtime', 'team_size_category', 'monthly_revenue_generated'])
    
    return df

X_eng = engineer_features(X)
X_test_eng = engineer_features(X_test)


num_cols = X_eng.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_eng.select_dtypes(include='object').columns.tolist()


num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) 
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ],
    remainder='drop'
)


X_fit, X_val, y_fit, y_val = train_test_split(
    X_eng, y_encoded, test_size=0.80, random_state=42, stratify=y_encoded
)


sv = SVC(kernel='rbf', random_state=42, class_weight='balanced')


clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', sv)
])


param_grid = {
    'classifier__C': [0.1, 1, 10],      
    'classifier__gamma': ['scale', 0.1, 1],
    'classifier__kernel': ['rbf'] 
}




grid_search = GridSearchCV(
    clf_pipeline, 
    param_grid=param_grid, 
    cv=5, 
    verbose=1, 
    n_jobs=-1, 
    scoring='accuracy',
    )

grid_search.fit(X_fit, y_fit)


best_model = grid_search.best_estimator_



print(f"Best Parameters found: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy on 20% fit set: {grid_search.best_score_:.4f}")


val_predictions = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)


print(f"Accuracy: {val_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_val, val_predictions, target_names=['Left', 'Stayed']))


test_predictions_encoded = best_model.predict(X_test_eng)
final_preds_labels = target_le.inverse_transform(test_predictions_encoded)


submission = pd.DataFrame({
    'founder_id': founder_ids, 
    'retention_status': final_preds_labels
})


submission.to_csv('submission.csv', index=False)


Target Mapping: {'Left': 0, 'Stayed': 1}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters found: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
Best Cross-Validation Accuracy on 20% fit set: 0.7439
Accuracy: 0.7408
Classification Report:
              precision    recall  f1-score   support

        Left       0.72      0.75      0.73     22677
      Stayed       0.76      0.73      0.75     25012

    accuracy                           0.74     47689
   macro avg       0.74      0.74      0.74     47689
weighted avg       0.74      0.74      0.74     47689

