In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# Switched model from MLPClassifier to SVC
from sklearn.svm import SVC 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA



train_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/test.csv")

if not train_df.index.is_unique:
    train_df = train_df.reset_index(drop=True)
if not test_df.index.is_unique:
    test_df = test_df.reset_index(drop=True)

founder_ids = test_df["founder_id"]

X_full = train_df.drop(["retention_status", "founder_id"], axis=1)
y_full = train_df["retention_status"]
X_test_data = test_df.drop("founder_id", axis=1)

le = LabelEncoder()
y_full_encoded = le.fit_transform(y_full)


X_train_data, X_val_data, y_train_encoded, y_val_encoded = train_test_split(
    X_full,
    y_full_encoded,
    test_size=0.0, # 0% for validation (100% for training)
    random_state=42,
    stratify=y_full_encoded
)


def feature_engineer_improved(df):
    df_copy = df.copy()
    if 'monthly_revenue_generated' in df_copy.columns:
        df_copy['log_revenue'] = np.log1p(df_copy['monthly_revenue_generated'])
    if 'years_with_startup' in df_copy.columns:
        df_copy['log_years_startup'] = np.log1p(df_copy['years_with_startup'])
    if 'distance_from_investor_hub' in df_copy.columns:
        median_distance = df_copy['distance_from_investor_hub'].median()
        df_copy['is_near_hub'] = (df_copy['distance_from_investor_hub'].fillna(median_distance) < median_distance).astype(int)
    if 'years_with_startup' in df_copy.columns and 'founder_age' in df_copy.columns:
        df_copy['experience_ratio'] = df_copy['years_with_startup'] / (df_copy['founder_age'] + 1e-6)
    
    df_copy = df_copy.drop(['monthly_revenue_generated', 'years_with_startup'], axis=1, errors='ignore')
    return df_copy

X_train_fe = feature_engineer_improved(X_train_data)
X_val_fe = feature_engineer_improved(X_val_data)
X_test_data_fe = feature_engineer_improved(X_test_data)
X_full_fe = feature_engineer_improved(X_full)


nominal_cols = [
    'founder_gender', 'founder_role', 'working_overtime', 'education_background',
    'personal_status', 'startup_stage', 'team_size_category',
    'remote_operations', 'leadership_scope', 'innovation_support']
ordinal_cols = [
    'work_life_balance_rating', 'venture_satisfaction', 'startup_performance_rating',
    'startup_reputation', 'founder_visibility']
ordinal_categories = [
    ['Poor', 'Average', 'Good', 'Excellent'], ['Low', 'Medium', 'High'],
    ['Low', 'Average', 'High'], ['Poor', 'Fair', 'Good', 'Excellent'],
    ['Low', 'Medium', 'High']]
numerical_cols_improved = [
    'founder_age', 'funding_rounds_led', 'distance_from_investor_hub',
    'num_dependents', 'years_since_founding', 'log_revenue',
    'log_years_startup', 'experience_ratio']
nominal_cols_improved = nominal_cols + ['is_near_hub']

available_cols = list(X_train_fe.columns)
numerical_cols_filtered = [col for col in numerical_cols_improved if col in available_cols]
nominal_cols_filtered = [col for col in nominal_cols_improved if col in available_cols]
ordinal_cols_filtered = [col for col in ordinal_cols if col in available_cols]

X_train = X_train_fe[numerical_cols_filtered + nominal_cols_filtered + ordinal_cols_filtered]
X_val = X_val_fe[numerical_cols_filtered + nominal_cols_filtered + ordinal_cols_filtered]
X_test = X_test_data_fe[numerical_cols_filtered + nominal_cols_filtered + ordinal_cols_filtered]
X_full_final = X_full_fe[numerical_cols_filtered + nominal_cols_filtered + ordinal_cols_filtered]


numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)), ('scaler', StandardScaler()), ('pca', PCA(random_state=42))])
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
ordinal_categories_filtered = [
    ordinal_categories[i] for i, col in enumerate(ordinal_cols) if col in available_cols]
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')), 
    ('ordinal', OrdinalEncoder(categories=ordinal_categories_filtered, handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pca', numerical_transformer, numerical_cols_filtered),
        ('nom', nominal_transformer, nominal_cols_filtered),
        ('ord', ordinal_transformer, ordinal_cols_filtered)
    ],
    remainder='drop')


svc_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svc', SVC(random_state=42, probability=False)) 
])


param_grid_svc = {
    'preprocessor__num_pca__pca__n_components': [0.9, None],
    'svc__kernel': ['rbf', 'linear'], 
    'svc__C': [0.1, 1, 10], 
    
    'svc__gamma': ['scale', 0.01, 0.1],
}


cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    svc_pipeline,
    param_grid_svc,
    cv=cv_folds,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)


print("Starting Grid Search with Support Vector Classifier (SVC) on 20% Training Data...")
grid_search.fit(X_train, y_train_encoded)


print("\nSVC Grid Search Finished.")
print(f"Best Cross-Validation Accuracy on 20% Training Folds: {grid_search.best_score_:.4f}")
print(f"Optimal Hyperparameters: {grid_search.best_params_}")

best_model = grid_search.best_estimator_

final_model = best_model

final_model.set_params(**{k: v for k, v in grid_search.best_params_.items()})

final_model.fit(X_full_final, y_full_encoded)
print("Final model trained on X_full.")

y_pred = le.inverse_transform(best_model.predict(X_test))

submission = pd.DataFrame({'founder_id': founder_ids, 'retention_status': y_pred})
submission.to_csv('submission_svc_final.csv', index=False)


Starting Grid Search with Support Vector Classifier (SVC) on 20% Training Data...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)



SVC Grid Search Finished.
Best Cross-Validation Accuracy on 20% Training Folds: 0.7486
Optimal Hyperparameters: {'preprocessor__num_pca__pca__n_components': None, 'svc__C': 1, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Final model trained on X_full.
