In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input directory and echo the full path of every
# file found, one per line, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
DATA_DIR = '/kaggle/input/start-up-founder-retention-prediction'
TARGET_COL = "retention_status"
ID_COL = "founder_id"
# Probability cut-off applied to P(encoded class 1) when producing predictions.
DECISION_THRESHOLD = 0.5


# ---------------------------------------------------------------------------
# Load data and split features / target / ids.
# ---------------------------------------------------------------------------
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')


X_train = train_df.drop([TARGET_COL, ID_COL], axis=1)
X_test = test_df.drop(ID_COL, axis=1)
founder_ids = test_df[ID_COL]


# Encode the string target as integers; `le` is reused at the end to map the
# integer predictions back to the original labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df[TARGET_COL])

# The prediction step thresholds the probability of encoded class 1 and feeds
# the resulting 0/1 values to `le.inverse_transform`. That is only meaningful
# for a two-class target, so fail loudly instead of writing a wrong submission.
assert len(le.classes_) == 2, (
    f"Expected a binary target, found classes: {list(le.classes_)}"
)


# ---------------------------------------------------------------------------
# Feature groups. Columns not listed in any group are dropped by the
# ColumnTransformer below (remainder='drop').
# ---------------------------------------------------------------------------
numerical_cols = [
    'founder_age', 'years_with_startup', 'monthly_revenue_generated',
    'funding_rounds_led', 'distance_from_investor_hub',
    'num_dependents', 'years_since_founding'
]

# Nominal categorical features (no inherent order) -> one-hot encoded
nominal_cols = [
    'founder_gender', 'founder_role', 'working_overtime',
    'education_background', 'personal_status', 'startup_stage',
    'team_size_category', 'remote_operations', 'leadership_scope',
    'innovation_support'
]

# Ordinal categorical features (ordered levels) -> integer encoded
ordinal_cols = [
    'work_life_balance_rating', 'venture_satisfaction',
    'startup_performance_rating', 'startup_reputation', 'founder_visibility'
]

# Lowest-to-highest level order for each ordinal column, positionally aligned
# with `ordinal_cols`.
ordinal_categories = [
    ['Poor', 'Average', 'Good', 'Excellent'],  # work_life_balance_rating
    ['Low', 'Medium', 'High'],                 # venture_satisfaction
    ['Low', 'Average', 'High'],                 # startup_performance_rating
    ['Poor', 'Fair', 'Good', 'Excellent'],    # startup_reputation
    ['Low', 'Medium', 'High']                  # founder_visibility
]

# The ordinal encoder below maps every label that is NOT in the lists above to
# -1 without raising. A typo or a missing level here would therefore silently
# collapse real categories into the "unknown" code. Surface any such labels
# (NaN is excluded: it is imputed separately) so the lists can be corrected.
for frame_name, frame in (("train", X_train), ("test", X_test)):
    for col, allowed_levels in zip(ordinal_cols, ordinal_categories):
        seen_levels = set(frame[col].dropna().astype(str))
        unexpected_levels = sorted(seen_levels - set(allowed_levels))
        if unexpected_levels:
            print(
                f"Warning: {frame_name} column '{col}' contains labels missing "
                f"from its declared order {allowed_levels}: {unexpected_levels} "
                f"(these will be encoded as -1)"
            )


# ---------------------------------------------------------------------------
# Preprocessing pipelines, one per feature group.
# ---------------------------------------------------------------------------
# Numeric: median-impute, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


# Nominal: mode-impute, then dense one-hot; categories unseen during fit are
# encoded as all-zero rows instead of raising.
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


# Ordinal: missing values are replaced by the literal 'Missing', which is not
# in any category list, so they end up with the unknown code -1 (i.e. below
# the lowest declared level).
# NOTE(review): these integer codes are fed to the regularised classifier
# unscaled, unlike the numeric features - confirm this is intended.
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('nom', nominal_pipeline, nominal_cols),
        ('ord', ordinal_pipeline, ordinal_cols)
    ],
    remainder='drop'
)


# ---------------------------------------------------------------------------
# Model and hyperparameter search.
# ---------------------------------------------------------------------------
# Preprocessing lives inside the pipeline so that each CV fold fits its
# imputers/scalers/encoders on that fold's training split only.
logreg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=10000))
])


# The liblinear solver is used for both the 'l1' and 'l2' penalties in the grid.
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']
}

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(
    logreg_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)


grid_search.fit(X_train, y_train)

# Report the best mean CV accuracy and the hyperparameters that achieved it.
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
print(f"Best hyperparameters: {grid_search.best_params_}")


# ---------------------------------------------------------------------------
# Predict on the test set and write the submission file.
# ---------------------------------------------------------------------------
# best_estimator_ is the full pipeline refit on all training data (GridSearchCV
# refits by default).
best_logreg = grid_search.best_estimator_


# Column 1 of predict_proba is the probability of encoded class 1.
y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]
y_pred_numeric = (y_pred_proba > DECISION_THRESHOLD).astype(int)


# Map the 0/1 predictions back to the original target labels.
y_pred = le.inverse_transform(y_pred_numeric)


submission = pd.DataFrame({ID_COL: founder_ids, TARGET_COL: y_pred})
submission.to_csv('submission.csv', index=False)

print("Submission file created.")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best cross-validation accuracy: 0.7418
Best hyperparameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Submission file created.
