In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/start-up-founder-retention-prediction/sample_submission.csv
/kaggle/input/start-up-founder-retention-prediction/train.csv
/kaggle/input/start-up-founder-retention-prediction/test.csv


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

train_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/train.csv") 
test_df = pd.read_csv("/kaggle/input/start-up-founder-retention-prediction/test.csv")


if not train_df.index.is_unique: 
    train_df = train_df.reset_index(drop=True)
if not test_df.index.is_unique: 
    test_df = test_df.reset_index(drop=True)


founder_ids = test_df["founder_id"]
X_train_data = train_df.drop(["retention_status", "founder_id"], axis=1)
X_test_data = test_df.drop("founder_id", axis=1)


le = LabelEncoder()
y_train = le.fit_transform(train_df["retention_status"])


nominal_cols = [
    'founder_gender', 'founder_role', 'working_overtime', 
    'education_background', 'personal_status', 'startup_stage', 
    'team_size_category', 'remote_operations', 'leadership_scope', 
    'innovation_support'
]
ordinal_cols = [
    'work_life_balance_rating', 'venture_satisfaction', 
    'startup_performance_rating', 'startup_reputation', 'founder_visibility'
]
ordinal_categories = [
    ['Poor', 'Average', 'Good', 'Excellent'],
    ['Low', 'Medium', 'High'],
    ['Low', 'Average', 'High'],
    ['Poor', 'Fair', 'Good', 'Excellent'],
    ['Low', 'Medium', 'High']
]

def feature_engineer_bayes(df):
    
    df_copy = df.copy()
    
    
    if 'monthly_revenue_generated' in df_copy.columns:
        df_copy['log_revenue'] = np.log1p(df_copy['monthly_revenue_generated'])
        df_copy = df_copy.drop('monthly_revenue_generated', axis=1)
        
    
    if 'years_with_startup' in df_copy.columns:
        df_copy['log_years_startup'] = np.log1p(df_copy['years_with_startup'])
        df_copy = df_copy.drop('years_with_startup', axis=1)

    
    df_copy['is_near_hub'] = (df_copy['distance_from_investor_hub'] < df_copy['distance_from_investor_hub'].median()).astype(int)
    
    return df_copy


X_train_fe = feature_engineer_bayes(X_train_data)
X_test_fe = feature_engineer_bayes(X_test_data)


numerical_cols_bayes = [
    'founder_age', 'funding_rounds_led', 'distance_from_investor_hub', 
    'num_dependents', 'years_since_founding', 'log_revenue', 'log_years_startup'
]
nominal_cols_bayes = nominal_cols + ['is_near_hub'] 


X_train = X_train_fe[numerical_cols_bayes + nominal_cols_bayes + ordinal_cols]
X_test = X_test_fe[numerical_cols_bayes + nominal_cols_bayes + ordinal_cols]


numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()) # Vital for Naive Bayes
])

nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler()) # Scale ordinals for Bayes
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_bayes),
        ('nom', nominal_transformer, nominal_cols_bayes),
        ('ord', ordinal_transformer, ordinal_cols)
    ],
    remainder='drop'
)


nb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])


param_grid = {
    'classifier__var_smoothing': np.logspace(0, -9, num=100)
}

grid_search = GridSearchCV(
    nb_pipeline, 
    param_grid, 
    cv=5, 
    scoring='accuracy', 
    verbose=1, 
    n_jobs=-1
)

print("Training Gaussian Naive Bayes...")
grid_search.fit(X_train, y_train)

print(f"Best Accuracy: {grid_search.best_score_:.4f}")
print(f"Best Smoothing: {grid_search.best_params_}")


best_model = grid_search.best_estimator_
y_pred = le.inverse_transform(best_model.predict(X_test))

submission = pd.DataFrame({'founder_id': founder_ids, 'retention_status': y_pred})
submission.to_csv('naive_bayes_optimized_submission.csv', index=False)
print("Submission saved!")

Training Gaussian Naive Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Best Accuracy: 0.7319
Best Smoothing: {'classifier__var_smoothing': 0.1873817422860384}
Submission saved!
