In [None]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn

In [None]:
# Locations of the raw application tables, relative to the notebook directory.
TRAIN_CSV = '../data/application_train.csv'
TEST_CSV = '../data/application_test.csv'

# Load the labelled training table first, then the test table.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)


In [None]:
# Keep only fully populated training rows.
# NOTE: `test` is not filtered here, so it can still contain NaNs.
n_rows_before = len(train)
train = train.dropna()
n_rows_dropped = n_rows_before - len(train)
print(f"dropna removed {n_rows_dropped} of {n_rows_before} training rows")

# Fail early with a clear message rather than an obscure error at fit time.
if train.empty:
    raise ValueError(
        "No training rows remain after dropna(); impute missing values instead."
    )


In [None]:
# Columns removed from both the training and the test frame before modelling.
cols_to_drop = [
    "SK_ID_CURR",
    "OWN_CAR_AGE",
    "DAYS_EMPLOYED",
    "WEEKDAY_APPR_PROCESS_START",
    "HOUR_APPR_PROCESS_START",
    "WALLSMATERIAL_MODE",
]

# errors="ignore" skips columns that are absent, so the cell can be re-run
# after the columns have already been removed.
for frame in (train, test):
    frame.drop(columns=cols_to_drop, errors="ignore", inplace=True)

In [None]:
# There are a lot of uninformative features, so L1 regularization is very useful here.
# I will initially train a Logistic Regression model with sklearn.
# Logistic Regression

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Split features/label
y = train["TARGET"]
X = train.drop(columns=["TARGET"])

# Categorical columns that have to be encoded before modelling
one_hot_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
    'EMERGENCYSTATE_MODE'
]

# Split into binary vs multi-class categorical, based on the number of
# distinct non-null values observed in the training features.
binary_cats = []
multi_cats = []

for col in one_hot_cols:
    n_categories = X[col].nunique(dropna=True)
    if n_categories <= 2:
        binary_cats.append(col)
    else:
        multi_cats.append(col)

print("Binary categorical columns:", binary_cats)
print("Multi-class categorical columns:", multi_cats)

# Build the column transformer.
#
# The encoders are fitted on `train`, whose incomplete rows were dropped,
# but are then applied to `test`, which is never NaN-filtered in this
# notebook. The transformer therefore has to tolerate values it has not seen:
#   * OrdinalEncoder raises on unseen categories (including NaN) by default,
#     so unseen values are mapped to the sentinel code -1 instead.
#   * OneHotEncoder(handle_unknown='ignore') encodes unseen values as all zeros.
#   * Remaining columns are median-imputed with statistics learned from the
#     training data; rows without NaN pass through unchanged.
#     Assumes every column outside `one_hot_cols` is numeric, as the
#     original passthrough did -- TODO confirm.
ct = ColumnTransformer(
    transformers=[
        (
            "binary",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            binary_cats,
        ),
        (
            "onehot",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            multi_cats,
        ),
    ],
    remainder=SimpleImputer(strategy="median"),
)

# Fit on the training features only, then apply the same mapping to test
X_enc = ct.fit_transform(X)
X_test_enc = ct.transform(test)

# Get output feature names (one per encoded column, in output order)
feature_names = ct.get_feature_names_out()

# Train and test must end up with the same encoded feature layout.
assert X_enc.shape[1] == X_test_enc.shape[1] == len(feature_names)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
from sklearn.model_selection import cross_val_predict

# Score function
def scores(y_true, y_pred, y_pred_proba):
    """Print classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for accuracy, precision,
            recall, F1 and the confusion matrix.
        y_pred_proba: Predicted scores passed to ROC-AUC.

    Returns:
        None. All results are written to stdout.
    """
    # Metrics computed from hard labels, printed in this fixed order.
    label_based_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
    )
    for caption, metric_fn in label_based_metrics:
        print(caption, metric_fn(y_true, y_pred))

    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

    # ROC-AUC is the only metric fed the continuous scores.
    print("ROC-AUC:", roc_auc_score(y_true, y_pred_proba))

# Logistic Regression Model
# The L1 penalty can shrink coefficients to exactly zero; liblinear is a
# solver that supports it.
logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    max_iter=200,
    random_state=42,  # liblinear shuffles the data; fix the seed for repeatable fits
)

# Cross-validated probabilities (5-fold) on training set.
# A single pass is used for both the probabilities and the hard labels, so
# every fold model is fitted once and both outputs come from the same fits.
cv_proba = cross_val_predict(
    logreg, X_enc, y,
    cv=5,
    method="predict_proba"
)
y_pred_proba = cv_proba[:, 1]

# Hard labels follow the decision rule of `predict` for a binary problem:
# the second (sorted) class is chosen when its probability exceeds 0.5.
class_labels = np.unique(y)
y_pred = class_labels[(y_pred_proba > 0.5).astype(int)]

print("\nLogistic Regression CV Scores")
scores(y, y_pred, y_pred_proba)

# Fit final model on all training data
logreg.fit(X_enc, y)

# Predict on real test data (probability of the second class)
logreg_test_pred = logreg.predict_proba(X_test_enc)[:, 1]

# Optional: Logistic Regression feature importance.
# NOTE: the model is fitted on X_enc as-is (no scaling step in this cell), so
# coefficient magnitudes depend on each feature's units and are only a rough
# ranking, not a scale-free importance.
coef_importance = pd.Series(
    np.abs(logreg.coef_[0]),
    index=feature_names
).sort_values(ascending=False)

print("\nTop 20 Logistic Regression Features:")
print(coef_importance.head(20))


In [None]:
from lightgbm import LGBMClassifier

# LightGBM Model
lgbm = LGBMClassifier(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    n_estimators=800,
    learning_rate=0.02,
    num_leaves=31,
    subsample=0.8,
    # Row subsampling is only performed when subsample_freq > 0; without it
    # the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

# Cross-validated probabilities (5-fold).
# One pass serves both outputs: running cross_val_predict separately for
# labels and probabilities would refit all 800-tree fold models twice.
lgb_cv_proba = cross_val_predict(
    lgbm, X_enc, y,
    cv=5,
    method="predict_proba"
)
lgb_pred_proba = lgb_cv_proba[:, 1]

# Hard labels: pick the second (sorted) class when its probability exceeds
# 0.5, which is what an argmax over the two probability columns yields.
lgb_class_labels = np.unique(y)
lgb_pred = lgb_class_labels[(lgb_pred_proba > 0.5).astype(int)]

print("\nLightGBM CV Scores")
scores(y, lgb_pred, lgb_pred_proba)

# Fit final model on ALL training data
lgbm.fit(X_enc, y)

# Predict on real test data (probability of the second class)
lgbm_test_pred = lgbm.predict_proba(X_test_enc)[:, 1]

# Feature importance as reported by the fitted model, highest first
lgb_importance = pd.Series(
    lgbm.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print("\nTop 20 LightGBM Features:")
print(lgb_importance.head(20))
