In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# train.dropna(inplace=True)
# df.sample(20)
# df.info()
# df.columns

# print(df.NAME_TYPE_SUITE.unique())


In [3]:
# 1. Load train & test
train = pd.read_csv('../data/application_train.csv')
test  = pd.read_csv('../data/application_test.csv')

In [4]:
# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

# Missing values statistics
missing_values = missing_values_table(train)
missing_values.head(20)

Your selected dataframe has 122 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MEDI,213514,69.4
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4
FONDKAPREMONT_MODE,210295,68.4
LIVINGAPARTMENTS_MODE,210199,68.4
LIVINGAPARTMENTS_MEDI,210199,68.4
LIVINGAPARTMENTS_AVG,210199,68.4


There are a lot with missing values. The models I am training can handle missing values natively, but many make a boolean missing indicator feature like so: 
```python
df['COMMONAREA_MEDI_missing'] = df['COMMONAREA_MEDI'].isnull().astype(int)
df['COMMONAREA_MEDI'] = df['COMMONAREA_MEDI'].fillna(-1)

# Imputation is also a technique that can be used. Like so: 
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

# Make sure to drop the ids and target
train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
test = test.drop(columns = ['SK_ID_CURR'])
```

There are many forms of imputation but this is one form with PCA: 
```python
# Make a pipeline with imputation and pca
pipeline = Pipeline(steps = [('imputer', Imputer(strategy = 'median')),
             ('pca', PCA())])

# Fit and transform on the training data
train_pca = pipeline.fit_transform(train)

# transform the testing data
test_pca = pipeline.transform(test)
```



In [5]:
# 2. Fix DAYS_EMPLOYED anomaly
train['DAYS_EMPLOYED_ANOM'] = (train['DAYS_EMPLOYED'] == 365243).astype(int)
test['DAYS_EMPLOYED_ANOM']  = (test['DAYS_EMPLOYED'] == 365243).astype(int)

train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({365243: np.nan})
test['DAYS_EMPLOYED']  = test['DAYS_EMPLOYED'].replace({365243: np.nan})

In [6]:
# 3. Drop the same columns from train & test BEFORE encoding
cols_to_drop = [
    "SK_ID_CURR", "OWN_CAR_AGE", "DAYS_EMPLOYED",
    "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START",
    "WALLSMATERIAL_MODE", "N"
]

train.drop(columns=cols_to_drop, errors="ignore", inplace=True)
test.drop(columns=cols_to_drop, errors="ignore", inplace=True)

In [7]:
# 4. Split target
y = train["TARGET"]
train_X = train.drop(columns=["TARGET"])

# 5. Align train and test *before* encoding
train_X, test_X = train_X.align(test, join='inner', axis=1)

print("Training shape:", train_X.shape)
print("Testing shape:", test_X.shape)

# 6. Identify categorical columns (only those that exist)
one_hot_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
    'EMERGENCYSTATE_MODE'
]

# Keep only existing cols after align
one_hot_cols = [c for c in one_hot_cols if c in train_X.columns]

# Split by cardinality
binary_cats = []
multi_cats  = []

for col in one_hot_cols:
    unique_vals = train_X[col].dropna().unique()
    if len(unique_vals) <= 2:
        binary_cats.append(col)
    else:
        multi_cats.append(col)

print("Binary categorical columns:", binary_cats)
print("Multi-class categorical columns:", multi_cats)

# 7. Build ColumnTransformer
ct = ColumnTransformer(
    transformers=[
        ("binary", OrdinalEncoder(), binary_cats),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), multi_cats),
    ],
    remainder="passthrough"
)

# 8. Fit on train_X, transform both train_X and test_X
X_enc = ct.fit_transform(train_X)
X_test_enc = ct.transform(test_X)

# 9. Extract final feature names
feature_names = ct.get_feature_names_out()
print("Final encoded training shape:", X_enc.shape)
print("Final encoded testing shape:", X_test_enc.shape)


Training shape: (307511, 116)
Testing shape: (48744, 116)
Binary categorical columns: ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']
Multi-class categorical columns: ['CODE_GENDER', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE']
Final encoded training shape: (307511, 228)
Final encoded testing shape: (48744, 228)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
from sklearn.model_selection import cross_val_predict

def scores(y_true, y_pred, y_pred_proba):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1:", f1_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("ROC-AUC:", roc_auc_score(y_true, y_pred_proba))

# logreg = LogisticRegression(
#     penalty="l1",
#     solver="liblinear",
#     class_weight="balanced",
#     max_iter=200
# )

# # CV predictions
# y_pred = cross_val_predict(logreg, X_enc, y, cv=5, method="predict")
# y_pred_proba = cross_val_predict(logreg, X_enc, y, cv=5, method="predict_proba")[:, 1]

# print("\nLogistic Regression CV Scores")
# scores(y, y_pred, y_pred_proba)

# # Fit final model
# logreg.fit(X_enc, y)

# # Predict on test set
# logreg_test_pred = logreg.predict_proba(X_test_enc)[:, 1]

# # Feature importance
# coef_importance = pd.Series(
#     abs(logreg.coef_[0]),
#     index=feature_names
# ).sort_values(ascending=False)

# print("\nTop 20 Logistic Regression Features:")
# print(coef_importance.head(20))

In [11]:
from lightgbm import LGBMClassifier

# LightGBM Model
lgbm = LGBMClassifier(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    n_estimators=800,
    learning_rate=0.02,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

# Cross-validated predictions (5-fold)
lgb_pred = cross_val_predict(
    lgbm, X_enc, y,
    cv=5,
    method="predict"
)

lgb_pred_proba = cross_val_predict(
    lgbm, X_enc, y,
    cv=5,
    method="predict_proba"
)[:, 1]

print("\nLightGBM CV Scores")
scores(y, lgb_pred, lgb_pred_proba)

# Fit final model on ALL training data
lgbm.fit(X_enc, y)

# # Predict on real test data
lgbm_test_pred = lgbm.predict_proba(X_test_enc)[:, 1]

# Feature importance
lgb_importance = pd.Series(
    lgbm.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print("\nTop 20 LightGBM Features:")
print(lgb_importance.head(20))


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11113
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11201
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11111
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 219
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11125
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11096
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 218
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11113
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11201
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11111
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 219
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11125
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11096
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 218
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000





LightGBM CV Scores
Accuracy: 0.7150280802963146
Precision: 0.17154242801409908
Recall: 0.6606646525679758
F1: 0.2723649467758274
Confusion Matrix:
 [[203478  79208]
 [  8424  16401]]
ROC-AUC: 0.7579055977039819
[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11117
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 219
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





Top 20 LightGBM Features:
remainder__EXT_SOURCE_1                  1698
remainder__EXT_SOURCE_3                  1695
remainder__EXT_SOURCE_2                  1339
remainder__DAYS_BIRTH                    1307
remainder__AMT_CREDIT                    1241
remainder__AMT_ANNUITY                   1145
remainder__AMT_GOODS_PRICE               1078
remainder__DAYS_ID_PUBLISH                914
remainder__DAYS_LAST_PHONE_CHANGE         775
remainder__DAYS_REGISTRATION              742
remainder__AMT_INCOME_TOTAL               545
remainder__REGION_POPULATION_RELATIVE     446
remainder__AMT_REQ_CREDIT_BUREAU_YEAR     331
remainder__TOTALAREA_MODE                 253
onehot__CODE_GENDER_F                     234
binary__FLAG_OWN_CAR                      222
remainder__AMT_REQ_CREDIT_BUREAU_QRT      221
remainder__LANDAREA_AVG                   208
remainder__APARTMENTS_MODE                207
binary__NAME_CONTRACT_TYPE                198
dtype: int32
