In [None]:
import pandas as pd
import numpy as np
import matplotlib
import sklearn

In [None]:
# Read the train and test application tables from the shared data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/application_train.csv',
        '../data/application_test.csv',
    )
)


In [None]:
# Drop every row of `train` that has a missing value in ANY column (mutates `train` in place).
# NOTE(review): once this runs, `train` holds no NaN, so the missing-value summary computed
# further down reports zero affected columns unless a later step reintroduces NaN.
# `test` is not filtered here - confirm that asymmetry is intended.
train.dropna(inplace=True)

# Ad-hoc inspection helpers, left disabled:
# train.sample(20)
# train.info()
# train.columns
# print(train.NAME_TYPE_SUITE.unique())


In [None]:
import matplotlib.pyplot as plt

# Value of DAYS_EMPLOYED that this notebook treats as anomalous.
DAYS_EMPLOYED_SENTINEL = 365243

# Summary statistics before the sentinel is removed.
print(train['DAYS_EMPLOYED'].describe())

# Dividing by -365 rescales DAYS_BIRTH to years
# (presumably the column stores negative day counts - TODO confirm).
print((train['DAYS_BIRTH'] / -365).describe())

# Create an anomalous flag column (must happen before the sentinel is overwritten)
train['DAYS_EMPLOYED_ANOM'] = train["DAYS_EMPLOYED"] == DAYS_EMPLOYED_SENTINEL

# Replace the anomalous values with nan.
# `Series.replace` returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the column unchanged.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_SENTINEL: np.nan})

train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram');
plt.xlabel('Days Employment');


In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null, indexed by
        column name, with the columns ``'Missing Values'`` (null count) and
        ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order.

    Also prints a two-line summary of how many columns are affected.
    """
    # Total missing values (computed once and reused for the percentage)
    mis_val = df.isnull().sum()

    # Percentage of missing values; for a zero-row frame this is 0/0 = NaN
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results, naming the columns directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that really have nulls. Filtering on the count rather
    # than on "percentage != 0" avoids NaN percentages (empty frame) being
    # mistaken for missing data.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

# Build the missing-value summary for the training frame and preview its first 20 rows
missing_values = missing_values_table(df=train)
missing_values.head(n=20)

Many columns have missing values. Some of the models I am training (e.g. LightGBM) can handle missing values natively, but scikit-learn's `LogisticRegression` cannot; a common approach is to add a boolean missing-indicator feature, like so:
```python
df['COMMONAREA_MEDI_missing'] = df['COMMONAREA_MEDI'].isnull().astype(int)
df['COMMONAREA_MEDI'] = df['COMMONAREA_MEDI'].fillna(-1)

# Imputation is another technique that can be used, for example:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer  # replaces the removed `sklearn.preprocessing.Imputer`
from sklearn.pipeline import Pipeline

# Make sure to drop the ids and target
train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
test = test.drop(columns = ['SK_ID_CURR'])
```

There are many forms of imputation; this is one form, median imputation followed by PCA:
```python
# Make a pipeline with imputation and pca
pipeline = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
             ('pca', PCA())])

# Fit and transform on the training data
train_pca = pipeline.fit_transform(train)

# transform the testing data
test_pca = pipeline.transform(test)
```



In [None]:
# Columns removed from both frames before modelling.
cols_to_drop = [
    "SK_ID_CURR",
    "OWN_CAR_AGE",
    "DAYS_EMPLOYED",
    "WEEKDAY_APPR_PROCESS_START",
    "HOUR_APPR_PROCESS_START",
    "WALLSMATERIAL_MODE",
]

# errors="ignore" tolerates columns that are absent from either frame.
for frame in (train, test):
    frame.drop(columns=cols_to_drop, errors="ignore", inplace=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Split features/label
y = train["TARGET"]
X = train.drop(columns=["TARGET"])

# Keep only the columns present in both the training features and the test frame.
# The alignment has to start from `X`; `train_X` does not exist before this line.
train_X, test_X = X.align(test, join='inner', axis=1)

print("Training shape:", train_X.shape)
print("Testing shape:", test_X.shape)

# Categorical columns
one_hot_cols = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
    'EMERGENCYSTATE_MODE'
]

# Keep only columns that actually exist after align
one_hot_cols = [c for c in one_hot_cols if c in train_X.columns]

# Split into binary vs multi-class categorical, judged on the aligned training
# features (`nunique` ignores NaN, matching a dropna-then-unique count).
binary_cats = [c for c in one_hot_cols if train_X[c].nunique() <= 2]
multi_cats = [c for c in one_hot_cols if train_X[c].nunique() > 2]

print("Binary categorical columns:", binary_cats)
print("Multi-class categorical columns:", multi_cats)

# Build the column transformer
ct = ColumnTransformer(
    transformers=[
        # Categories seen only at transform time are encoded as -1 instead of
        # raising, mirroring handle_unknown='ignore' on the one-hot branch.
        (
            "binary",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            binary_cats,
        ),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), multi_cats)
    ],
    remainder="passthrough"  # keep numerical columns
)

# Fit on the training features only, then apply the same encoding to the test features
X_enc = ct.fit_transform(train_X)
X_test_enc = ct.transform(test_X)

# Get feature names (column order of X_enc / X_test_enc)
feature_names = ct.get_feature_names_out()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score
)
from sklearn.model_selection import cross_val_predict

def scores(y_true, y_pred, y_pred_proba):
    """Print classification metrics for one set of predictions.

    Accuracy, precision, recall, F1 and the confusion matrix are computed from
    the hard labels ``y_pred``; ROC-AUC is computed from ``y_pred_proba``.
    Nothing is returned.
    """
    # Metrics that compare true labels against predicted labels, in print order.
    label_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for caption, metric in label_metrics:
        print(caption, metric(y_true, y_pred))

    # ROC-AUC is the only metric here that takes the scores instead of the labels.
    print("ROC-AUC:", roc_auc_score(y_true, y_pred_proba))

from sklearn.impute import SimpleImputer

logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    max_iter=200,
    random_state=42,  # liblinear shuffles the data; seed it so reruns give the same fit
)

# CV predictions: run the 5-fold CV once for probabilities and derive the hard
# labels from them, instead of refitting every fold a second time for `predict`.
cv_proba = cross_val_predict(logreg, X_enc, y, cv=5, method="predict_proba")
y_pred_proba = cv_proba[:, 1]

# Probability columns follow the sorted class labels; a positive-class
# probability above 0.5 selects the second label, which is what `predict` does.
class_labels = np.unique(y)
y_pred = class_labels[(y_pred_proba > 0.5).astype(int)]

print("\nLogistic Regression CV Scores")
scores(y, y_pred, y_pred_proba)

# Fit final model
logreg.fit(X_enc, y)

# Predict on test set.
# Only `train` had its incomplete rows dropped, so the encoded test matrix may
# still contain NaN, which LogisticRegression rejects. Fill those gaps with the
# training medians; this is a no-op when the test matrix has no NaN.
test_imputer = SimpleImputer(strategy="median").fit(X_enc)
logreg_test_pred = logreg.predict_proba(test_imputer.transform(X_test_enc))[:, 1]

# Feature importance: absolute coefficient size, largest first
coef_importance = pd.Series(
    np.abs(logreg.coef_[0]),
    index=feature_names
).sort_values(ascending=False)

print("\nTop 20 Logistic Regression Features:")
print(coef_importance.head(20))

In [None]:
from lightgbm import LGBMClassifier

# LightGBM Model
lgbm = LGBMClassifier(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    n_estimators=800,
    learning_rate=0.02,
    num_leaves=31,
    # NOTE(review): LightGBM appears to apply row subsampling only when
    # `subsample_freq` > 0 (its default is 0), so this value may be inert -
    # confirm, and set `subsample_freq` if bagging is intended.
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

# Cross-validated predictions (5-fold): one pass for probabilities, with the
# hard labels derived from them rather than refitting all folds for `predict`.
lgb_cv_proba = cross_val_predict(
    lgbm, X_enc, y,
    cv=5,
    method="predict_proba"
)
lgb_pred_proba = lgb_cv_proba[:, 1]

# Probability columns follow the sorted class labels; a positive-class
# probability above 0.5 selects the second label (argmax of the two columns).
lgb_pred = np.unique(y)[(lgb_pred_proba > 0.5).astype(int)]

print("\nLightGBM CV Scores")
scores(y, lgb_pred, lgb_pred_proba)

# Fit final model on ALL training data
lgbm.fit(X_enc, y)

# Test-set prediction is currently disabled:
# lgbm_test_pred = lgbm.predict_proba(X_test_enc)[:, 1]

# Feature importance as reported by the fitted model, largest first
lgb_importance = pd.Series(
    lgbm.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print("\nTop 20 LightGBM Features:")
print(lgb_importance.head(20))
