In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score

In [2]:
# Helper metric functions

def roc_auc(actual, preds):
    """Manual ROC-AUC implementation copied from metrics.ipynb."""
    actual = np.array(actual) == 1
    tpr = []
    fpr = []
    for thresh in [x / 100.0 for x in range(0, 101)]:
        preds_t = np.array(preds) >= thresh
        tp = sum(preds_t & actual)
        fp = sum(preds_t & ~actual)
        tn = sum(~preds_t & ~actual)
        fn = sum(~preds_t & actual)
        tpr.append(tp / (tp + fn))
        fpr.append(tn / (tn + fp))

    auc = 0
    for i in range(0, len(tpr) - 1):
        auc += ((tpr[i] + tpr[i + 1]) / 2) * (fpr[i + 1] - fpr[i])
    return auc


def accuracy(actual, preds, thresh):
    # pred should be a list of predicted probabilities between 0 and 1 of a category
    # actual should be a list of 1s and 0s for the actual target category
    # thresh should be a float between 0 and 1
    # Returns the accuracy at the given threshold
    preds = np.array(preds) >= thresh
    actual = np.array(actual) == 1
    actual = (actual == 1)
    acc = np.count_nonzero(preds == actual)/len(actual)
    return acc

def precision(actual, preds, thresh):
    preds = (np.array(preds) >= thresh)
    actual = (np.array(actual) == 1)
    tp = np.count_nonzero(preds & actual)
    fp = np.count_nonzero(preds & ~actual)
    prec = tp/(tp+fp)
    return prec

def f1_score(actual, preds, thresh):
    preds = (np.array(preds) >= thresh)
    actual = (np.array(actual) == 1)
    tp = np.count_nonzero(preds & actual)
    fp = np.count_nonzero(preds & ~actual)
    fn = np.count_nonzero(~preds & actual)
    rec = tp/(tp+fn)
    prec = tp/(tp+fp)
    f1 = (2 * rec * prec)/(rec + prec)
    return(f1)

In [3]:
# ============================
# 1. LOAD DATA
# ============================
df = pd.read_csv("application_train.csv")
df = df.sample(n=30000, random_state=42)

# ============================
# 2. FIX SPECIAL VALUES
# ============================

# DAYS_EMPLOYED = 365243 means “no employment record”
df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)

# MAKE MISSINGNESS INDICATORS (best-practice for this dataset)
for col in df.columns:
    df[col + "_MISSING"] = df[col].isna().astype(int)

# ============================
# 3. IDENTIFY CATEGORICAL COLS
# ============================
categorical_cols = df.select_dtypes(include=["object"]).columns

# ============================
# 4. COERCE NUMERIC-LIKE COLS
# ============================
numeric_like_cols = df.columns.difference(categorical_cols)

df[numeric_like_cols] = df[numeric_like_cols].apply(
    lambda col: pd.to_numeric(col, errors="coerce")
)

# ============================
# 5. SET UP NUMERIC & CATEGORICAL COL LISTS
# ============================
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols.drop("TARGET")  # do NOT impute target

# ============================
# 6. IMPUTE NUMERICS (median) & CATEGORICALS (mode)
# ============================
num_imputer = SimpleImputer(strategy="median")
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Confirm no NaNs
print("Remaining NaNs:", df.isna().sum().sum())

# ============================
# 7. ONE-HOT ENCODE CATEGORICAL FIELDS
# ============================
df_encoded = pd.get_dummies(df, drop_first=True)

# ============================
# 8. CORRELATION BASED FILTERING
# ============================
correlations = df_encoded.corr()["TARGET"]
low_corr_cols = correlations[abs(correlations) < 0.01].index.tolist()

# DO NOT drop TARGET even if correlation calculation returns it
low_corr_cols = [col for col in low_corr_cols if col != "TARGET"]

df_filtered = df_encoded.drop(columns=low_corr_cols, errors='ignore')

print("Final shape:", df_filtered.shape)

# ============================
# 9. READY FOR MODELING
# ============================
X = df_filtered.drop("TARGET", axis=1)
y = df_filtered["TARGET"]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna(

Remaining NaNs: 0
Final shape: (30000, 244)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score

model1 = LogisticRegression(max_iter=10000)
model2 = svm.SVC(probability=True)
model3 = LinearDiscriminantAnalysis()

model1.fit(X, y)
model2.fit(X, y)
model3.fit(X, y)

pred1 = model1.predict(X)
pred2 = model2.predict(X)
pred3 = model3.predict(X)

print("LR Precision:", precision_score(y, pred1))
print("SVM Precision:", precision_score(y, pred2))
print("LDA Precision:", precision_score(y, pred3))

print("LR Recall:", recall_score(y, pred1))
print("SVM Recall:", recall_score(y, pred2))
print("LDA Recall:", recall_score(y, pred3))



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR Precision: 0.5
SVM Precision: 1.0
LDA Precision: 0.49693251533742333
LR Recall: 0.000407000407000407
SVM Recall: 0.000407000407000407
LDA Recall: 0.03296703296703297


In [5]:
# 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'LIVINGAPARTMENTS_MEDI', NONLIVINGAPARTMENTS_MEDI'
#not sure if we should drop these? possible proxies for age??


In [6]:
# ============================
# 10. ROC-AUC EVALUATION
# ============================
prob_predictions = {
    "Logistic Regression": model1.predict_proba(X)[:, 1],
    "SVM": model2.predict_proba(X)[:, 1],
    "LDA": model3.predict_proba(X)[:, 1],
}

roc_auc_rows = []
for name, probs in prob_predictions.items():
    roc_auc_rows.append({
        "Model": name,
        "ROC-AUC": roc_auc(y, probs),
    })

pd.DataFrame(roc_auc_rows)



Unnamed: 0,Model,ROC-AUC
0,Logistic Regression,0.639729
1,SVM,0.500679
2,LDA,0.757488


In [7]:
# ============================
# 11. ADDITIONAL METRICS (ACCURACY, PRECISION, F1)
# ============================
threshold = 0.5

metric_rows = []
for name, probs in prob_predictions.items():
    preds_binary = (probs >= threshold).astype(int)
    metric_rows.append({
        "Model": name,
        "Accuracy (custom)": accuracy(y, probs, threshold),
        "Precision (custom)": precision(y, probs, threshold),
        "F1 (custom)": f1_score(y, probs, threshold),
        "Precision (sklearn)": precision_score(y, preds_binary),
    })

pd.DataFrame(metric_rows)



Unnamed: 0,Model,Accuracy (custom),Precision (custom),F1 (custom),Precision (sklearn)
0,Logistic Regression,0.9181,0.5,0.000813,0.5
1,SVM,0.918133,1.0,0.000814,1.0
2,LDA,0.918067,0.496933,0.061832,0.496933
