In [38]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# ============================
# 1. LOAD DATA
# ============================
df = pd.read_csv("application_train.csv")

# ============================
# 2. FIX SPECIAL VALUES
# ============================

# DAYS_EMPLOYED = 365243 means “no employment record”
df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)

# MAKE MISSINGNESS INDICATORS (best-practice for this dataset)
for col in df.columns:
    df[col + "_MISSING"] = df[col].isna().astype(int)

# ============================
# 3. IDENTIFY CATEGORICAL COLS
# ============================
categorical_cols = df.select_dtypes(include=["object"]).columns

# ============================
# 4. COERCE NUMERIC-LIKE COLS
# ============================
numeric_like_cols = df.columns.difference(categorical_cols)

df[numeric_like_cols] = df[numeric_like_cols].apply(
    lambda col: pd.to_numeric(col, errors="coerce")
)

# ============================
# 5. SET UP NUMERIC & CATEGORICAL COL LISTS
# ============================
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols.drop("TARGET")  # do NOT impute target

# ============================
# 6. IMPUTE NUMERICS (median) & CATEGORICALS (mode)
# ============================
num_imputer = SimpleImputer(strategy="median")
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Confirm no NaNs
print("Remaining NaNs:", df.isna().sum().sum())

# ============================
# 7. ONE-HOT ENCODE CATEGORICAL FIELDS
# ============================
df_encoded = pd.get_dummies(df, drop_first=True)

# ============================
# 8. CORRELATION BASED FILTERING
# ============================
correlations = df_encoded.corr()["TARGET"]
low_corr_cols = correlations[abs(correlations) < 0.01].index.tolist()

# DO NOT drop TARGET even if correlation calculation returns it
low_corr_cols = [col for col in low_corr_cols if col != "TARGET"]

df_filtered = df_encoded.drop(columns=low_corr_cols, errors='ignore')

print("Final shape:", df_filtered.shape)

# ============================
# 9. READY FOR MODELING
# ============================
X = df_filtered.drop("TARGET", axis=1)
y = df_filtered["TARGET"]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna().astype(int)
  df[col + "_MISSING"] = df[col].isna(

Remaining NaNs: 0
Final shape: (307511, 219)


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score

model1 = LogisticRegression(max_iter=10000)
model2 = svm.SVC()
model3 = LinearDiscriminantAnalysis()

model1.fit(X, y)
model2.fit(X, y)
model3.fit(X, y)

pred1 = model1.predict(X)
pred2 = model2.predict(X)
pred3 = model3.predict(X)

print("LR Precision:", precision_score(y, pred1))
print("SVM Precision:", precision_score(y, pred2))
print("LDA Precision:", precision_score(y, pred3))

print("LR Recall:", recall_score(y, pred1))
print("SVM Recall:", recall_score(y, pred2))
print("LDA Recall:", recall_score(y, pred3))



KeyboardInterrupt: 

In [None]:
# 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'LIVINGAPARTMENTS_MEDI', NONLIVINGAPARTMENTS_MEDI'
#not sure if we should drop these? possible proxies for age??
