In [2]:
import pandas as pd
import numpy as np

application_train_df = pd.read_csv('rory_work/application_train.csv')
bureau_df = pd.read_csv('bureau.csv')
prev_app_df = pd.read_csv('previous_application.csv')

In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer


application_train_df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)


for col in application_train_df.columns:
    application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)

categorical_cols = application_train_df.select_dtypes(include=["object"]).columns

numeric_like_cols = application_train_df.columns.difference(categorical_cols)

application_train_df[numeric_like_cols] = application_train_df[numeric_like_cols].apply(
    lambda col: pd.to_numeric(col, errors="coerce")
)


numeric_cols = application_train_df.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols.drop("TARGET")  


num_imputer = SimpleImputer(strategy="median")
application_train_df[numeric_cols] = num_imputer.fit_transform(application_train_df[numeric_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
application_train_df[categorical_cols] = cat_imputer.fit_transform(application_train_df[categorical_cols])

# Confirm no NaNs
print("Remaining NaNs:", application_train_df.isna().sum().sum())


df_encoded = pd.get_dummies(application_train_df, drop_first=True)


correlations = df_encoded.corr()["TARGET"]
low_corr_cols = correlations[abs(correlations) < 0.01].index.tolist()


low_corr_cols = [col for col in low_corr_cols if col != "TARGET"]

df_filtered = df_encoded.drop(columns=low_corr_cols, errors='ignore')

print("Final shape:", df_filtered.shape)


X = df_filtered.drop("TARGET", axis=1)
y = df_filtered["TARGET"]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_train_df["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  application_train_df[col + "_MISSING"] = application_train_df[col].isna().astype(int)
  applicatio

Remaining NaNs: 0
Final shape: (307511, 219)


In [4]:
# Feature Engineering
application_bureau_df = pd.merge(application_train_df, bureau_df, on = "SK_ID_CURR")
total_overdue = application_bureau_df.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_OVERDUE"].sum()
total_debt = application_bureau_df.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].sum()
times_prolonged = application_bureau_df.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].sum()
days_overdue = application_bureau_df.groupby("SK_ID_CURR")["CREDIT_DAY_OVERDUE"].sum()

In [5]:
application_train_merged_df = application_train_df.merge(total_overdue, on='SK_ID_CURR', how='left')
application_train_merged_df = application_train_merged_df.merge(total_debt, on='SK_ID_CURR', how='left')
application_train_merged_df = application_train_merged_df.merge(times_prolonged, on='SK_ID_CURR', how='left')
application_train_merged_df = application_train_merged_df.merge(days_overdue, on = "SK_ID_CURR", how = 'left')

In [6]:
application_train_merged_df['AMT_CREDIT_SUM_OVERDUE'] = application_train_merged_df['AMT_CREDIT_SUM_OVERDUE'].fillna(0)
application_train_merged_df['AMT_CREDIT_SUM_DEBT'] = application_train_merged_df['AMT_CREDIT_SUM_DEBT'].fillna(0)
application_train_merged_df['CNT_CREDIT_PROLONG'] = application_train_merged_df['CNT_CREDIT_PROLONG'].fillna(0)
application_train_merged_df['CREDIT_DAY_OVERDUE'] = application_train_merged_df['CREDIT_DAY_OVERDUE'].fillna(0)

In [7]:
prev_app_ct = prev_app_df[["SK_ID_CURR"]]
prev_app_ct["PREV_APPS"] = 0
prev_app_ct = prev_app_ct.groupby("SK_ID_CURR").count()
prev_app_ct

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prev_app_ct["PREV_APPS"] = 0


Unnamed: 0_level_0,PREV_APPS
SK_ID_CURR,Unnamed: 1_level_1
100001,1
100002,1
100003,3
100004,1
100005,2
...,...
456251,1
456252,1
456253,2
456254,2


In [8]:
prev_app_approved = prev_app_df[["SK_ID_CURR", "NAME_CONTRACT_STATUS"]]
prev_app_approved["NUM_APPROVED"] = np.where(prev_app_approved["NAME_CONTRACT_STATUS"] == "Approved", 1, 0)
prev_app_approved = prev_app_approved.groupby("SK_ID_CURR").sum().reset_index()
prev_app_approved = prev_app_approved[["SK_ID_CURR", "NUM_APPROVED"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prev_app_approved["NUM_APPROVED"] = np.where(prev_app_approved["NAME_CONTRACT_STATUS"] == "Approved", 1, 0)


In [9]:
prev_app_ct = prev_app_ct.merge(prev_app_approved, on="SK_ID_CURR", how="inner")

In [10]:
application_train_merged_df = application_train_merged_df.merge(prev_app_ct, on="SK_ID_CURR", how="left")

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
cols_to_standardize = ['AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM_DEBT', 'CNT_CREDIT_PROLONG', 'CREDIT_DAY_OVERDUE', 'PREV_APPS', 'NUM_APPROVED']

application_train_merged_df[cols_to_standardize] = scaler.fit_transform(application_train_merged_df[cols_to_standardize])

In [12]:
application_train_merged_df.shape[1]

250

In [14]:
# Variable Selection
pd.set_option('display.max_rows', 1000)  

numeric_df = application_train_merged_df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()
target_column_name = 'TARGET'
target_correlations = correlation_matrix[target_column_name]
print(f"\nCorrelations with '{target_column_name}':")
# print(target_correlations)


sorted_target_correlations = target_correlations.sort_values(ascending=False)
print(f"\nSorted correlations with '{target_column_name}':")
print(sorted_target_correlations)

print(application_train_merged_df['TARGET'])


Correlations with 'TARGET':

Sorted correlations with 'TARGET':
TARGET                                  1.000000
DAYS_BIRTH                              0.078239
DAYS_EMPLOYED                           0.063368
REGION_RATING_CLIENT_W_CITY             0.060893
REGION_RATING_CLIENT                    0.058899
DAYS_LAST_PHONE_CHANGE                  0.055218
DAYS_ID_PUBLISH                         0.051457
REG_CITY_NOT_WORK_CITY                  0.050994
FLAG_EMP_PHONE                          0.045982
REG_CITY_NOT_LIVE_CITY                  0.044395
FLAG_DOCUMENT_3                         0.044346
DAYS_REGISTRATION                       0.041975
EMERGENCYSTATE_MODE_MISSING             0.041392
TOTALAREA_MODE_MISSING                  0.041168
ENTRANCES_AVG_MISSING                   0.040872
ENTRANCES_MODE_MISSING                  0.040872
ENTRANCES_MEDI_MISSING                  0.040872
FLOORSMAX_MEDI_MISSING                  0.040847
FLOORSMAX_MODE_MISSING                  0.040847
FLOO

In [15]:
##Columns with less that 0.01 correlation to target

low_correlation_columns = sorted_target_correlations[abs(sorted_target_correlations) < 0.01].index.tolist()
print(f"\nColumns with less than 0.01 correlation to '{target_column_name}':")
print(low_correlation_columns)

application_train_merged_df = application_train_merged_df.dropna()

df_correlation = application_train_merged_df.drop(columns=low_correlation_columns)

print("Filtered shape:", df_correlation.shape)



Columns with less than 0.01 correlation to 'TARGET':
['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION', 'FLAG_DOCUMENT_2', 'CREDIT_DAY_OVERDUE', 'FLAG_DOCUMENT_21', 'LIVE_REGION_NOT_WORK_REGION', 'AMT_CREDIT_SUM_DEBT', 'CNT_CREDIT_PROLONG', 'AMT_REQ_CREDIT_BUREAU_DAY', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_20', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'FLAG_DOCUMENT_5', 'EXT_SOURCE_2_MISSING', 'DAYS_LAST_PHONE_CHANGE_MISSING', 'AMT_GOODS_PRICE_MISSING', 'FLAG_DOCUMENT_12', 'CNT_FAM_MEMBERS_MISSING', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_7', 'FLAG_EMAIL', 'AMT_ANNUITY_MISSING', 'SK_ID_CURR', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_17', 'AMT_INCOME_TOTAL', 'YEARS_BEGINEXPLUATATION_MODE', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_9', 'YEARS_BEGINEXPLUATATION_AVG', 'NONLIVINGAPARTMENTS_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'AMT_REQ_CREDIT_BURE

Split code:

In [16]:
application_train_merged_df.shape[1]

250