# Feature Selection Script

Tree-based algorithm suits this project best. Predicting binary outcome based on numerical and categorical features.
Random forest classifier will do a good job of handling both types and capturing complex non-linear relationships between the features and the target.

In [1]:
import pandas as pd
#pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# -- LOAD IN TRAINING DATASET FROM HOME CREDIT GROUP -- #
df_train = pd.read_csv('/Users/sampence/Documents/IU Bloom/E402 - Computational Methods in Macro/Final Project/home-credit-default-risk/application_train.csv')

In [3]:
mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0}
cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
df_train[cols] = df_train[cols].applymap(mapping.get)


# Visualization

In [4]:
# -- Split the features to visualize data -- #
df1 = df_train.iloc[:,:41]
positive_cases = df1['TARGET'].value_counts()[1]
print("Number of Positive Cases: ", positive_cases)


Number of Positive Cases:  24825


In [5]:
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 41 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307507 non-null  float64
 4   FLAG_OWN_CAR                 307511 non-null  int64  
 5   FLAG_OWN_REALTY              307511 non-null  int64  
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307499 non-null  float64
 10  AMT_GOODS_PRICE              307233 non-null  float64
 11  NAME_TYPE_SUITE              306219 non-null  object 
 12  NAME_INCOME_TYPE             307511 non-null  object 
 13 

In [6]:
#corr = df1.corr()

#plt.figure(figsize=(30,30))
#sns.heatmap(corr.round(2), annot=True, vmin=-1.0, cmap='mako')
#plt.show()

#  Preprocessing

In [7]:
def preprocess(df):
    df = df.copy()
    
    # Drop insignificant columns
    df = df.drop('SK_ID_CURR', axis=1)
    df = df.drop('AMT_GOODS_PRICE', axis=1)
    df = df.drop('OWN_CAR_AGE', axis=1)
    df = df.drop('ORGANIZATION_TYPE', axis=1)
    df = df.drop('OCCUPATION_TYPE', axis=1)
    df = df.drop('HOUR_APPR_PROCESS_START', axis=1)
    df = df.drop('REG_REGION_NOT_LIVE_REGION', axis=1)
    df = df.drop('REG_REGION_NOT_WORK_REGION', axis=1)
    df = df.drop('LIVE_REGION_NOT_WORK_REGION', axis=1)
    df = df.drop('REG_CITY_NOT_LIVE_CITY', axis=1)
    df = df.drop('REG_CITY_NOT_WORK_CITY', axis=1)
    df = df.drop('LIVE_CITY_NOT_WORK_CITY', axis=1)
    df = df.drop('WEEKDAY_APPR_PROCESS_START', axis=1)
    df = df.drop('REGION_RATING_CLIENT_W_CITY', axis=1)
    df = df.drop('REGION_RATING_CLIENT', axis=1)
    df = df.drop('DAYS_ID_PUBLISH', axis=1)
    df = df.drop('FLAG_CONT_MOBILE', axis=1)
    
    
    # Create value for na's that may be significant
    df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].fillna('None')
    df['CNT_FAM_MEMBERS'] = df['CNT_FAM_MEMBERS'].fillna('None')
    df['AMT_ANNUITY'] = df['AMT_ANNUITY'].fillna(-99999999)
    
    # Fill random NA values with median of column to minimize effect on distribution
    df['CODE_GENDER'] = df['CODE_GENDER'].fillna(df['CODE_GENDER'].median())
    df['AMT_ANNUITY'] = df['AMT_ANNUITY'].fillna(df['AMT_ANNUITY'].median())
    
    # Split df into X and y
    y = df['TARGET'].copy()
    X = df.drop('TARGET', axis=1).copy()
    
    return X, y

In [8]:
X, y = preprocess(df1)

In [9]:
X

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,...,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS
0,Cash loans,1.0,0,1,0,202500.0,406597.5,24700.5,Unaccompanied,Working,...,0.018801,-9461,-637,-3648.0,1,1,0,1,0,1.0
1,Cash loans,0.0,0,0,0,270000.0,1293502.5,35698.5,Family,State servant,...,0.003541,-16765,-1188,-1186.0,1,1,0,1,0,2.0
2,Revolving loans,1.0,1,1,0,67500.0,135000.0,6750.0,Unaccompanied,Working,...,0.010032,-19046,-225,-4260.0,1,1,1,1,0,1.0
3,Cash loans,0.0,0,1,0,135000.0,312682.5,29686.5,Unaccompanied,Working,...,0.008019,-19005,-3039,-9833.0,1,1,0,0,0,2.0
4,Cash loans,1.0,0,1,0,121500.0,513000.0,21865.5,Unaccompanied,Working,...,0.028663,-19932,-3038,-4311.0,1,1,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,Cash loans,1.0,0,0,0,157500.0,254700.0,27558.0,Unaccompanied,Working,...,0.032561,-9327,-236,-8456.0,1,1,0,0,0,1.0
307507,Cash loans,0.0,0,1,0,72000.0,269550.0,12001.5,Unaccompanied,Pensioner,...,0.025164,-20775,365243,-4388.0,1,0,0,1,0,1.0
307508,Cash loans,0.0,0,1,0,153000.0,677664.0,29979.0,Unaccompanied,Working,...,0.005002,-14966,-7921,-6737.0,1,1,0,0,1,1.0
307509,Cash loans,0.0,0,1,0,171000.0,370107.0,20205.0,Unaccompanied,Commercial associate,...,0.005313,-11961,-4786,-2562.0,1,1,0,0,0,2.0


In [10]:
#pd.set_option('display.max_rows', None)
print(X.isna().sum())

NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
FLAG_OWN_REALTY               0
CNT_CHILDREN                  0
AMT_INCOME_TOTAL              0
AMT_CREDIT                    0
AMT_ANNUITY                   0
NAME_TYPE_SUITE               0
NAME_INCOME_TYPE              0
NAME_EDUCATION_TYPE           0
NAME_FAMILY_STATUS            0
NAME_HOUSING_TYPE             0
REGION_POPULATION_RELATIVE    0
DAYS_BIRTH                    0
DAYS_EMPLOYED                 0
DAYS_REGISTRATION             0
FLAG_MOBIL                    0
FLAG_EMP_PHONE                0
FLAG_WORK_PHONE               0
FLAG_PHONE                    0
FLAG_EMAIL                    0
CNT_FAM_MEMBERS               0
dtype: int64


In [11]:
{column: len(X[column].unique()) for column in X.columns}

{'NAME_CONTRACT_TYPE': 2,
 'CODE_GENDER': 2,
 'FLAG_OWN_CAR': 2,
 'FLAG_OWN_REALTY': 2,
 'CNT_CHILDREN': 15,
 'AMT_INCOME_TOTAL': 2548,
 'AMT_CREDIT': 5603,
 'AMT_ANNUITY': 13673,
 'NAME_TYPE_SUITE': 8,
 'NAME_INCOME_TYPE': 8,
 'NAME_EDUCATION_TYPE': 5,
 'NAME_FAMILY_STATUS': 6,
 'NAME_HOUSING_TYPE': 6,
 'REGION_POPULATION_RELATIVE': 81,
 'DAYS_BIRTH': 17460,
 'DAYS_EMPLOYED': 12574,
 'DAYS_REGISTRATION': 15688,
 'FLAG_MOBIL': 2,
 'FLAG_EMP_PHONE': 2,
 'FLAG_WORK_PHONE': 2,
 'FLAG_PHONE': 2,
 'FLAG_EMAIL': 2,
 'CNT_FAM_MEMBERS': 18}

In [12]:
def onehot_encode(df):
    for col in df.select_dtypes(include='object').columns:
        dummies = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummies], axis=1)
        df.drop(col, axis=1, inplace=True)
    return df

In [13]:
X = onehot_encode(X)

In [14]:
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)
scaled_df = pd.DataFrame(scaled_df, columns=X.columns)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=123)

In [16]:
# ------------------------- #
#    TRAINS ONLY XGBOOST    #
# --------------------------#


# Define the XGBoost model
xgb = XGBClassifier(scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train), random_state=123)

# Fit the model to the training data
xgb.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb.predict(X_test)

# Calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate specificity
specificity = tn / (tn + fp)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics for XGBoost model with scale_pos_weight:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")


Evaluation metrics for XGBoost model with scale_pos_weight:
Accuracy: 0.6764
Precision: 0.1377
Recall: 0.5643
F1 Score: 0.2214
Specificity: 0.6863


In [17]:
# Define the models to use
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=123),
    'Gradient Boosting': GradientBoostingClassifier(random_state=123),
    'Logistic Regression': LogisticRegression(random_state=123),
    'K-Nearest Neighbor': KNeighborsClassifier(),
    'MLP Classifier': MLPClassifier(random_state=123),
    'XGBoost': XGBClassifier(scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train), random_state=123)
}

# Define the resampling techniques to use
resamplers = {
    'Undersampling': RandomUnderSampler(random_state=123)
}

# Loop through the resampling techniques and models
for resampler_name, resampler in resamplers.items():
    # Resample the training data
    X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)

    for model_name, model in models.items():
        # Fit the model to the resampled training data
        model.fit(X_train_resampled, y_train_resampled)

        # Predict on the test data
        y_pred = model.predict(X_test)

        # Calculate the confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Calculate specificity
        specificity = tn / (tn + fp)

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Print the model's evaluation metrics with the sampling technique used
        print(f"Sampling Technique: {resampler_name} - Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Specificity: {specificity:.4f}")
        print()


Sampling Technique: Undersampling - Model: Random Forest
Accuracy: 0.6182
Precision: 0.1198
Recall: 0.5805
F1 Score: 0.1987
Specificity: 0.6215

Sampling Technique: Undersampling - Model: Gradient Boosting
Accuracy: 0.6275
Precision: 0.1295
Recall: 0.6237
F1 Score: 0.2145
Specificity: 0.6279

Sampling Technique: Undersampling - Model: Logistic Regression
Accuracy: 0.5800
Precision: 0.1009
Recall: 0.5250
F1 Score: 0.1693
Specificity: 0.5849

Sampling Technique: Undersampling - Model: K-Nearest Neighbor
Accuracy: 0.5310
Precision: 0.0971
Recall: 0.5725
F1 Score: 0.1660
Specificity: 0.5274

Sampling Technique: Undersampling - Model: MLP Classifier
Accuracy: 0.2717
Precision: 0.0874
Recall: 0.8403
F1 Score: 0.1584
Specificity: 0.2213

Sampling Technique: Undersampling - Model: XGBoost
Accuracy: 0.1414
Precision: 0.0850
Recall: 0.9763
F1 Score: 0.1564
Specificity: 0.0673



# Analysis

In [None]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: ", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
p, n = confusion_matrix(y_test, y_pred)
print(p,n)

In [None]:
def create_prediction_df(y_test, y_pred):
    prediction_df = pd.DataFrame({
        'y_test': y_test,
        'y_pred': y_pred
    })
    prediction_df.reset_index(inplace=True)
    return prediction_df


new_df = create_prediction_df(y_test, y_pred)

new_df.iloc[5000:5100,]

In [None]:
# Assuming you have the y_test and y_pred columns in a DataFrame called new_df
# Create Dataframe of positive results from test set
positive_cases_df = new_df[(new_df['y_test'] == 1)]
print(len(positive_cases_df))
positive_cases_df

## Handling Imbalanced Data

The model still has trouble Identifying positive cases. Therefore, it is imperative that we examine the imbalanced dataset. The next step will be applying different solutions to the imbalance problem, rather than SMOTE, which was used initially.

In [None]:
sns.countplot(x=new_df['y_test'])

In [None]:
sns.countplot(x=y_train_resampled)