# **FEATURE ENGINEERING**

## **IMPORTS/TRAIN.CSV**

In [None]:
# Import Required Packages
import my_functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from my_functions import parameters_and_importances
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Load the main application table; the dtype-specific frames below are all derived from `train`.
train = pd.read_csv('application_train.csv')

## **INITIAL EDA**

In [None]:

# Class balance: number of rows per TARGET value (0 and 1)
counts = train['TARGET'].value_counts()
fig, ax = plt.subplots(figsize=(3, 5))
ax.bar(x=counts.index, height=counts.values, color=['black', 'orange'])
ax.set_xticks([0, 1])
ax.set(
    xlabel='Target',
    ylabel='Frequency',
    title='Distribution of Target Variable',
)
plt.show()


## **TRAIN.CSV PREPROCESSING**

In [None]:
# Keep the label and applicant id aside, then carve `train` into three
# dtype-specific frames that are preprocessed independently below.
target, id_curr = train['TARGET'], train['SK_ID_CURR']
int_features, float_features, object_features = (
    train.select_dtypes(include=dtype)
    for dtype in ('int64', 'float64', 'object')
)

### **OBJECT FEATURES**

In [None]:
# Categorical columns excluded from modelling (too many NA's)
drops = [
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
    'WEEKDAY_APPR_PROCESS_START',
]
object_features = object_features.drop(columns=drops)

In [None]:
# uses the 'count_na' function from my_functions.py
# NOTE(review): count_na is a project helper that is not visible here; presumably it
# reports the number of missing values per column - confirm against my_functions.py.
from my_functions import count_na
count_na(object_features)
# we will return to this function after preprocessing viable object features

**We will assume that 'NaN' values for 'OCCUPATION_TYPE' and 'NAME_TYPE_SUITE' hold significance in relation to default. The next step is to determine which encoding method will be used on each feature. Some of the features, like 'CODE_GENDER', 'FLAG_OWN_CAR', etc., will simply require an applied map to change non-numeric binary labels to 1's and 0's.**

In [None]:
def weight_of_evidence_encode(df, column_name, label):
    """
    Encode a categorical column with the log-odds of the positive class.

    For each category of ``column_name`` the mean of ``label`` (the share of
    positive rows, ``p``) is converted to ``log((p + eps) / (1 - p + eps))``
    and mapped back onto the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Train frame containing both ``column_name`` and ``label``.
    column_name : str
        Name of the categorical column to encode.
    label : str
        Name of the binary (0/1) target column.

    Returns
    -------
    pandas.Series
        Encoded values aligned with ``df.index``. Rows whose category is NaN
        stay NaN because ``groupby`` skips NaN keys.
    """
    epsilon = 1e-6
    positive_rate = df.groupby(column_name)[label].mean()

    # Smooth numerator and denominator independently. The previous form,
    # `1 - pos + epsilon`, cancelled the epsilon already added to `pos`, so a
    # category with 100% positives divided by zero and was encoded as +inf.
    pos = positive_rate + epsilon
    neg = (1 - positive_rate) + epsilon
    weight_of_evidence = np.log(pos / neg)

    woe_dict = weight_of_evidence.to_dict()
    return df[column_name].map(woe_dict)


In [None]:
# One-Hot encoding was unsuccessful the first time around, however, this
# function will remain in the notebook in case it is needed later
def one_hot_encode(dataframe, column, drop_first=False):
    """
    One hot encode the specified column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to expand into indicator columns.
    drop_first : bool, default False
        Forwarded to ``pandas.get_dummies``. The argument used to be accepted
        but silently ignored, so every existing call received all categories;
        the default is False to keep that behaviour for those calls.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with ``column`` replaced by ``<column>_<value>``
        indicator columns.
    """
    return pd.get_dummies(dataframe, columns=[column], drop_first=drop_first)


#### Occupation Feature

Weight of Evidence Encoding: 
 - Natural log of the ratio of prob (default) : prob (pay back) for each unique value of the column

In [None]:

# The encoder needs the label in the same frame as the categorical column.
object_features['TARGET'] = target

# A missing occupation is treated as its own category. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: the chained
# in-place form acts on a temporary and is a no-op under pandas copy-on-write.
object_features['OCCUPATION_TYPE'] = object_features['OCCUPATION_TYPE'].fillna('not submitted')
object_features['OCCUPATION_TYPE'] = weight_of_evidence_encode(
    object_features, 'OCCUPATION_TYPE', 'TARGET'
)


#### Binary String Features

In [None]:
# fill missing in gender with M
object_features['CODE_GENDER'] = object_features['CODE_GENDER'].fillna('M')

# map ('M' and 'F'), ('Y' and 'N') and the two contract types to 1 and 0.
mapping = {'Y': 1, 'N': 0, 'M': 1, 'F': 0, 'Cash loans': 0, 'Revolving loans': 1}
string_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_CONTRACT_TYPE']

# Column-wise Series.map replaces the deprecated DataFrame.applymap. Labels
# missing from `mapping` become NaN (numeric column) rather than None in an
# object column, so the mode-based fill in the next cell operates on numbers.
object_features[string_cols] = object_features[string_cols].apply(
    lambda col: col.map(mapping)
)

In [None]:
# Any gender value still missing after the mapping gets the most frequent code.
gender_mode = object_features['CODE_GENDER'].mode().iloc[0]
object_features['CODE_GENDER'] = object_features['CODE_GENDER'].fillna(gender_mode)

#### NAME_TYPE_SUITE

In [None]:

# Treat a missing NAME_TYPE_SUITE as its own 'not submitted' category
suite_filled = object_features['NAME_TYPE_SUITE'].fillna('not submitted')
object_features['NAME_TYPE_SUITE'] = suite_filled

# list the distinct categories that now exist in the column
print(suite_filled.unique())


In [None]:
# Cross-tabulate NAME_TYPE_SUITE against TARGET (rows: category, columns: 0 / 1)
counts = (
    object_features
    .groupby(['NAME_TYPE_SUITE', 'TARGET'])
    .size()
    .unstack(fill_value=0)
)
# positives per negative within each category
counts['posneg_ratio'] = counts[1].div(counts[0])
counts

Referring back to the EDA section, the total weight of positive cases in the dataset is 0.08782

In [None]:
# Replace each NAME_TYPE_SUITE category by a hard-coded ratio, then bucket the
# ratios into three ordinal bins (0, 1, 2) and keep only the binned column.
# NOTE(review): the ratios are literals, presumably copied from the table in
# the previous cell - they will not update if the data changes.
suite_mapping = {
    'Children': 0.079643,
    'Family': 0.081018,
    'Group of people': 0.092742,
    'Other_A': 0.096203,
    'Other_B': 0.109023,
    'Spouse, partner': 0.085442,
    'Unaccompanied': 0.089123,
    'not submitted': 0.057283,
}

bins = [0, 0.082, 0.09, 0.11]
labels = [0, 1, 2]

suite_ratio = object_features['NAME_TYPE_SUITE'].map(suite_mapping)
object_features['NAME_TYPE_SUITE_BINNED'] = pd.cut(suite_ratio, bins=bins, labels=labels)
object_features = object_features.drop(columns='NAME_TYPE_SUITE')

#### Organization Type

Once again, given the large amount of unique values in this column, it will be Weight of Evidence encoded, just like occupation type. 

In [None]:
# Inspect the distinct organization types before choosing an encoding
object_features['ORGANIZATION_TYPE'].unique()

In [None]:
# Replace each organization type by its encoded value from the helper above
object_features['ORGANIZATION_TYPE'] = weight_of_evidence_encode(
    df=object_features,
    column_name='ORGANIZATION_TYPE',
    label='TARGET',
)

#### Housing Type

In [None]:
# Inspect the distinct housing types before encoding
object_features['NAME_HOUSING_TYPE'].unique()

In [None]:
# Replace each housing type by its encoded value from the helper above
object_features['NAME_HOUSING_TYPE'] = weight_of_evidence_encode(
    df=object_features,
    column_name='NAME_HOUSING_TYPE',
    label='TARGET',
)

#### Income Type

In [None]:
# Inspect the distinct income types before encoding
object_features['NAME_INCOME_TYPE'].unique()

In [None]:
# Row count per income type; note this rebinds the `counts` name used in earlier cells
counts = object_features['NAME_INCOME_TYPE'].value_counts()
counts

In [None]:
# Replace each income type by its encoded value from the helper above
object_features['NAME_INCOME_TYPE'] = weight_of_evidence_encode(
    df=object_features,
    column_name='NAME_INCOME_TYPE',
    label='TARGET',
)

#### Education

In [None]:
# Inspect the distinct education levels before encoding
object_features['NAME_EDUCATION_TYPE'].unique()

In [None]:
# Integer-encode the education type.
# NOTE(review): LabelEncoder numbers the classes in sorted label order; it does
# NOT encode by frequency. If the intent is for the model to pick up on
# infrequent education levels, a frequency encoding (e.g. value_counts mapped
# back onto the column) would be needed instead - confirm which is wanted.
encoder = LabelEncoder()
object_features['NAME_EDUCATION_TYPE'] = encoder.fit_transform(
    object_features['NAME_EDUCATION_TYPE']
)


#### Fam Status

In [None]:
# Inspect the distinct family statuses before encoding
object_features['NAME_FAMILY_STATUS'].unique()

In [None]:
# Plot TARGET against family status; reads the raw `train` frame, so the labels are still strings
sns.barplot(x='NAME_FAMILY_STATUS', y='TARGET', data=train)


In [None]:
# Replace each family status by its encoded value from the helper above
object_features['NAME_FAMILY_STATUS'] = weight_of_evidence_encode(
    df=object_features,
    column_name='NAME_FAMILY_STATUS',
    label='TARGET',
)



#### review/scaling

In [None]:
# pd.cut produced a categorical column; store the bin labels as plain int64
suite_binned = object_features['NAME_TYPE_SUITE_BINNED']
object_features['NAME_TYPE_SUITE_BINNED'] = suite_binned.astype('int64')



In [None]:
# Standardise the continuous encoded columns; ordinal, binary and integer
# columns are left untouched.
scaler = StandardScaler()
columns_to_scale = [
    'NAME_INCOME_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
]
encoded_block = object_features[columns_to_scale]
object_features[columns_to_scale] = scaler.fit(encoded_block).transform(encoded_block)


### **INT FEATURES**

In [None]:
# The applicant id is an identifier, not a feature; it was saved in `id_curr` earlier.
drops = ['SK_ID_CURR']
int_features = int_features.drop(drops, axis=1)

#### General Approach

This data will be used for much more than selection, removal, or encoding. Numerical features can be transformed and operated on to create more intuitive features for the question we are trying to answer. For example, we will most likely place the sum of all 'FLAG_DOCUMENT' columns into its own column 'MISSING_DOCS' for feature reduction.

#### **Document Columns**
This section will take all 'FLAG_DOCUMENT' columns, sum their total value to create a column of total missing documents in each loan application, and then drop all individual document columns. This is an effort to prevent the classifier from overfitting to the training data - most of these documents are turned in by most people, making the use of each column individually seem redundant.

A simple yet logical implementation of...

In [None]:
# FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21, in numeric order
doc_columns = [f"FLAG_DOCUMENT_{doc_id}" for doc_id in range(2, 22)]

In [None]:

# Mean TARGET among the rows where each document flag equals 1
doc_defaults = [
    int_features.groupby(col)['TARGET'].mean()[1] for col in doc_columns
]

fig, ax = plt.subplots(figsize=(10, 8))
y_pos = range(len(doc_columns))
ax.barh(y_pos, doc_defaults, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(doc_columns)
ax.invert_yaxis()  # first document at the top
ax.set(xlabel='Default Rate', title='Default Rate by Flag Document Column')
plt.show()


In [None]:
# Document flags that are kept as individual features.
# A comma was missing after 'FLAG_DOCUMENT_3', so Python's implicit string
# concatenation produced the bogus name 'FLAG_DOCUMENT_3FLAG_DOCUMENT_21'.
important_doc_cols = [
    'FLAG_DOCUMENT_2',
    'FLAG_DOCUMENT_3',
    'FLAG_DOCUMENT_21',
]

In [None]:
# Drop the document flags judged unimportant: FLAG_DOCUMENT_4 ... FLAG_DOCUMENT_20
# (documents 2, 3 and 21 stay in the frame).
doc_columns = [f"FLAG_DOCUMENT_{doc_id}" for doc_id in range(4, 21)]

int_features = int_features.drop(columns=doc_columns)

#### Children

In [None]:
# Default rate for every child count from 0 up to the observed maximum.
# (matplotlib is already imported in the first cell; the re-import was removed.)
max_children = int(int_features['CNT_CHILDREN'].max())
child_counts = range(max_children + 1)

# Counts that never occur in the data give NaN and therefore an empty bar.
default_rates = [
    int_features.loc[int_features['CNT_CHILDREN'] == n_children, 'TARGET'].mean()
    for n_children in child_counts
]

# Create horizontal bar chart
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(child_counts, default_rates, color='blue')
ax.set_yticks(child_counts)
ax.set_yticklabels(child_counts)
ax.invert_yaxis()
ax.set_xlabel('Default Rate')
ax.set_ylabel('Number of Children')
ax.set_title('Default Rate by Number of Children')
plt.show()


#### Time of day

In [None]:
int_features['HOUR_APPR_PROCESS_START'] = pd.cut(
    int_features['HOUR_APPR_PROCESS_START'], 
    bins=[0, 8, 16, 24],
    labels=['morning', 'day', 'night']
).astype(str)

In [None]:
int_features = one_hot_encode(int_features, 'HOUR_APPR_PROCESS_START')

In [None]:
# drop everything except for morning
drops = [
    'HOUR_APPR_PROCESS_START_nan',
    'HOUR_APPR_PROCESS_START_night',
    'HOUR_APPR_PROCESS_START_day',
]

int_features = int_features.drop(drops, axis=1)

#### Living/Working Region

In [None]:
# Collapse the three region-mismatch flags into one count per application
region_flags = [
    'REG_REGION_NOT_LIVE_REGION',
    'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
]
int_features['REGIONS_NOT_MATCHED'] = int_features[region_flags].sum(axis=1)
int_features = int_features.drop(columns=region_flags)


In [None]:
# Collapse the two city-mismatch flags into one count per application
city_flags = ['REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY']
int_features['CITIES_NOT_MATCHED'] = int_features[city_flags].sum(axis=1)
int_features = int_features.drop(columns=city_flags)


In [None]:
# Inspect the distinct region ratings before one-hot encoding them
int_features['REGION_RATING_CLIENT'].unique()

In [None]:

# Expand the region rating into REGION_RATING_CLIENT_<value> indicator columns
int_features = one_hot_encode(int_features, 'REGION_RATING_CLIENT')


In [None]:
# Expand the city-aware rating into REGION_RATING_CLIENT_W_CITY_<value> indicator columns
int_features = one_hot_encode(int_features, 'REGION_RATING_CLIENT_W_CITY')

In [None]:
# Drop the indicators for ratings 1 and 2 from both rating features
drops = [
    f'{prefix}_{rating}'
    for rating in (1, 2)
    for prefix in ('REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY')
]

int_features = int_features.drop(columns=drops)

In [None]:
# Sanity check: list the integer feature columns that remain after the drops above
int_features.columns

#### Birthday

In [None]:
# DAYS_BIRTH is negated and divided by 365 to obtain an age in whole years
int_features['AGE'] = (-int_features['DAYS_BIRTH'] / 365).round()

# Overlay the age histograms of both classes, defaults first
fig, ax = plt.subplots(figsize=(10, 8))
hist_style = dict(
    x='AGE', ax=ax, binwidth=2, stat='count', edgecolor='black', linewidth=1.0
)
for target_value, colour in ((1, 'red'), (0, 'blue')):
    sns.histplot(
        data=int_features[int_features['TARGET'] == target_value],
        color=colour,
        **hist_style,
    )
ax.set(xlabel='Age', ylabel='Count')
ax.legend(['Default', 'Non-Default'])
plt.title('Default Counts by Age')
plt.show()


In [None]:
# DAYS_BIRTH is a negative day count (AGE above is -DAYS_BIRTH / 365), so a
# younger applicant has a value closer to zero. "Under 40" therefore means
# DAYS_BIRTH > -40 * 365; the previous `<` comparison flagged applicants OVER 40.
int_features['AGE_UNDER_40'] = (int_features['DAYS_BIRTH'] > -40 * 365).astype(int)

In [None]:
# Sanity check: confirm AGE and AGE_UNDER_40 were added
int_features.columns

In [None]:
# Move the two ownership flags from the object frame to the integer frame
ownership_flags = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
for flag in ownership_flags:
    int_features[flag] = object_features[flag]

drops = ownership_flags
object_features = object_features.drop(drops, axis=1)

# Every FLAG_* column now present in int_features (not only the document flags)
flag_cols = int_features.columns[int_features.columns.str.startswith('FLAG_')]

# Default rate among the rows where the flag is set. Selecting with .loc
# yields NaN for a flag that is never 1 instead of raising a KeyError.
flag_defaults = [
    int_features.loc[int_features[col] == 1, 'TARGET'].mean() for col in flag_cols
]

fig, ax = plt.subplots(figsize=(10, 8))
y_pos = range(len(flag_cols))
ax.barh(y_pos, flag_defaults, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(flag_cols)
ax.invert_yaxis()
ax.set_xlabel('Default Rate')
# Title corrected: it was copied from the document-only chart above.
ax.set_title('Default Rate by Flag Column')
plt.show()


In [None]:
# Number of communication channels flagged per application
communication_flags = [
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
]
int_features['COMMUNICATIONS_FLAGS'] = int_features[communication_flags].sum(axis=1)

# Number of owned assets (car, realty) per application
asset_flags = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
int_features['FLAG_ASSETS'] = int_features[asset_flags].sum(axis=1)

# The individual flags are replaced by the two aggregates
int_features = int_features.drop(columns=communication_flags + asset_flags)



### **FLOAT FEATURES**

We will begin by identifying possible imputation strategies for columns with a large number of missing values, or just drop them in general. This is an important first step because the model's predictive power can be disrupted by columns that are not filled with accurate measures of central tendency. 

#### Fill missing values

##### **'AMT_ANNUITY'**

In [None]:

# Remove applications without an annuity amount (few rows, misleading data)
float_features = float_features.dropna(subset=['AMT_ANNUITY'])

##### **'AMT_GOODS_PRICE'**

In [None]:
# Same treatment for the goods price: a loan application is assumed to
# require both the annuity and the goods amount, so incomplete rows are removed.
float_features = float_features.dropna(subset=['AMT_GOODS_PRICE'])


##### **Columns to fill w/ measures of central tendency**

In [None]:

# Columns grouped by the statistic used to fill their missing values
fill_with_mean = ['OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
                  'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE']
fill_with_median = ['CNT_FAM_MEMBERS',
                    'DAYS_LAST_PHONE_CHANGE']
fill_with_mode = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
                  'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
                  'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

# One imputer per strategy (mean_imputer is reused for the EXT_SOURCE columns later)
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own column group and write the filled values back
for imputer, columns in (
    (mean_imputer, fill_with_mean),
    (median_imputer, fill_with_median),
    (mode_imputer, fill_with_mode),
):
    float_features[columns] = imputer.fit_transform(float_features[columns])



##### **EXT_SOURCE Columns**

In [None]:
# A missing external score is filled with the mean of the scores that ARE
# present on the same application (the row mean skips NaN).
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

row_means = float_features[ext_source_cols].mean(axis=1)
for col in ext_source_cols:
    float_features[col] = float_features[col].fillna(row_means)

# Applications with no external score at all are still NaN here; they receive
# the column mean.
float_features[ext_source_cols] = mean_imputer.fit_transform(float_features[ext_source_cols])





##### **Drop columns with too many missing vals**

In [None]:

# Building-description measurements come in three variants (_AVG, _MODE, _MEDI);
# all of them are dropped, together with OWN_CAR_AGE and TOTALAREA_MODE.
building_measures = [
    'APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
    'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
    'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
    'NONLIVINGAREA',
]
drop = (
    ['OWN_CAR_AGE']
    + [
        f'{measure}_{variant}'
        for variant in ('AVG', 'MODE', 'MEDI')
        for measure in building_measures
    ]
    + ['TOTALAREA_MODE']
)

# remove the columns from the float frame
float_features = float_features.drop(columns=drop)


In [None]:
# Recombine the three dtype-specific frames column-wise.
# float_features lost the rows with a missing AMT_ANNUITY / AMT_GOODS_PRICE;
# the default outer join would bring those rows back with NaN in every float
# column, so join on the rows that are present in all three frames.
train = pd.concat([float_features, int_features, object_features], axis=1, join='inner')

In [None]:
# Total of the four social-circle counters, replacing the individual columns
social_circle_cols = [
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
]
train['SOCIAL_CIRCLE_CNT'] = train[social_circle_cols].sum(axis=1)
train = train.drop(columns=social_circle_cols)

# Total number of credit-bureau enquiries across all time windows
# (missing values count as zero), replacing the individual columns
bureau_request_cols = [
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
]
request_total = train[bureau_request_cols[0]].fillna(0)
for col in bureau_request_cols[1:]:
    request_total = request_total + train[col].fillna(0)
train['AMT_REQ_CREDIT_BUREAU_TOTAL'] = request_total
train = train.drop(columns=bureau_request_cols)

# Second name for the same frame; the external tables are merged into it below
train_preprocessed = train


## **PREVIOUS_APPLICATION.CSV**

In [None]:
# Load the previous-application table; it is aggregated per SK_ID_CURR below
prev = pd.read_csv('previous_application.csv')

In [None]:
# interest rate column: total repaid (annuity * number of payments) relative to the credit
prev['INTEREST_RATE'] = (prev['AMT_ANNUITY'] * prev['CNT_PAYMENT'] - prev['AMT_CREDIT']) / prev['AMT_CREDIT']

# credit:payments
prev['CREDIT_PAYMENTS'] = prev['AMT_CREDIT'] / prev['CNT_PAYMENT']

# credit:cost
# NOTE(review): this is a sum, not a ratio like the two features above - confirm intent.
prev['CREDIT_COST'] = prev['AMT_CREDIT'] + prev['AMT_GOODS_PRICE']

# A zero AMT_CREDIT or CNT_PAYMENT makes the ratios +/-inf. An infinite value
# would dominate the max/min/mean aggregates of its applicant and, once merged,
# get that applicant's whole row dropped by the later inf -> NaN -> dropna
# step. Mark such ratios as missing so the aggregations simply skip them.
ratio_cols = ['INTEREST_RATE', 'CREDIT_PAYMENTS']
prev[ratio_cols] = prev[ratio_cols].replace([np.inf, -np.inf], np.nan)

# dictionary of aggregations to be performed
aggregations = {
    'AMT_ANNUITY': ['mean', 'max', 'min'],
    'AMT_APPLICATION': ['mean', 'max', 'min'],
    'AMT_CREDIT': ['mean', 'max', 'min'],
    'AMT_DOWN_PAYMENT': ['mean', 'max', 'min'],
    'AMT_GOODS_PRICE': ['mean', 'max', 'min'],
    'DAYS_DECISION': ['mean', 'max', 'min'],
    'CNT_PAYMENT': ['mean'],
    'INTEREST_RATE': ['max', 'min'],
    'CREDIT_PAYMENTS': ['mean', 'max', 'min'],
    'CREDIT_COST': ['mean', 'max', 'min']
}



In [None]:
# One row per applicant with every requested statistic
prev_agg = prev.groupby('SK_ID_CURR').agg(aggregations)

# Collapse the (feature, statistic) column pairs into '<feature>_<statistic>'
prev_agg.columns = [
    f"{feature}_{stat}".strip() for feature, stat in prev_agg.columns
]

# Remaining gaps are filled with the mean of the respective column
prev_agg = prev_agg.fillna(prev_agg.mean())


In [None]:
# merge aggregated 'prev' data with 'train'
# Restore the join key and the label (aligned on the original row index) first.
train_preprocessed['SK_ID_CURR'] = id_curr
train_preprocessed['TARGET'] = target

# Merge from the frame that was just updated. The previous `train.merge(...)`
# only worked because `train` and `train_preprocessed` were the same object.
train_preprocessed = train_preprocessed.merge(prev_agg, on='SK_ID_CURR', how='left')


Dropping the rows with missing values in the aggregation features is seemingly harmless, given that the distribution of the target variable suppresses negative outcomes in these rows. Therefore, we will drop all rows with na values at this point

In [None]:
# Detect repeated column names and keep only the first occurrence of each
duplicate_mask = train_preprocessed.columns.duplicated()

if not duplicate_mask.any():
    print("No duplicate columns")
else:
    print("Duplicate columns detected. Removing duplicates...")

    # keep the first column carrying each name
    train_preprocessed = train_preprocessed.loc[:, ~duplicate_mask]

    # verify that every column name is now unique
    if train_preprocessed.columns.is_unique:
        print("Duplicate columns removed")
    else:
        print("Error: Duplicate columns still present")


## **POS_CASH_BALANCE.CSV**

In [None]:
# Load the POS/cash balance table and show its columns, dtypes and non-null counts
pos_cash = pd.read_csv('POS_CASH_balance.csv')
pos_cash.info()

In [None]:
# Statistics computed per applicant for each POS/cash column
pos_cash_agg_dict = {
    'MONTHS_BALANCE': ['min', 'max'],
    **{
        column: ['min', 'max', 'mean']
        for column in ('CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE')
    },
    **{column: ['max', 'mean'] for column in ('SK_DPD', 'SK_DPD_DEF')},
}


In [None]:
# groupby SK_ID_CURR and aggregate
pos_cash_grouped = pos_cash.groupby('SK_ID_CURR').agg(
    pos_cash_agg_dict
)

# The aggregation returns two-level (feature, statistic) columns. Flatten them
# before merging: recent pandas refuses to merge frames whose columns have a
# different number of levels. The 'POS_' prefix keeps these columns apart from
# the credit-card aggregates, which also contain SK_DPD / SK_DPD_DEF.
pos_cash_grouped.columns = [
    f"POS_{feature}_{stat}" for feature, stat in pos_cash_grouped.columns
]

# join with train on SK_ID_CURR
train_preprocessed = train_preprocessed.merge(
    pos_cash_grouped, on='SK_ID_CURR', how='left'
)


## **Credit Card**

In [None]:
# Load the credit-card balance table and show its columns, dtypes and non-null counts
credit_card = pd.read_csv('credit_card_balance.csv')
credit_card.info()

In [None]:
# Every credit-card column gets the same five summary statistics
credit_card_stats = ['sum', 'max', 'min', 'mean', 'median']
credit_card_columns = [
    'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL',
    'AMT_PAYMENT_TOTAL_CURRENT',
    'AMT_RECEIVABLE_PRINCIPAL',
    'AMT_RECIVABLE',
    'AMT_TOTAL_RECEIVABLE',
    'CNT_DRAWINGS_CURRENT',
    'SK_DPD',
    'SK_DPD_DEF',
]
agg_dict = {column: list(credit_card_stats) for column in credit_card_columns}


In [None]:
# groupby SK_ID_CURR and aggregate
credit_card_grouped = credit_card.groupby('SK_ID_CURR').agg(
    agg_dict
)

# Flatten the two-level (feature, statistic) columns before merging; recent
# pandas refuses to merge frames with a different number of column levels.
# The 'CC_' prefix marks the origin of the columns and avoids the clash with
# the POS/cash SK_DPD aggregates (the old suffix mislabelled them as '_POS').
credit_card_grouped.columns = [
    f"CC_{feature}_{stat}" for feature, stat in credit_card_grouped.columns
]

# join with train on SK_ID_CURR
train_preprocessed = train_preprocessed.merge(
    credit_card_grouped, on='SK_ID_CURR', how='left'
)


In [None]:
# NOTE(review): count_na is a project helper; presumably it reports missing values per column - confirm.
count_na(credit_card_grouped)

## **BUREAU.CSV**

In [None]:
# Load the credit-bureau table and show its columns, dtypes and non-null counts
bureau = pd.read_csv('bureau.csv')
bureau.info()

In [None]:
# `bureau` was loaded in the previous cell and pandas is imported in the first
# cell; the duplicate import and the second read of bureau.csv were removed.

# Define the aggregations to perform
agg_dict = {
    'DAYS_CREDIT': ['min', 'max', 'mean'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_SUM': ['max', 'mean'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean'],
    'AMT_CREDIT_SUM_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT_UPDATE': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum']
}

# Perform the aggregations: one row per applicant
bureau_agg = bureau.groupby('SK_ID_CURR').agg(agg_dict)
# Flatten (feature, statistic) column pairs into '<feature>_<statistic>'
bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]

# Merge with train_preprocessed; applicants without bureau records get NaN
train_preprocessed = pd.merge(train_preprocessed, bureau_agg, on='SK_ID_CURR', how='left')


In [None]:
# NOTE(review): count_na is a project helper; presumably it reports missing values per column - confirm.
count_na(train_preprocessed)

## **FEATURE ENGINEERING**

### **Further EDA**

In [None]:
# Rebind `train` to the fully merged frame; from here on both names refer to the same object
train = train_preprocessed

In [None]:
# Column labels that are tuples are joined with '_' into a single string;
# labels of any other type are kept unchanged.
new_columns = [
    "_".join(str(part) for part in col) if isinstance(col, tuple) else col
    for col in train.columns
]

train.columns = new_columns

In [None]:
# NOTE(review): count_na is a project helper; presumably it reports missing values per column - confirm.
count_na(train)

In [None]:
# ratios: new column name -> (numerator column, denominator column)
# NOTE(review): 'EMPLOYMENT' is computed from the same pair as
# 'DAYS_EMPLOYED_TO_BIRTH', so the two columns are identical - confirm whether
# both are wanted.
ratio_definitions = {
    'CREDIT_TO_ANNUITY': ('AMT_CREDIT', 'AMT_ANNUITY'),
    'CREDIT_TO_GOODS': ('AMT_CREDIT', 'AMT_GOODS_PRICE'),
    'ANNUITY_TO_INCOME': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    'CREDIT_TO_INCOME': ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    'GOODS_TO_INCOME': ('AMT_GOODS_PRICE', 'AMT_INCOME_TOTAL'),
    'DAYS_EMPLOYED_TO_BIRTH': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    'INCOME_LENGTH': ('AMT_INCOME_TOTAL', 'DAYS_EMPLOYED'),
    'INCOME_PER_PERSON': ('AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
    'INCOME_BY_AGE': ('AMT_INCOME_TOTAL', 'DAYS_BIRTH'),
    'EMPLOYMENT': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    'CREDIT_INCOME': ('AMT_REQ_CREDIT_BUREAU_TOTAL', 'AMT_INCOME_TOTAL'),
}
for ratio_name, (numerator, denominator) in ratio_definitions.items():
    train[ratio_name] = train[numerator] / train[denominator]



### **Interaction Features**

We will attempt adding a few interaction terms to capture more complex relationships. This approach can, however, lead to model overfitting, so we will continue to evaluate the performance of the model with varying inputs/parameters.

In [None]:
# Pairwise products: new column name -> (left factor, right factor)
product_definitions = {
    'INCOME_CREDIT': ('AMT_INCOME_TOTAL', 'AMT_CREDIT'),
    'CREDIT_GOODS': ('AMT_CREDIT', 'AMT_GOODS_PRICE'),
    'EMPLOYED_BIRTH': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    'CHILDREN_INCOME': ('CNT_CHILDREN', 'AMT_INCOME_TOTAL'),
    'CHILDREN_CREDIT': ('CNT_CHILDREN', 'AMT_CREDIT'),
    'CHILDREN_ANNUITY': ('CNT_CHILDREN', 'AMT_ANNUITY'),
    'CREDIT_ANNUITY': ('AMT_CREDIT', 'AMT_ANNUITY'),
}
for product_name, (left, right) in product_definitions.items():
    train[product_name] = train[left] * train[right]

# NOTE(review): the two columns below are ratios, not products. The same
# AMT_ANNUITY / AMT_INCOME_TOTAL ratio is also stored as 'ANNUITY_TO_INCOME'
# in the ratios cell, and 'INCOME_PER_PERSON' is recomputed with the same
# formula used there - confirm whether both are needed.
train['ANNUITY_INCOME'] = train['AMT_ANNUITY'].div(train['AMT_INCOME_TOTAL'])
train['INCOME_PER_PERSON'] = train['AMT_INCOME_TOTAL'].div(train['CNT_FAM_MEMBERS'])

## **MODEL TRAINING/EVAL**

### **SCALE FEATURES**

In [None]:
# Infinite values are converted to NaN so that the row filter below removes them too
train.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Keep only complete rows (modifies the frame in place)
train.dropna(axis=0, inplace=True)

# Report the size of the cleaned dataset
print(f'Number of Rows: {train.shape[0]}')
print(f'Number of Columns: {train.shape[1]}')


In [None]:
# Check for infinite or NaN values
# Xtrain / Xtest are only created further down, so referencing them here
# raised a NameError on a fresh kernel. Validate the cleaned `train` frame
# (the source of both splits) instead.
numeric_values = train.select_dtypes(include='number').to_numpy(dtype=float)
print('Any NaN in train:', bool(train.isna().any().any()))
print('Any inf in train:', bool(np.isinf(numeric_values).any()))


In [None]:
# The label and the applicant id are not model inputs
drops = ['TARGET', 'SK_ID_CURR']
y = preprocessed_target = train['TARGET']
X = train.drop(columns=drops)

# 75 % of the rows for training, the rest held out for evaluation
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=81,
)

In [None]:
# Create the scaler (rebinds `scaler`, which was used earlier for the encoded object features)
scaler = StandardScaler()

In [None]:
# scale to mean 0 std dev 1
# Fit the scaler on the training split only and reuse those statistics for the
# test split. Refitting on Xtest (as before) scales the two splits differently
# and leaks test-set statistics into the evaluation.
# The original row index is kept so the features stay aligned with ytrain / ytest.
Xtrain = pd.DataFrame(
    scaler.fit_transform(Xtrain), columns=X.columns, index=Xtrain.index
)
Xtest = pd.DataFrame(
    scaler.transform(Xtest), columns=X.columns, index=Xtest.index
)

### **PCA and SMOTE**

In [None]:
# Reduce dimensionality with PCA
# random_state makes the projection reproducible: for large inputs PCA may
# select a randomized SVD solver, which otherwise differs from run to run.
pca = PCA(n_components=25, random_state=81)
Xtrain_pca = pca.fit_transform(Xtrain)  # components are learned on the training split only
Xtest_pca = pca.transform(Xtest)


# Perform SMOTE on the positive class
# sampling_strategy=0.25: oversample the minority class to 25 % of the majority size.
# Only the training split is resampled; the test split keeps its real class balance.
smote = SMOTE(random_state=81, sampling_strategy=0.25)
Xtrain_resampled, ytrain_resampled = smote.fit_resample(Xtrain_pca, ytrain)
# will be removed from final version if not applicable.


### **TRAINING**

#### **RandomForest**

##### Randomized Search for Hyperparameters

In [None]:
# use 'parameters_and_importances' function to run RandomizedSearchCV for the RandomForest Classifier
# NOTE(review): the helper lives in my_functions.py and is not visible here; the two
# return values are assumed to be (feature importances, best hyperparameters) - confirm.
feature_importances, optimal_hyperparameters = parameters_and_importances(Xtrain_resampled, ytrain_resampled)
print(optimal_hyperparameters)

##### Train

In [None]:
%%time

rf = RandomForestClassifier(
    n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', max_depth=10, criterion='entropy', class_weight={0:1, 1:9.5}
)
rf.fit(Xtrain_resampled, ytrain_resampled)

# Make predictions on the test data
y_pred = rf.predict(Xtest_pca)
y_pred_proba = rf.predict_proba(Xtest_pca)[:, 1]


##### Evaluation Metrics

In [None]:

# Threshold-based metrics use the hard predictions ...
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(metric_label, metric_fn(ytest, y_pred))
# ... while ROC-AUC is computed from the predicted probabilities
print("ROC-AUC Score:", roc_auc_score(ytest, y_pred_proba))

# Confusion matrix as an annotated heatmap (fmt='g' prints plain counts)
cm = confusion_matrix(ytest, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
plt.show()


#### **XGBoost**

##### Randomized Search for hyperparameters

In [None]:
# Define the hyperparameter grid
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.25, 0.5],
    'scale_pos_weight': [2, 4, 8]
}

xgb_model = XGBClassifier()
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=81,  # reproducible sampling of the 10 candidate settings
)

# The resampled training data is named Xtrain_resampled / ytrain_resampled;
# the previous X_train_resampled / y_train_resampled names were never defined.
random_search.fit(Xtrain_resampled, ytrain_resampled)
best_params = random_search.best_params_
print(best_params)


##### Train

In [None]:
# Hyperparameters for the final XGBoost fit; scale_pos_weight is the knob to
# experiment with for the class imbalance.
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=8,
)

# Train on the PCA-reduced, SMOTE-resampled training data
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(Xtrain_resampled, ytrain_resampled)


##### Evaluation Metrics

In [None]:

# Hard predictions and positive-class probabilities on the held-out split
y_pred = xgb_model.predict(Xtest_pca)
y_proba = xgb_model.predict_proba(Xtest_pca)[:, 1]

# Compute the confusion matrix
cm = confusion_matrix(ytest, y_pred)

# Plot the confusion matrix
# fmt='g' prints the cell counts as plain numbers (as in the RandomForest
# section) instead of the default scientific notation.
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Print the evaluation metrics
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Precision:", precision_score(ytest, y_pred))
print("Recall:", recall_score(ytest, y_pred))
print("F1-Score:", f1_score(ytest, y_pred))
print("ROC-AUC Score:", roc_auc_score(ytest, y_proba))


In [None]:
# ROC curve of the XGBoost model on the held-out split
fpr, tpr, _ = roc_curve(ytest, y_proba)

# `auc` was never imported, so the previous auc(fpr, tpr) call raised a
# NameError. roc_auc_score is already imported and returns the same area
# under the ROC curve for a binary target.
roc_auc = roc_auc_score(ytest, y_proba)

plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance diagonal
plt.fill_between(fpr, tpr, color="orange", alpha=0.5)
plt.xlim([0.0, 1.])
plt.ylim([0.0, 1.])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()
