In [None]:
# Colab-only helper used to attach Google Drive to this runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/drive; the CSV paths read further down start with this prefix.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install featuretools



In [None]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress all warnings (this filter is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

# modeling
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# memory management
import gc



    train_bureau contains the training features built manually using the bureau and bureau_balance data
    train_previous contains the training features built manually using the previous, cash, credit, and installments data

First, we count the features produced by manual feature engineering. Using set operations, we identify the columns found only in the bureau dataframe, those found only in the previous dataframe, and those found in both. Columns present in both dataframes are the original features from the application dataframe. The code is executed on a limited subset of the data to avoid overloading the kernel. The same code has also been executed on the complete dataset, and some of those results are provided for examination.


In [None]:
# Read in data
# Single place to change if the preprocessed CSV files are moved.
PREPROCESSING_DIR = '/content/drive/MyDrive/Thesis/Home Credit Section/Preprocessing/'

# Manually engineered features derived from the bureau and bureau_balance data
train_bureau = pd.read_csv(PREPROCESSING_DIR + 'train_bureau_raw.csv')
test_bureau = pd.read_csv(PREPROCESSING_DIR + 'test_bureau_raw.csv')

# Manually engineered features derived from the previous, cash, credit, and installments data
train_previous = pd.read_csv(PREPROCESSING_DIR + 'train_previous_raw.csv')
test_previous = pd.read_csv(PREPROCESSING_DIR + 'test_previous_raw.csv')

# All columns in dataframes (kept as plain lists, in file order)
bureau_columns = list(train_bureau.columns)
previous_columns = list(train_previous.columns)

In [None]:
# Sets of each frame's column names, used only for fast membership tests.
bureau_column_set = set(bureau_columns)
previous_column_set = set(previous_columns)

# The feature lists are built by walking the original column lists instead of
# converting a set to a list: the iteration order of a set of strings can change
# between kernel sessions (string hash randomization), which would make the
# order of these lists -- and of any frame built from them -- differ from run
# to run. Walking the column lists keeps the order of the CSV files.

# Bureau only features
bureau_features = [col for col in bureau_columns if col not in previous_column_set]

# Previous only features
previous_features = [col for col in previous_columns if col not in bureau_column_set]

# Original features will be in both datasets
original_features = [col for col in previous_columns if col in bureau_column_set]

print('There are %d original features.' % len(original_features))
print('There are %d bureau and bureau balance features.' % len(bureau_features))
print('There are %d previous Home Credit loan features.' % len(previous_features))

There are 122 original features.
There are 211 bureau and bureau balance features.
There are 1003 previous Home Credit loan features.


In [None]:
# The join key must be among the columns selected from the "previous" frames.
# The membership check keeps this cell safe to re-run: an unconditional append
# would add the key a second time and select a duplicated SK_ID_CURR column.
if 'SK_ID_CURR' not in previous_features:
    previous_features.append('SK_ID_CURR')

# Merge the dataframes avoiding duplicating columns by subsetting train_previous
train = train_bureau.merge(train_previous[previous_features], on = 'SK_ID_CURR')
test = test_bureau.merge(test_previous[previous_features], on = 'SK_ID_CURR')

# Labels and ids are read from the merged frames so they stay row-aligned with
# `train` / `test` even if the (inner) merge drops rows without a match.
train_labels = train['TARGET']
train_ids = train['SK_ID_CURR']
test_ids = test['SK_ID_CURR']

# Stack train (without its label) on top of test into one frame.
application_train = train.drop(columns=['TARGET'])
merged = pd.concat([application_train, test], ignore_index=True, sort=False)

# Client id / label pairs for the training rows.
target = train[['SK_ID_CURR', 'TARGET']].copy()
# `merged_2` previously referred to the same object as `target`; the alias is kept.
merged_2 = target

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)
# This line reports the combined train + test frame, so it is labelled as such.
print('Merged shape: ', merged.shape)

Training shape:  (307511, 1336)
Testing shape:  (48744, 1335)
Testing shape:  (356255, 1335)


In [None]:
# Names of the two credit-card aggregate columns used throughout this cell.
balance_col = 'client_credit_AMT_BALANCE_mean_mean'
drawings_col = 'client_credit_AMT_DRAWINGS_CURRENT_mean_mean'

# Fill gaps in both columns with pandas' default interpolation, writing the
# result back into `merged`.
# NOTE(review): interpolation runs along row order, i.e. across different
# clients -- confirm this is the intended imputation.
for col in (balance_col, drawings_col):
    merged[col] = merged[col].interpolate()

# Keep the id and the two filled columns, then add both rates relative to the
# credit amount. AMT_CREDIT itself is not carried into the result.
merged_2 = merged[['SK_ID_CURR', balance_col, drawings_col]].copy()
merged_2['Drawdown Rate'] = merged[drawings_col] / merged['AMT_CREDIT']
merged_2['Utilization Rate'] = merged[balance_col] / merged['AMT_CREDIT']

print('Predictor Variables: ', merged_2.shape)
path='/content/drive/MyDrive/Thesis/Home Credit Section/Final [Small]/'
# Export is currently disabled:
#merged_2.to_csv(path+'modified_merged.csv', index = False)

Predictor Variables:  (356255, 5)
