In [168]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
warnings.filterwarnings("ignore")

In [159]:
# NOTE(review): absolute, machine-specific path -- point this at your local copy of the CSV.
TRAINING_DATA_CSV = r'C:\Priyanka\job_application_2024\JPMorganChase\Take Home Project\training_loan_data.csv'

# header=1: column names are read from the second line of the file; the line above it is skipped.
data = pd.read_csv(TRAINING_DATA_CSV, header=1)
data.head()

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,10000001,11983056.0,7550,36 months,16.24%,3 years,RENT,28000.0,,debt_consolidation,...,0.0,17.0,72%,4000.0,,3828.953801,5759.0,1,99,0.0
1,10000002,12002921.0,27050,36 months,10.99%,10+ years,OWN,55000.0,Borrower added on 12/31/13 > Combining high ...,debt_consolidation,...,0.0,8.0,61.20%,35700.0,,34359.94073,114834.0,1,353,0.0
2,10000003,11983096.0,12000,36 months,10.99%,4 years,RENT,60000.0,Borrower added on 12/31/13 > I would like to...,debt_consolidation,...,1.0,3.0,24%,18100.0,,16416.61776,7137.0,1,157,0.0
3,10000004,12003142.0,28000,36 months,7.62%,5 years,MORTGAGE,325000.0,,debt_consolidation,...,1.0,3.0,54.60%,42200.0,,38014.14976,799592.0,1,365,0.0
4,10000005,11993233.0,12000,36 months,13.53%,10+ years,RENT,40000.0,,debt_consolidation,...,0.0,17.0,68.80%,7000.0,53.0,6471.462236,13605.0,1,157,0.0


### Data Preparation
- remove duplicates
- drop rows where the target (`bad_flag`) is missing
- split data into train and test sets

In [160]:
# Clean the raw frame in one chain instead of mutating it in place:
#   1. drop exact duplicate rows (keeping the first occurrence),
#   2. drop rows whose target `bad_flag` is missing,
#   3. reset the index LAST so it is contiguous (0..n-1) after both filters.
# NOTE(review): drop_duplicates compares every column, including `id` and
# `Unnamed: 0`; if those are unique per row, no duplicates will ever be found -- confirm.
data = (
    data
    .drop_duplicates(keep='first')
    .dropna(subset=['bad_flag'])
    .reset_index(drop=True)
)

In [151]:
# Number of rows left after cleaning.
data.shape[0]

188123

In [161]:
# Hold out a test set.
# The target is kept as a one-column DataFrame; every other column is a candidate feature.
Y = data[['bad_flag']]
X = data.drop(columns=['bad_flag'])

# 80/20 split, stratified on the target; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
    stratify=Y,
)


In [162]:
# Data Preprocessing steps 
def feature_engineering(data):
    """Derive model-ready columns from the raw loan fields.

    - ``int_rate``: the ``%`` sign is removed and the value converted to float.
    - ``emp_length_yr``: the first run of digits in ``emp_length``, kept as a
      string (``'< 1 year'`` -> ``'0'``, ``'10+ years'`` -> ``'10'``); missing
      values stay missing.
    - ``emp_length`` is dropped.

    The work is done on a copy, so the caller's frame is left untouched and
    the function can be re-run on the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``int_rate`` (strings such as ``'16.24%'``) and
        ``emp_length`` (strings such as ``'3 years'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.
    """
    data = data.copy()
    # regex=False: '%' is a literal character here, not a pattern.
    data['int_rate'] = data['int_rate'].str.replace('%', '', regex=False).astype(float)
    # '< 1 year' contains the digit 1, so map it to 0 explicitly before extracting digits.
    emp_length = data['emp_length'].replace('< 1 year', '0 year')
    data['emp_length_yr'] = emp_length.str.extract(r'(\d+)', expand=False)
    return data.drop(columns=['emp_length'])


def rare_category_combiner(data):
    """Collapse selected category levels into a single catch-all level.

    - ``home_ownership``: ``'OWN'`` and ``'OTHER'`` both become ``'OTHER'``;
      every other value (including missing) is left as is.
    - ``purpose``: anything other than ``'debt_consolidation'`` or
      ``'credit_card'`` becomes ``'other'``. Missing values also become
      ``'other'``, because ``isin`` is False for them.

    The work is done on a copy, so the caller's frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``home_ownership`` and ``purpose`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns recoded.
    """
    data = data.copy()
    data.loc[data['home_ownership'].isin(['OTHER', 'OWN']), 'home_ownership'] = 'OTHER'
    data.loc[~data['purpose'].isin(['debt_consolidation', 'credit_card']), 'purpose'] = 'other'
    return data


def get_numerical_features():
    """Return the names of the numeric model columns (a fresh list on every call)."""
    numerical_features = [
        'loan_amnt',
        'int_rate',
        'annual_inc',
        'percent_bc_gt_75',
        'dti',
        'inq_last_6mths',
        'mths_since_recent_inq',
        'total_bc_limit',
    ]
    return numerical_features

def get_le_categorical_features():
    """Return the names of the columns in the 'le' categorical group (a fresh list on every call)."""
    le_features = ['emp_length_yr']
    return le_features

def get_ohe_categorical_features():
    """Return the names of the one-hot-encoded categorical columns (a fresh list on every call)."""
    ohe_features = [
        'home_ownership',
        'purpose',
        'term',
    ]
    return ohe_features

def convert_into_category(data):
    """Return a copy of ``data`` with every categorical model column cast to ``category``.

    The columns are those listed by ``get_ohe_categorical_features()`` and
    ``get_le_categorical_features()``; all other columns keep their dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every categorical column; a missing one raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    categorical_columns = get_ohe_categorical_features() + get_le_categorical_features()
    # astype with a mapping builds a new frame, so nothing is assigned into `data`
    # (which may be a column slice of a larger frame).
    return data.astype({column: 'category' for column in categorical_columns})

def preprocessor(data):
    """Turn a raw loan frame into the frame expected by the transformer pipeline.

    Steps, in order: ``feature_engineering``, ``rare_category_combiner``,
    selection of the model columns (numeric, then one-hot, then 'le'
    categorical), and ``convert_into_category``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the columns the helper functions read.

    Returns
    -------
    pandas.DataFrame
        Frame restricted to the model columns, as returned by
        ``convert_into_category``.
    """
    model_columns = (
        get_numerical_features()
        + get_ohe_categorical_features()
        + get_le_categorical_features()
    )
    engineered = rare_category_combiner(feature_engineering(data))
    return convert_into_category(engineered[model_columns])

In [163]:
# Data Transformation steps
class LabelEncodingTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode the ``emp_length_yr`` column with a ``LabelEncoder``.

    ``fit`` learns the labels present in ``emp_length_yr``; ``transform``
    returns a copy of the frame in which that column holds the encoder's
    integer codes.

    NOTE(review): both methods index the input by column name, so the input
    must be a DataFrame -- presumably this cannot directly follow a step that
    outputs a plain array (e.g. ``SimpleImputer`` with default output); confirm
    before wiring it into a pipeline.
    """

    def __init__(self):
        # Underlying encoder; fitted in `fit`, reused in `transform`.
        self.le = LabelEncoder()

    def fit(self, data, y=None):
        """Learn the ``emp_length_yr`` labels from ``data``; ``y`` is ignored."""
        self.le.fit(data['emp_length_yr'])
        return self

    def transform(self, data):
        """Return a copy of ``data`` with ``emp_length_yr`` replaced by its integer codes.

        The input frame is not modified. Error behaviour for labels not seen
        during ``fit`` is whatever ``LabelEncoder.transform`` does.
        """
        encoded = data.copy()
        # Store the encoded values -- previously they were computed and thrown away.
        encoded['emp_length_yr'] = self.le.transform(encoded['emp_length_yr'])
        return encoded


class ModelFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the model's input columns.

    The columns kept are, in order, those from ``get_numerical_features()``,
    ``get_ohe_categorical_features()`` and ``get_le_categorical_features()``.
    """

    def __init__(self):
        # No hyper-parameters and no state.
        pass

    def fit(self, data=None, y=None):
        """Do nothing and return ``self``.

        ``data`` and ``y`` are accepted (and ignored) because scikit-learn's
        ``Pipeline.fit`` and ``TransformerMixin.fit_transform`` pass them;
        both default to ``None`` so a bare ``fit()`` call keeps working.
        """
        return self

    def transform(self, data):
        """Return ``data`` restricted to the model columns (``KeyError`` if one is missing)."""
        return data[get_numerical_features() + get_ohe_categorical_features() + get_le_categorical_features()]
    

# Numeric columns: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# One-hot categorical columns: fill gaps with the most frequent value, then
# one-hot encode, dropping the first level of each feature.
# NOTE(review): with drop='first' and handle_unknown='ignore' an unseen category
# is encoded as all zeros, i.e. the same as the dropped first level -- confirm
# that is acceptable (and that the installed scikit-learn allows the combination).
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# 'le' categorical columns: despite the name, this pipeline is configured exactly
# like `categorical_pipeline` (most-frequent imputation + one-hot encoding);
# LabelEncodingTransformer is not used here.
categorical_label_encoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Route each column group to its pipeline. The order of the entries fixes the
# order of the output columns: numeric, one-hot categorical, 'le' categorical.
transformer_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, get_numerical_features()),
    ('ohe_cat', categorical_pipeline, get_ohe_categorical_features()),
    ('le_cat', categorical_label_encoder_pipeline, get_le_categorical_features()),
])


In [164]:
# Training split: preprocess, then FIT the transformers on it and transform it.
# NOTE: x_train is rebound from the raw DataFrame to the transformed output, so
# this cell needs the split cell to be re-run before it can be run again.
x_train = transformer_pipeline.fit_transform(preprocessor(x_train))

In [165]:
# Test split: preprocess, then transform only -- the transformers are not refitted here.
# NOTE: x_test is rebound from the raw DataFrame to the transformed output, so
# this cell needs the split cell to be re-run before it can be run again.
x_test = transformer_pipeline.transform(preprocessor(x_test))

In [167]:
# Row counts of the transformed train / test sets.
# shape[0] instead of len(): it also works if the ColumnTransformer returns a
# scipy sparse matrix, for which len() raises TypeError.
x_train.shape[0], x_test.shape[0]

(150498, 37625)

### Apply oversampling technique - SMOTE to balance the training set. 
I did not oversample the entire dataset, because I wanted the test set to remain original (non-synthetic) data for model validation.

In [169]:
# Oversample the training set only, targeting every class except the majority.
# random_state pins SMOTE's synthetic-sample generation so the resampled set is
# reproducible across runs (same seed value as the train/test split).
smote = SMOTE(sampling_strategy='not majority', random_state=1)
x_sm, y_sm = smote.fit_resample(x_train, y_train)

In [170]:
# Report the shapes of the resampled training features and target.
x_shape_message = "After Oversampling - SMOTE, the shape of x_train: {}"
y_shape_message = "After Oversampling - SMOTE, the shape of y_train: {} \n"
print(x_shape_message.format(x_sm.shape))
print(y_shape_message.format(y_sm.shape))

After Oversampling, the shape of train_X: (279992, 24)
After Oversampling, the shape of train_y: (279992, 1) 



### Model Training