# Loading the necessary libraries

In [85]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Loading the data

In [86]:
# Read the prepared bank-loan table from disk and preview its first three records.
df = pd.read_csv(filepath_or_buffer="bank_loans_final.csv")
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Loan Status,Term,Current Loan Amount,Years in current job,Home Ownership,Purpose,Annual Income,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,0,Fully Paid,Short Term,445412,8.0,Home Mortgage,Home Improvements,1167493.0,5214.74,17.2,6,1,228190,416746.0,1.0,0.0
1,1,Fully Paid,Short Term,262328,10.0,Home Mortgage,Debt Consolidation,1378277.0,33295.98,21.1,35,0,229976,850784.0,0.0,0.0
2,2,Fully Paid,Short Term,312246,8.0,Own Home,Debt Consolidation,2231892.0,29200.53,14.9,18,1,297996,750090.0,0.0,0.0


# Performing Label encoding

In [87]:
# Label-encode the target ('Loan Status') and the three text features.
# Columns are addressed by name rather than by position, so the step does not
# silently encode the wrong column if the CSV column order changes.
# Assigning through df[column] replaces the column outright, so it takes the
# encoder's integer dtype; a positional df.iloc[:, i] = ... assignment may
# instead write into the existing object-dtype column on recent pandas versions.
# A separate fitted encoder is kept per column so every integer code can be
# mapped back to its original label via label_encoders[column].inverse_transform.
label_encoders = {}
for column in ['Loan Status', 'Term', 'Home Ownership', 'Purpose']:
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    label_encoders[column] = labelencoder

In [88]:
# Preview the first three rows to check the label-encoded columns.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Loan Status,Term,Current Loan Amount,Years in current job,Home Ownership,Purpose,Annual Income,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,0,1,1,445412,8.0,0,4,1167493.0,5214.74,17.2,6,1,228190,416746.0,1.0,0.0
1,1,1,1,262328,10.0,0,3,1378277.0,33295.98,21.1,35,0,229976,850784.0,0.0,0.0
2,2,1,1,312246,8.0,1,3,2231892.0,29200.53,14.9,18,1,297996,750090.0,0.0,0.0


# Performing one-hot encoding for the categorical features

In [89]:
# Pull 'Home Ownership' out as a one-column DataFrame; this frame is what the
# one-hot encoder is fitted on further down.
df_home = df.loc[:, ['Home Ownership']]
df_home.head(n=5)

Unnamed: 0,Home Ownership
0,0
1,0
2,1
3,1
4,2


In [90]:
# Pull 'Purpose' out as a one-column DataFrame; this frame is what the
# one-hot encoder is fitted on further down.
df_purpose = df[['Purpose']]
df_purpose.head(n=5)

Unnamed: 0,Purpose
0,4
1,3
2,3
3,3
4,3


In [91]:
# One-hot encode the home-ownership codes and label the resulting indicator columns.
onehotencoder = OneHotEncoder()
home_matrix = onehotencoder.fit_transform(df_home).toarray()

# Labels are assigned by encoded column position (0, 1, 2).
# NOTE(review): this assumes exactly these three categories, in this order — confirm against the data.
home_labels = ['Home Mortgage', 'Own Home', 'Rent']
df_home = pd.DataFrame(
    {label: home_matrix[:, position] for position, label in enumerate(home_labels)}
)
df_home.head(n=3)


Unnamed: 0,Home Mortgage,Own Home,Rent
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0


In [92]:
# One-hot encode the purpose codes (re-fitting the existing `onehotencoder`) and
# label the resulting indicator columns.
purpose_matrix = onehotencoder.fit_transform(df_purpose).toarray()

# Labels are assigned by encoded column position (0..7).
# NOTE(review): this assumes exactly these eight categories, in this order — confirm against the data.
purpose_labels = [
    'Business Loan',
    'Buy House',
    'Buy a car',
    'Debt Consolidation',
    'Home Improvement',
    'Medical Bills',
    'Other',
    'Take a Trip',
]
df_purpose = pd.DataFrame(
    {label: purpose_matrix[:, position] for position, label in enumerate(purpose_labels)}
)
df_purpose.head(n=5)

Unnamed: 0,Business Loan,Buy House,Buy a car,Debt Consolidation,Home Improvement,Medical Bills,Other,Take a Trip
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [93]:
## select the feature columns that are used as-is (everything except the one-hot encoded ones)
feature_columns = [
    'Term',
    'Years in current job',
    'Current Loan Amount',
    'Annual Income',
    'Monthly Debt',
    'Years of Credit History',
    'Current Credit Balance',
    'Maximum Open Credit',
    'Tax Liens',
    'Bankruptcies',
    'Number of Credit Problems',
    'Number of Open Accounts',
]
X = df[feature_columns]

In [94]:
# Place the selected features and both one-hot blocks side by side
# (column-wise concatenation, rows matched on the index).
df_final = pd.concat(objs=[X, df_home, df_purpose], axis='columns')
df_final.head(n=3)

Unnamed: 0,Term,Years in current job,Current Loan Amount,Annual Income,Monthly Debt,Years of Credit History,Current Credit Balance,Maximum Open Credit,Tax Liens,Bankruptcies,...,Own Home,Rent,Business Loan,Buy House,Buy a car,Debt Consolidation,Home Improvement,Medical Bills,Other,Take a Trip
0,1,8.0,445412,1167493.0,5214.74,17.2,228190,416746.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,10.0,262328,1378277.0,33295.98,21.1,229976,850784.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1,8.0,312246,2231892.0,29200.53,14.9,297996,750090.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Splitting the data into train and test

In [95]:
# Hold out a stratified 10% test set (class proportions of 'Loan Status' are preserved).
# The split is seeded: without random_state every run draws a different
# partition, so the test and training CSVs written below could not be
# regenerated consistently. The seed matches the one used for SMOTE further down.
X_train, X_test, y_train, y_test = train_test_split(
    df_final,
    df['Loan Status'],
    test_size=0.1,
    stratify=df['Loan Status'],
    random_state=42,
)

### Saving the test set to a separate file

In [96]:
# Make sure both halves of the test split are DataFrames before they are joined.
X_test, y_test = pd.DataFrame(X_test), pd.DataFrame(y_test)

In [97]:
# Join the test features and the test target column-wise into a single table.
test = pd.concat(objs=[X_test, y_test], axis='columns')

In [98]:
# Write the held-out test table to disk without the index column.
test.to_csv(path_or_buf="bank_loan_test_data.csv", index=False)

### Upsampling the training data at various upsampling ratios and saving each result to a separate file

In [99]:
## upsampling ratio of 0.8
# Oversample the training split only, with a fixed seed.
# NOTE(review): SMOTE synthesises rows by interpolating feature values, so the
# one-hot / label-encoded columns of synthetic rows may be non-integer — confirm
# this is acceptable for the downstream models.
sm = SMOTE(sampling_strategy=0.8, random_state=42)
resampled_80 = sm.fit_resample(X_train, y_train)
X_train_sm_80, y_train_sm_80 = resampled_80

In [100]:
# Make sure the resampled features and target are DataFrames before they are joined.
X_train_sm_80, y_train_sm_80 = pd.DataFrame(X_train_sm_80), pd.DataFrame(y_train_sm_80)

In [101]:
# Join the resampled features and target column-wise and write them to disk without the index.
train_80 = pd.concat(objs=[X_train_sm_80, y_train_sm_80], axis='columns')
train_80.to_csv(path_or_buf="bank_loan_train_80.csv", index=False)

In [102]:
## upsampling ratio of 0.9
# Oversample the training split only, with a fixed seed.
sm = SMOTE(sampling_strategy=0.9, random_state=42)
resampled_90 = sm.fit_resample(X_train, y_train)
X_train_sm_90, y_train_sm_90 = resampled_90


In [103]:
# Make sure the resampled features and target are DataFrames before they are joined.
X_train_sm_90, y_train_sm_90 = pd.DataFrame(X_train_sm_90), pd.DataFrame(y_train_sm_90)

In [104]:
# Join the resampled features and target column-wise and write them to disk without the index.
train_90 = pd.concat(objs=[X_train_sm_90, y_train_sm_90], axis='columns')
train_90.to_csv(path_or_buf="bank_loan_train_90.csv", index=False)

In [105]:
## upsampling ratio of 1.0
# Oversample the training split only, with a fixed seed.
sm = SMOTE(sampling_strategy=1.0, random_state=42)
resampled_100 = sm.fit_resample(X_train, y_train)
X_train_sm_100, y_train_sm_100 = resampled_100

In [106]:
# Make sure the resampled features and target are DataFrames before they are joined.
X_train_sm_100, y_train_sm_100 = pd.DataFrame(X_train_sm_100), pd.DataFrame(y_train_sm_100)

In [107]:
# Join the resampled features and target column-wise and write them to disk without the index.
train_100 = pd.concat(objs=[X_train_sm_100, y_train_sm_100], axis='columns')
train_100.to_csv(path_or_buf="bank_loan_train_100.csv", index=False)