In [40]:
import os 
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
from collections import defaultdict
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import roc_auc_score, mean_squared_error

In [41]:
import os
# Office
os.chdir('/home/exe002/Desktop/Prudhviraju/Code/LoanPrediction')
data = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')

## Listing the columns persent in the dataset

In [42]:
data=data.reset_index(drop=True)

### Handling Null Values

In [43]:
data.loc[data['Gender'].isnull(),'Gender']='Trasgender'
data.loc[data['Dependents'].isnull(),'Dependents']='0'
data.loc[data['Education'].isnull(),'Education']='No Education'
data.loc[data['Married'].isnull(),'Married']='No'
data.loc[data['Self_Employed'].isnull(),'Self_Employed']='No'
data=data.drop(index=data.loc[data['LoanAmount'].isnull(),:].index)
data.loc[data['Loan_Amount_Term'].isnull(),'Loan_Amount_Term']=0
data.loc[data['Credit_History'].isnull(),'Credit_History']=0

In [44]:
Numeric_columns=data.select_dtypes(include=['int64','float64'])
categorical_columns=data.select_dtypes(include=['object'])
print(Numeric_columns.columns)
print(categorical_columns.columns)

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')


## Constant Values in Numeric Columns

In [45]:
from sklearn.feature_selection import VarianceThreshold

#Using a Built In Function
sel = VarianceThreshold(threshold=0)
sel.fit(Numeric_columns)  # fit finds the features with zero variance
print('Total # of non constant features',sum(sel.get_support()))
sel.transform(Numeric_columns)
Nonconstant_Numeric=Numeric_columns.loc[:,list(sel.get_support())]

Total # of non constant features 5


## Constant Values in Categorical Columns¶

In [46]:
unique_columns=[]
for i in categorical_columns.columns:
    if len(categorical_columns[i].unique())==1:
        unique_columns.append(i)
print(unique_columns)

[]


## Merging the two dataframes

In [47]:
new_df=pd.concat([categorical_columns,Nonconstant_Numeric],axis=1)
new_df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
Property_Area        0
Loan_Status          0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
dtype: int64

## Removing the unique key columns

In [48]:
unique_columns=[i for i in new_df.columns if len(new_df[i].unique())==new_df.shape[0]]
print(unique_columns)

['Loan_ID']


## Dropping the unique key columns

In [49]:
new_df=new_df.drop(unique_columns,axis=1)

## Type conversion

In [50]:
new_df['Credit_History']=new_df['Credit_History'].astype(str)

## Univariate Analysis

In [51]:
IQR=new_df['ApplicantIncome'].quantile(0.75)-new_df['ApplicantIncome'].quantile(0.25)
upper_fence=new_df['ApplicantIncome'].quantile(0.75)+(IQR*3)
lower_fence=new_df['ApplicantIncome'].quantile(0.75)-(IQR*3)
print(IQR,'  ', upper_fence,'  ',lower_fence)
new_df=new_df.loc[(new_df['ApplicantIncome']<=upper_fence) & (new_df['ApplicantIncome']>=lower_fence), :]

2867.25    14356.25    -2847.25


In [52]:
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
encoding_minmax=defaultdict(MinMaxScaler)
features_num=['ApplicantIncome']
for feat in features_num:
    encoded_df=pd.DataFrame()
    #d=pd.DataFrame(encoding_minmax(feat).fit_transform(new_df[feat]))
    d=pd.DataFrame(encoding_minmax[feat].fit_transform(new_df[[feat]]))
    test_column=d.columns.values
    list_column=[feat+'_'+'minmax' for j in test_column]
    d.columns=list_column
    encoded_df=pd.concat([encoded_df,d],axis=1).reset_index(drop=True)
    new_df.drop(feat,axis=1,inplace=True)
    new_df=new_df.reset_index(drop=True)
    new_df=pd.concat([new_df,encoded_df],axis=1)
import pickle
with open('minmax_pickle.pkl','wb') as f:
    pickle.dump(encoding_minmax,f)

In [53]:
encoding_minmax

defaultdict(sklearn.preprocessing.data.MinMaxScaler,
            {'ApplicantIncome': MinMaxScaler(copy=True, feature_range=(0, 1))})

In [54]:
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
encoding_standardscaler=defaultdict(StandardScaler)
features_num=['LoanAmount']
for feat in features_num:
    encoded_df=pd.DataFrame()
    #d=pd.DataFrame(encoding_minmax(feat).fit_transform(new_df[feat]))
    d=pd.DataFrame(encoding_standardscaler[feat].fit_transform(new_df[[feat]]))
    test_column=d.columns.values
    list_column=[feat+'_'+'minmax' for j in test_column]
    d.columns=list_column
    encoded_df=pd.concat([encoded_df,d],axis=1).reset_index(drop=True)
    new_df.drop(feat,axis=1,inplace=True)
    new_df=new_df.reset_index(drop=True)
    new_df=pd.concat([new_df,encoded_df],axis=1)

with open('standardscaler_pickle.pkl','wb') as f:
    pickle.dump(encoding_standardscaler,f)

## One Hot Encoding

In [55]:
from sklearn.preprocessing import LabelBinarizer
from collections import defaultdict
import pandas as pd
encoding_binarizer=defaultdict(LabelBinarizer)
col_nominal=['Self_Employed','Married','Gender']
for feat in col_nominal:
    encoded_df=pd.DataFrame()
    d=pd.DataFrame(encoding_binarizer[feat].fit_transform(new_df[feat]))
    test_column=d.columns.values
    list_columns=[feat+'_'+str(j) for j in test_column]
    #print(list_columns)
    d.columns=list_columns
    #print(d.head())
    encoded_df=pd.concat([encoded_df,d],axis=1).reset_index(drop=True)
    new_df.drop(feat,axis=1,inplace=True)
    new_df=new_df.reset_index(drop=True)
    new_df=pd.concat([new_df,encoded_df],axis=1)

with open('binarizer_pickle.pkl','wb') as f:
    pickle.dump(encoding_binarizer,f)

In [56]:
encoding_binarizer

defaultdict(sklearn.preprocessing.label.LabelBinarizer,
            {'Self_Employed': LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
             'Married': LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False),
             'Gender': LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)})

 ### Label Encoding

In [57]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
encoding_labelencoder=defaultdict(LabelEncoder)
col_ordinal=['Education','Property_Area','Loan_Status','Dependents','Credit_History']
for feat in col_ordinal:
    encoded_df=pd.DataFrame()
    d=pd.DataFrame(encoding_labelencoder[feat].fit_transform(new_df[feat]))
    test_column=d.columns.values
    list_columns=[feat+'_'+str(j) for j in test_column]
    d.columns=list_columns
    encoded_df=pd.concat([encoded_df,d],axis=1).reset_index(drop=True)
    new_df.drop(feat,axis=1,inplace=True)
    new_df=new_df.reset_index(drop=True)
    new_df=pd.concat([new_df,encoded_df],axis=1)
#with open('C:\\Users\\prudi\\Desktop\\Data Sets\\Loan Prediction\\labelencoder_pickle.pkl','wb') as f:
#    pickle.dump(encoding_labelencoder,f)
with open('labelencoder_pickle.pkl','wb') as f:
    pickle.dump(encoding_labelencoder,f)

In [58]:
new_df.columns

Index(['CoapplicantIncome', 'Loan_Amount_Term', 'ApplicantIncome_minmax',
       'LoanAmount_minmax', 'Self_Employed_0', 'Married_0', 'Gender_0',
       'Gender_1', 'Gender_2', 'Education_0', 'Property_Area_0',
       'Loan_Status_0', 'Dependents_0', 'Credit_History_0'],
      dtype='object')