In [2]:
# Load the training sample: loan-screening data for a classification task.
import pandas as pd

# The categorical columns are read as plain Python objects (strings / NaN);
# the remaining columns keep whatever dtype pandas infers.
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0,
                 dtype={'Dependents':object,
                        'Gender':object,
                        'Married':object,
                        'Education':object,
                        'Self_Employed':object,
                        'Property_Area':object})
# Every column but the last is a candidate feature; the first one is the row ID
# and the last one is the target.
X = df.iloc[:,:-1]
ID = X.iloc[:,[0]]
X = X.drop('Loan_ID',axis=1)
y = df.iloc[:,[-1]]

print('----------------------------------------------------------------------------------------')
print('Raw shape: (%i,%i)' %df.shape)
print('ID shape: (%i,%i)' %ID.shape)
print('X shape: (%i,%i)' %X.shape)
print('y shape: (%i,%i)' %y.shape)
print('----------------------------------------------------------------------------------------')
print(X.dtypes)
print('----------------------------------------------------------------------------------------')
# y is a one-column DataFrame, so isnull().sum() is a Series; reduce it to a
# real scalar instead of relying on implicit Series -> int conversion in '%i'.
print('Check the null count of the target variable: %i' % y.isnull().sum().sum())

# Encode the target as integers: 'N' -> 1, 'Y' -> 0.
class_mapping = {'N':1, 'Y':0}
# assign() returns a new frame, leaving y untouched and giving the mapped
# column an integer dtype regardless of how .loc assignment treats dtypes.
y_new = y.assign(Loan_Status=y['Loan_Status'].map(class_mapping))
print(y_new.groupby(['Loan_Status']).size())


----------------------------------------------------------------------------------------
Raw shape: (614,13)
ID shape: (614,1)
X shape: (614,11)
y shape: (614,1)
----------------------------------------------------------------------------------------
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object
----------------------------------------------------------------------------------------
Check the null count of the target variable: 0
Loan_Status
0    422
1    192
dtype: int64


In [3]:
# Categorical columns to expand into indicator (one-hot) columns.
ohe_columns = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]
# dummy_na=True adds a '<column>_nan' indicator for missing values as well.
X_ohe = pd.get_dummies(data=X, columns=ohe_columns, dummy_na=True)
print('X_ohe shape:(%i,%i)' % X_ohe.shape)
X_ohe.head()

X_ohe shape:(614,26)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,0,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0


In [4]:
# Replace missing values (NaN) with the per-column mean.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; use `SimpleImputer` when available and fall back to the
# legacy class only on old installations.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    # missing_values defaults to NaN; imputation is always column-wise.
    imp = SimpleImputer(strategy='mean')
imp.fit(X_ohe)

# Apply the fitted imputer to X_ohe; transform() returns a bare array, so the
# column names are captured first and re-attached afterwards.
X_ohe_columns = X_ohe.columns.values
X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)

# Show the result
X_ohe.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier

# Recursive feature elimination driven by a gradient-boosting model:
# keep 10 features, removing a 5% fraction of the features per iteration.
selector = RFE(GradientBoostingClassifier(random_state=1), n_features_to_select=10, step=.05)
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new versions. ravel() flattens the (n, 1) target to 1-D.
selector.fit(X_ohe, y.values.ravel())

# selector.support_ is a boolean mask over the columns the selector was fitted on.
X_fin = X_ohe.loc[:, X_ohe_columns[selector.support_]]
print('X_fin shape:(%i,%i)' %X_fin.shape)
X_fin.head()

X_fin shape:(614,10)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Married_No,Married_nan,Self_Employed_nan,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,0.0,1.0,0.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0


In [6]:
import pandas as pd
# Load the scoring sample: loan-screening records to be classified.
# The categorical columns are read as plain Python objects.
categorical_dtypes = dict.fromkeys(
    ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'],
    object,
)
df_s = pd.read_csv('./data/av_loan_test_Y3wMUE5_7gLdaTN.csv', header=0,
                   dtype=categorical_dtypes)
# First column is the row ID; everything except 'Loan_ID' is a feature.
ID_s = df_s.iloc[:,[0]]
X_s = df_s.drop(columns='Loan_ID')

print('----------------------------------------------------------------------------------------')
print('Raw shape: (%i,%i)' %df_s.shape)
print('ID shape: (%i,%i)' %ID_s.shape)
print('X shape: (%i,%i)' %X_s.shape)
print(X_s.dtypes)
# Preview the IDs side by side with the features.
ID_s.join(X_s).head()

----------------------------------------------------------------------------------------
Raw shape: (333,12)
ID shape: (333,1)
X shape: (333,11)
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [7]:
# Same categorical columns as for the training data, expanded the same way.
ohe_columns = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]
# dummy_na=True adds a '<column>_nan' indicator for missing values as well.
X_ohe_s = pd.get_dummies(data=X_s, columns=ohe_columns, dummy_na=True)
print('X_ohe_s shape:(%i,%i)' %X_ohe_s.shape)
X_ohe_s.head()

X_ohe_s shape:(333,26)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_nan,Gender_Female,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [8]:
# Compare the one-hot column sets of the modelling data and the scoring data.
cols_model = set(X_ohe.columns.values)
cols_score = set(X_ohe_s.columns.values)

# Columns present only in the modelling (training) data.
diff1 = cols_model.difference(cols_score)
print('モデルのみに存在する項目:%s' %diff1)

# Columns present only in the scoring data.
diff2 = cols_score.difference(cols_model)
print('スコアのみに存在する項目:%s' %diff2)

モデルのみに存在する項目:{'Dependents_3+'}
スコアのみに存在する項目:{'Gender_Unknown'}


In [9]:
# Toy example: stacking two single-row frames that share the same columns.
toy_columns = ['c1','c2','c3']
df1 = pd.DataFrame([[1,2,3]], columns=toy_columns)
df2 = pd.DataFrame([[3,2,1]], columns=toy_columns)
# Rows are appended; each frame's original index label (0) is kept.
df_all = pd.concat([df1, df2], axis=0)
df_all

Unnamed: 0,c1,c2,c3
0,1,2,3
0,3,2,1


In [10]:
# Toy example: concatenating frames whose columns differ yields the union of
# the columns, with NaN wherever a frame lacks a column.
df3 = pd.DataFrame([[0,1,2,3]], columns=['c0', 'c1', 'c3', 'c4'])
# Rebuild from the source frames instead of appending to df_all itself, so
# re-running this cell does not keep adding another copy of df3's row.
df_all = pd.concat([df1, df2, df3])
df_all

Unnamed: 0,c0,c1,c2,c3,c4
0,,1,2.0,3,
0,,3,2.0,1,
0,0.0,1,,2,3.0


In [11]:
# Column names of the one-hot-encoded training features, as captured when the
# imputer was fitted; this is the layout the scoring data must be aligned to.
print(X_ohe_columns)

['ApplicantIncome' 'CoapplicantIncome' 'LoanAmount' 'Loan_Amount_Term'
 'Credit_History' 'Dependents_0' 'Dependents_1' 'Dependents_2'
 'Dependents_3+' 'Dependents_nan' 'Gender_Female' 'Gender_Male'
 'Gender_nan' 'Married_No' 'Married_Yes' 'Married_nan'
 'Education_Graduate' 'Education_Not Graduate' 'Education_nan'
 'Self_Employed_No' 'Self_Employed_Yes' 'Self_Employed_nan'
 'Property_Area_Rural' 'Property_Area_Semiurban' 'Property_Area_Urban'
 'Property_Area_nan']


In [12]:
# Empty float-typed frame that carries only the training-time column layout.
df_cols_m = pd.DataFrame(columns=X_ohe_columns, dtype=float)
df_cols_m

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan


In [13]:
# Stack the scoring features under the empty template: the result holds the
# union of both column sets, with NaN in columns the scoring data lacks.
frames_to_stack = [df_cols_m, X_ohe_s]
X_ohe_s2 = pd.concat(frames_to_stack, axis=0)
print(X_ohe_s2.shape)
X_ohe_s2.head()

(333, 27)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,...,Married_No,Married_Yes,Married_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan
0,5720.0,0.0,1.0,1.0,0.0,0.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3076.0,1500.0,1.0,0.0,1.0,0.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5000.0,1800.0,1.0,0.0,0.0,1.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,2340.0,2546.0,,0.0,0.0,1.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3276.0,0.0,1.0,1.0,0.0,0.0,,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [14]:
# Remove the columns that exist only in the scoring data (not in the training layout).
score_only_cols = list(set(X_ohe_s.columns.values).difference(X_ohe.columns.values))
X_ohe_s2 = X_ohe_s2.drop(columns=score_only_cols)
print(X_ohe_s2.shape)
X_ohe_s2.head()

(333, 26)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,...,Married_No,Married_Yes,Married_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan
0,5720.0,0.0,1.0,1.0,0.0,0.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3076.0,1500.0,1.0,0.0,1.0,0.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5000.0,1800.0,1.0,0.0,0.0,1.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,2340.0,2546.0,,0.0,0.0,1.0,,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3276.0,0.0,1.0,1.0,0.0,0.0,,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [15]:
# Columns present in the training layout but absent from the scoring data;
# their NaN entries are replaced with 0.
model_only_cols = list(set(X_ohe.columns.values)- set(X_ohe_s.columns.values))
zero_filled = X_ohe_s2.loc[:, model_only_cols].fillna(0, axis=1)
X_ohe_s2.loc[:, model_only_cols] = zero_filled
X_ohe_s2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,...,Married_No,Married_Yes,Married_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan
0,5720.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3076.0,1500.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5000.0,1800.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,2340.0,2546.0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3276.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [16]:
# Toy example: reindex along the column axis reorders the columns.
test = pd.DataFrame({'c1': [1], 'c2': [2], 'c3': [3]})
test = test.reindex(columns=['c2','c3','c1'])
test

Unnamed: 0,c2,c3,c1
0,2,3,1


In [17]:
# Put the scoring columns in exactly the same order as the training features.
training_column_order = X_ohe.columns.values
X_ohe_s2 = X_ohe_s2.reindex(columns=training_column_order)
X_ohe_s2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2340.0,2546.0,100.0,360.0,,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Fill the scoring data's missing values with the imputer fitted on the
# training data (no re-fitting here), then re-attach the column names.
imputed_scoring = imp.transform(X_ohe_s2)
X_ohe_s3 = pd.DataFrame(data=imputed_scoring, columns=X_ohe_columns)
X_ohe_s3.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2340.0,2546.0,100.0,360.0,0.842199,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Keep only the columns the fitted RFE selector marked as selected.
selected_columns = X_ohe_columns[selector.support_]
X_fin_s = X_ohe_s3.loc[:, selected_columns]
print(X_fin_s.shape)
X_fin_s.head()

(333, 10)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Married_No,Married_nan,Self_Employed_nan,Property_Area_Semiurban
0,5720.0,0.0,110.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.842199,0.0,0.0,0.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0


ここまでが、データの前処理。
ここからは、このデータを使って予測をしてみる。

In [20]:
# Sanity check before modelling: feature shapes for both data sets, plus the
# first rows of the encoded (y_new) and the raw (y) target.
print('----------------------------------------------------------------------------------------')
print('X_fin shape:(%i,%i)' % tuple(X_fin.shape))
print('X_fin_s shape:(%i,%i)' % tuple(X_fin_s.shape))
print(y_new.head(), y.head(), sep='\n')

----------------------------------------------------------------------------------------
X_fin shape:(614,10)
X_fin_s shape:(333,10)
   Loan_Status
0            0
1            1
2            0
3            0
4            0
  Loan_Status
0           Y
1           N
2           Y
3           Y
4           Y


In [21]:
# import basic APIs
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Holdout: keep 20% of the rows for evaluation.
# NOTE(review): the imputer and the RFE selector were fitted on all rows before
# this split, so the held-out accuracies below may be optimistic.
X_train,X_test,y_train,y_test = train_test_split(X_fin,y,test_size=0.20, random_state=1)
# Flatten the one-column target frames to 1-D arrays once.
# DataFrame.as_matrix() was removed in pandas 1.0; .values works on every version.
y_train_arr = y_train.values.ravel()
y_test_arr = y_test.values.ravel()

# set pipelines for different algorithms (each one standardises the features first)
pipe_knn = Pipeline([('scl',StandardScaler()),('est',KNeighborsClassifier())])
pipe_logistic = Pipeline([('scl',StandardScaler()),('est',LogisticRegression(random_state=1))])
pipe_rf = Pipeline([('scl',StandardScaler()),('est',RandomForestClassifier(random_state=1))])
pipe_gb = Pipeline([('scl',StandardScaler()),('est',GradientBoostingClassifier(random_state=1))])
pipe_mlp = Pipeline([('scl',StandardScaler()),('est',MLPClassifier(hidden_layer_sizes=(5,2), max_iter=500, random_state=1))])

# fit & evaluation: train on the training split, report accuracy on the held-out split
pipe_names = ['KNN','Logistic','RandomForest','GradientBoosting','MLP']
pipe_lines = [pipe_knn, pipe_logistic, pipe_rf, pipe_gb, pipe_mlp]
for name, pipe in zip(pipe_names, pipe_lines):
    pipe.fit(X_train, y_train_arr)
    print('%s: %.3f'%(name, accuracy_score(y_test_arr, pipe.predict(X_test))))

KNN: 0.756
Logistic: 0.805
RandomForest: 0.764
GradientBoosting: 0.789
MLP: 0.789


In [23]:
# Predict with the logistic-regression pipeline: refit it on all labelled rows,
# then classify the scoring set.
# DataFrame.as_matrix() was removed in pandas 1.0; .values works on every version.
pipe_logistic.fit(X_fin, y.values.ravel())

# Wrap the predicted labels in a frame and attach them to the scoring IDs
# (join aligns on the row index).
predict = pd.DataFrame(pipe_logistic.predict(X_fin_s), columns=['Loan_Status'])
ID_s.join(predict)

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y
