# Home Credit Default Risk

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from pandas.plotting import scatter_matrix
import scipy.stats
from sklearn.feature_selection import SelectPercentile, SelectFromModel

In [None]:
ls ../input

In [None]:
# Read the three competition tables in one pass: the labelled training
# applications, the unlabelled test applications, and the sample submission.
df, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
        '../input/home-credit-default-risk/sample_submission.csv',
    )
)


In [None]:
# Keep the test-set applicant IDs aside before `test` is transformed; they are
# needed again when the submission table is assembled.
# Select the column by label rather than by position so that a change in the
# CSV column order cannot silently pick up a different column.
SK_ID_CURR = test['SK_ID_CURR']
test.shape, SK_ID_CURR.shape

In [None]:
# --- Categorical (string) columns --------------------------------------------
# Columns stored with object dtype are treated as categorical variables.
objct_cols = df.select_dtypes(include=object)
objct_cols_list = objct_cols.columns

# Give missing categories an explicit label so they receive their own encoding.
df[objct_cols_list] = df[objct_cols_list].fillna('Missing_Data')
test[objct_cols_list] = test[objct_cols_list].fillna('Missing_Data')

# --- Target encoding -----------------------------------------------------------
# Replace every category by the mean TARGET of the df rows that carry it, and
# apply the same mapping to the test set.
# NOTE(review): the means are computed on all rows of df, including rows that
# later end up in the hold-out split below, so the hold-out metrics may be
# optimistic.
for col in objct_cols_list:
    label_mean = df.groupby(col).TARGET.mean()
    df[col] = df[col].map(label_mean)
    # Categories that never occur in df map to NaN here; the median fill
    # further down takes care of them.
    test[col] = test[col].map(label_mean)

# --- Missing values --------------------------------------------------------------
# Training data: keep only rows without any missing value.
# (Imputing with df.mean() or df.median() would be an alternative.)
df = df.dropna()
if df.empty:
    raise ValueError('No complete rows left in df after dropna(); cannot fit the models.')
# Test data: every row has to be scored, so fill gaps with the per-column
# median of the remaining training rows.
test = test.fillna(df.median())

# --- Split features and label ----------------------------------------------------
target = df.pop('TARGET')

# --- Univariate feature selection ------------------------------------------------
# Keep the top 4% of the columns as ranked by SelectPercentile's default
# scoring function.
select = SelectPercentile(percentile=4)
select.fit(df, target)
mask = select.get_support()  # boolean mask over df.columns
df_selected = select.transform(df)  # type: DataFrame -> ndarray

# Names of the retained columns; reused later to subset the test set.
masked_list = df.columns[mask]
print(masked_list)

# Convert the label Series to an ndarray.
target = target.values

# --- Hold-out split ----------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(df_selected, target, test_size=0.25, random_state=42)

# --- Stage 1: random forest on the selected features -------------------------------
model_2 = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
model_2.fit(X_train, y_train)
y_train_pred_2 = model_2.predict(X_train)
y_test_pred_2 = model_2.predict(X_test)

# --- Stage 2: same forest settings, with the stage-1 class prediction appended -----
X_train = np.concatenate([X_train, y_train_pred_2.reshape(-1, 1)], axis=1)
X_test = np.concatenate([X_test, y_test_pred_2.reshape(-1, 1)], axis=1)

model_3 = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
model_3.fit(X_train, y_train)

y_pred_3 = model_3.predict(X_test)
y_pred_proba_3 = model_3.predict_proba(X_test)

# --- Hold-out evaluation -----------------------------------------------------------
# Reuse y_pred_3 for the accuracy instead of predicting a second time.
print('Accuracy\n', accuracy_score(y_test, y_pred_3))
print('\nROC AUC SCORE\n', roc_auc_score(y_test, y_pred_proba_3[:,1]))
print('\nConfusion Matrix\n',confusion_matrix(y_test, y_pred_3))

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_3[:,1])

plt.plot(fpr, tpr, marker='o')
plt.title('ROC curve (model_3, hold-out split)')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [None]:
# Restrict the test set to the columns kept by the feature selector, in the
# same order as the training matrix, and show the resulting shape.
test = test.loc[:, masked_list]
test.shape

In [None]:
# Extend the test matrix with the stage-1 class prediction, mirroring how the
# training matrices were extended before model_3 was fitted.
#
# Only the first len(masked_list) columns are fed to model_2. On the first run
# that is the whole of `test`; on a re-run it strips the prediction column this
# cell appended earlier, so the cell can be executed repeatedly. Converting to a
# plain array also matches how model_2 was fitted (on an ndarray, without
# column names).
test_features = np.asarray(test)[:, :len(masked_list)]
test_pred = model_2.predict(test_features)
test = np.concatenate([test_features, test_pred.reshape(-1, 1)], axis=1)

In [None]:
# Class-probability matrix of the stage-2 model for the extended test matrix,
# followed by its shape as the cell output.
y_pred_proba = model_3.predict_proba(X=test)
np.shape(y_pred_proba)

In [None]:
# Pair every test applicant ID with the second column of the stage-2
# probability matrix and write the table to disk without the index column.
# NOTE(review): column 1 is presumably P(TARGET == 1) - confirm via model_3.classes_.
positive_proba = y_pred_proba[:, 1]
Submission = pd.DataFrame(
    data={'SK_ID_CURR': SK_ID_CURR, 'TARGET': positive_proba},
)
Submission.to_csv("Submission.csv", index=False)

In [None]:
# Show the submission table as the cell's rich output.
Submission