In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, so the available dataset files are visible in the cell output.
# The directory-name list yielded by os.walk is not needed and is discarded as `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        

import matplotlib.pyplot as plt
import seaborn as sns
# Any results you write to the current directory are saved as output.

In [None]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 (presumably so that long
// cell outputs are shown in full instead of inside a scroll box).
// NOTE(review): relies on the `IPython` JavaScript global being present in the front end - confirm.
IPython.OutputArea.auto_scroll_threshold = 9999;

## 各ファイルの内容のサマリ 及び　ER図

|ファイル名                 | 意味                                                                                                         |
|-------------------------|--------------------------------------------------------------------------------------------------------------|
|application_{train/test}.csv | 顧客一人一人の主要な情報（性別や家族の人数、資産額、車の有無などの情報）。目的変数は債務不履行か、非債務不履行かのラベルデータ。    |
|bureau.csv               | Home Credit社以外のクレジットビューローでの融資情報（月次データ）|
|bureau_balance.csv       | bureauの負債残高の履歴（月次データ）|
|POS_CASH_balance.csv     | Home Credit社における過去のPOSローン・キャッシュローンの残高履歴（月次データ）|
| previous_application.csv | Home Credit社における過去の融資情報 |
| installments_payments.csv | 過去の融資の残高履歴 |
| HomeCredit_columns_description.csv | 各列の説明|

|カラム名補足|
|----------|
|"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)",|
![](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)


## データの前処理
ネタ元：　https://www.kaggle.com/c/home-credit-default-risk/data
### CSVデータ読み込み

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

In [None]:
# Read the main application tables (train and test) from the competition input directory.
application_train, application_test = map(
    pd.read_csv,
    (
        '/kaggle/input/home-credit-default-risk/application_train.csv',
        '/kaggle/input/home-credit-default-risk/application_test.csv',
    ),
)

In [None]:
# Report the (rows, columns) shape of the training table, then preview its first rows.
print('Training data shape: ', application_train.shape)
application_train.head()

In [None]:
# Load the remaining tables of the dataset from the same input directory:
# credit-bureau records and their balances, previous applications, and the
# POS/cash, instalment-payment and credit-card balance histories.
# They are only summarised with describe() at the end of this notebook.
bureau = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau.csv')
bureau_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau_balance.csv')
previous_application = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv')
POS_CASH_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv')
installments_payments = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv')
credit_card_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv')

# 検討の仕方
債務不履行者と，そうでない人々に何か違いがあるのだろうか<br>
【検討1】各カラムのデータ（単品・及びその組合わせ）と，TARGETとで明確な区別があるのだろうか。TARGET=0/1でグルーピングしてみて，相関の有無を確認する<br>
【検証1】何のカラムのデータと関係がありそうかが出てきたら，Random ForestでMSEを出してみる
### 検討に際して事前に実施するべきこと
【前処理】各カラムのCategorical Data及びMissing Valueについては確認の上，処置する

In [None]:
# Names of the training columns that contain at least one missing value,
# kept in their original column order.
cols_with_missing = application_train.columns[
    application_train.isna().any().to_numpy()
].tolist()
# Preview the first 10 rows restricted to those columns.
application_train[cols_with_missing].head(10)

|カラム名 | 意味　 |　訳 | 考察/処置|
|--------|-------|----|-----|
|AMT_ANNUITY | Loan annuity | ローンの定期返済額 | 支払いがある場合，Home Credit Default Riskに影響するかもしれないが，額にもよる<br>　AMT_INCOME_TOTALから差し引きしてしまうというのも手ではないか 一旦fillna(0)で0埋めしておく|
|AMT_GOODS_PRICE | For consumer loans it is the price of the goods for which the loan is given | 消費者金融からの商品の価格（？）つまり消費者金融から借りている額ということか？|AMT_ANNUITYと扱い同じでよいかも|
|NAME_TYPE_SUITE| Who accompanied client when applying for the previous application | 誰を伴ってローン組みに来たかっていうこと？ | Dropする |
|OWN_CAR_AGE | Age of client's car | 所有している車の車齢（年数） | Dropする |
| OCCUPATION_TYPE | What kind of occupation does the client have | 役職 | 社会的な地位に関わるため，あると良いのかもしれないが，会社によってネーミングが違うので，扱い方は要検討。<br> 返済のことを考えれば，意味がなく，AMT_INCOME_TOTALだけが意味がある，ということかもしれない |
|CNT_FAM_MEMBERS| How many family members does client have | 家族の人数 | 家族がいたほうが信用力がある？というふうに捉えられるかもしれない。2以上だと夫子or妻子＋αであるから，0（単身者）,1（夫妻）,2（子供もしくはプラスα）という扱いが良いのでは|
| EXT_SOURCE_1,2,3 |  Normalized score from external data source | <a href="https://www.kaggle.com/c/home-credit-default-risk/discussion/57363">ディスカッション参照</a><br>なんらかの信用情報を反映したものと思われる。| あれば，参考になるが，なければ，bureauを参考にするんだろうな。一旦Drop |
|XXXXXX_AVG/MODE | Normalized information about building where the client lives, <br>What is average (AVG suffix), modus (MODE suffix), median (MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor | 住んでる家のスコアみたいなもの | あると，参考になるかも。高級なところに住んでる人は信用力高いかも。よくなさそうなら信用力低いかも。<br>しかしimputationの仕方で良いものが思いつかないから，一旦Dropする<br>他のAVG/MODE/MEDIカラムの平均値をとって，Missingの項目があれば，除外，という形で一旦使うか。|
|OBS/DEF_X_CNT_SOCIAL_CIRCLE | How many observation of client's social surroundings defaulted on 30 DPD (days past due) | よくわからん<br><a href="https://www.kaggle.com/c/home-credit-default-risk/discussion/58200">ディスカッション参照</a> |債務不履行になった人間のあるやなしや？ということ？よくわからんわ 一旦dropで|
|DAYS_LAST_PHONE_CHANGE|How many days before application did client change phone|電話変えてから経過した日数|携帯電話代が払える/払えない人の区別に使っているんだとしたら有用かもしれないが。一旦drop|
|AMT_REQ_CREDIT_BUREAU_X | Number of enquiries to Credit Bureau about the client X before application | 申込前の期間Xに，その顧客についてクレジットビューローへ何件の照会（問い合わせ）があったか | 有ってことは信用力に疑問がある人なのかもという推測はできるが，断定はできないためdropでいいのでは|

In [None]:
# Remove every column that has missing values in the training table from both
# the training table and the test table (kept here under the name reduced_X_valid).
reduced_X_train = application_train.drop(columns=cols_with_missing)
reduced_X_valid = application_test.drop(columns=cols_with_missing)

In [None]:
# Preview the first 10 rows of the training table after the columns with missing values were dropped.
reduced_X_train.head(10)

## 【前処理】　②Categorical Valueの処置の仕方の検討

In [None]:
# Columns whose dtype is `object` are the ones holding categorical values.
s = reduced_X_train.dtypes.eq('object')
object_cols = s.index[s].tolist()
object_cols

Categorical Valueについては後ほど処置を考える。一旦は放置

In [None]:
# Categorical (object-dtype) columns are set aside for now: drop them from both frames.
reduced_X_train = reduced_X_train.drop(columns=object_cols)
reduced_X_valid = reduced_X_valid.drop(columns=object_cols)

In [None]:
# Add the credit-amount to total-income ratio as a new feature on both frames.
for frame in (reduced_X_train, reduced_X_valid):
    frame['loan_vs_income'] = frame['AMT_CREDIT'].div(frame['AMT_INCOME_TOTAL'])

In [None]:
# #Categorical Valueへの処置は後ほど考える
# print("Categorical variables:")
# drop_X_train[object_cols].head(10)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Make copy to avoid changing original data 
# label_X_train = application_train.copy()
# label_X_valid = application_test.copy()

# # Label encodeingによってcategorical variablesを数値データに変換する fit_transform&transform
# # Apply label encoder to each column with categorical data
# label_encoder = LabelEncoder()
# for col in object_cols:
#     label_X_train[col] = label_encoder.fit_transform(application_train[col])
#     label_X_valid[col] = label_encoder.transform(application_test[col])

## グループ分け:債務不履行者とそれ以外で実データと各カラムの統計情報

In [None]:
# Split the training rows by label. TARGET == 1 marks clients with payment
# difficulties and TARGET == 0 all other cases (see the column description
# quoted in this notebook). The numeric suffix of each variable is the TARGET
# value it holds, which matches X_train_target0_desc / X_train_target1_desc.
# The two filters were previously swapped, so the plots paired each group's
# histogram with the other group's mean/std.
X_train_target0 = reduced_X_train[reduced_X_train['TARGET'] == 0]
X_train_target1 = reduced_X_train[reduced_X_train['TARGET'] == 1]

In [None]:
# Summary statistics for the group of defaulting clients (TARGET == 1).
X_train_target1_desc = reduced_X_train.loc[reduced_X_train['TARGET'].eq(1)].describe()
X_train_target1_desc


In [None]:
# Summary statistics for the group of non-defaulting clients (TARGET == 0).
X_train_target0_desc = reduced_X_train.loc[reduced_X_train['TARGET'].eq(0)].describe()
X_train_target0_desc

In [None]:
# Number of rows in each of the two TARGET groups, one count per line.
print(len(X_train_target0), len(X_train_target1), sep='\n')

In [None]:
# Mean AMT_INCOME_TOTAL of the X_train_target0 group: sum of incomes divided by
# the number of rows. The previous expression divided every individual row by
# the group size, which yields a per-row Series rather than a group statistic.
X_train_target0['AMT_INCOME_TOTAL'].sum() / len(X_train_target0)

In [None]:
# List the names of the remaining columns, one per line.
print(*X_train_target0.columns, sep='\n')

## グラフ描画関数　左：ヒストグラム，右：平均・分散グラフ

In [None]:
def plot_hist_mean(X_target0,X_target1,X_target0_desc,X_target1_desc):
    """Draw the per-column comparison figure for every column of ``X_target0``.

    For each column name, delegates to ``plot_hist_mean_every_column`` with the
    two data frames and their two summary frames. Returns ``None``.
    """
    column_names = list(X_target0.columns)
    for name in column_names:
        plot_hist_mean_every_column(
            X_target0, X_target1, X_target0_desc, X_target1_desc, name
        )

In [None]:
def plot_hist_mean_every_column(X_target0,X_target1,X_target0_desc,X_target1_desc,column):
    """Plot one column for two groups: overlaid histograms (left) and mean/std bars (right).

    Parameters
    ----------
    X_target0, X_target1 : DataFrame
        Row subsets for the two groups being compared; both must contain ``column``.
    X_target0_desc, X_target1_desc : DataFrame
        Summary tables for the two groups in which ``desc[column]['mean']`` and
        ``desc[column]['std']`` can be looked up (e.g. ``DataFrame.describe()`` output).
    column : str
        Name of the column to plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Group labels, shared by the histogram legend and the bar-chart x axis.
    x0 = 'X_target0'
    x1 = 'X_target1'

    # Set the font size before the figure is created so that it also applies
    # to this figure (not only to figures created afterwards).
    plt.rcParams["font.size"] = 18

    fig, (axL, axR) = plt.subplots(ncols=2, figsize=(25,5),facecolor='w')

    # Left panel: normalised histograms of both groups on a log-scaled y axis.
    axL.set_yscale('log')
    axL.set_xlabel(column, fontsize=18)
    # density=True normalises the histograms, so the y axis is a density, not a count.
    axL.set_ylabel('density',fontsize=18)
    axL.grid(True)
    axL.hist(X_target0[column],alpha=0.5, color='blue',bins=50, density=True, label=x0)
    axL.hist(X_target1[column],alpha=0.5, color='red',bins=50, density=True, label=x1)
    axL.legend()

    # Right panel: group means as bars with the standard deviation as error bars.
    axR.grid(True)
    # Label the axis with the column actually plotted (it was hard-coded to
    # "AMT_INCOME_TOTAL" for every column before).
    axR.set_ylabel(f"{column} (mean, error bars: std)")
    axR.bar([x0, x1] ,[X_target0_desc[column]['mean'],X_target1_desc[column]['mean']], 
            yerr = [X_target0_desc[column]['std'],X_target1_desc[column]['std']], color = ['blue', 'red'], alpha = 0.6)

    # plt.show() renders the figure on notebook backends; fig.show() emits a
    # warning on non-GUI backends and leaves the figure open across the loop.
    plt.show()


# 各カラム毎にグラフを作ってみる


In [None]:
# Draw the histogram and mean/std panels for every column, comparing the two TARGET groups.
plot_hist_mean(
    X_target0=X_train_target0,
    X_target1=X_train_target1,
    X_target0_desc=X_train_target0_desc,
    X_target1_desc=X_train_target1_desc,
)

In [None]:
# Summary statistics of the bureau table.
bureau.describe()

In [None]:
# Summary statistics of the bureau_balance table.
bureau_balance.describe()

In [None]:
# Summary statistics of the previous_application table.
previous_application.describe()

In [None]:
# Summary statistics of the POS_CASH_balance table.
POS_CASH_balance.describe()

In [None]:
# Summary statistics of the installments_payments table.
installments_payments.describe()

In [None]:
# Summary statistics of the credit_card_balance table.
credit_card_balance.describe()