# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [1]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Directory that holds the input data files (relative to the notebook's working directory)
dir_data = './data/'

### 之前做過的處理

In [2]:
# Build the path to the training CSV inside dir_data and load it into a DataFrame
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
# Display (rows, columns) as the cell output
app_train.shape

(307511, 122)

In [3]:
# Label-encode every categorical (object-dtype) column that holds at most two
# distinct values, so that those columns can take part in later correlation
# calculations.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in app_train.columns:
    # Only object-dtype (categorical) columns are candidates
    if app_train[col].dtype != 'object':
        continue
    # Leave columns with more than two distinct values untouched
    if len(app_train[col].unique()) > 2:
        continue
    # Replace the column with its integer label encoding
    app_train[col] = le.fit_transform(app_train[col])

print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Rows whose DAYS_EMPLOYED equals 365243 are treated as anomalous: remember
# which rows carried that value in a separate boolean column, then turn the
# anomalous value into a missing value (np.nan).
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: with pandas Copy-on-Write the in-place call operates on a
# temporary object and would leave app_train unchanged.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Take the absolute value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [10]:
# Discretize DAYS_EMPLOYED into 100 quantile-based bins (pd.qcut), so each bin
# holds roughly the same number of rows
app_train['custom_day_qcut'] = pd.qcut(app_train['DAYS_EMPLOYED'],100)
# Uncomment to compare the raw values with their assigned bins:
# app_train[['DAYS_EMPLOYED', 'custom_day_qcut']]
# Show how many rows fall into each bin
app_train['custom_day_qcut'].value_counts()

(-274.0, -248.0]       2584
(-1566.0, -1525.0]     2579
(-206.0, -185.0]       2575
(-617.0, -587.0]       2562
(-1080.0, -1048.0]     2559
(-798.0, -767.0]       2555
(-981.0, -949.0]       2554
(-1648.0, -1606.0]     2553
(-710.0, -679.0]       2553
(-767.0, -738.0]       2552
(-2792.0, -2725.0]     2551
(-857.0, -826.0]       2549
(-1362.0, -1322.0]     2549
(-440.0, -413.0]       2548
(-888.0, -857.0]       2546
(-557.0, -527.0]       2546
(-3675.8, -3564.0]     2544
(-1777.0, -1729.0]     2544
(-2095.0, -2036.0]     2541
(-1015.0, -981.0]      2540
(-3266.0, -3175.0]     2538
(-1924.0, -1871.0]     2538
(-1443.0, -1403.0]     2537
(-1250.0, -1213.0]     2537
(-413.0, -387.0]       2536
(-333.0, -303.0]       2535
(-1871.0, -1822.0]     2534
(-1213.0, -1179.0]     2534
(-3175.0, -3097.0]     2533
(-2457.0, -2396.0]     2533
                       ... 
(-1146.0, -1112.0]     2513
(-1179.0, -1146.0]     2512
(-111.0, 0.0]          2511
(-3792.0, -3675.8]     2510
(-4372.0, -4215.0]  

In [11]:
# Discretize DAYS_EMPLOYED into 100 equal-width bins (pd.cut); unlike qcut, the
# number of rows per bin can differ widely
app_train['custom_day_cut'] = pd.cut(app_train['DAYS_EMPLOYED'],100)
# Show how many rows fall into each bin
app_train['custom_day_cut'].value_counts()

(-358.24, -179.12]         18002
(-537.36, -358.24]         16372
(-895.6, -716.48]          15082
(-716.48, -537.36]         14856
(-1074.72, -895.6]         14050
(-1253.84, -1074.72]       13326
(-1432.96, -1253.84]       11635
(-1612.08, -1432.96]       11144
(-1791.2, -1612.08]        10499
(-179.12, 0.0]              9426
(-1970.32, -1791.2]         8928
(-2328.56, -2149.44]        7623
(-2149.44, -1970.32]        7537
(-2507.68, -2328.56]        7282
(-2686.8, -2507.68]         6745
(-2865.92, -2686.8]         6357
(-3045.04, -2865.92]        6063
(-3224.16, -3045.04]        5543
(-3403.28, -3224.16]        4746
(-3582.4, -3403.28]         4364
(-3761.52, -3582.4]         4049
(-3940.64, -3761.52]        3513
(-4119.76, -3940.64]        3141
(-4298.88, -4119.76]        2884
(-4478.0, -4298.88]         2739
(-4657.12, -4478.0]         2649
(-4836.24, -4657.12]        2359
(-5373.6, -5194.48]         2327
(-5194.48, -5015.36]        2212
(-5015.36, -4836.24]        2059
          