# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [22]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Set the data path: directory that the input CSV file is read from
dir_data = './data/'

### 之前做過的處理

In [23]:
# Read the data file: build the path of application_train.csv under dir_data
# and load it into the app_train DataFrame
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
# Last expression of the cell: displays the (rows, columns) shape of the frame
app_train.shape

(307511, 122)

In [24]:
# Label-encode the categorical (object) columns that hold at most two distinct
# values, so that these columns can be included when computing correlations
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Inspect every column of the frame
for col in app_train:
    column = app_train[col]
    # Only categorical (object) columns are candidates
    if column.dtype != 'object':
        continue
    # Leave columns with more than two distinct values untouched
    if len(column.unique()) > 2:
        continue
    # Replace the column with its integer label codes
    app_train[col] = le.fit_transform(column)
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Rows whose DAYS_EMPLOYED equals the anomalous value 365243 are flagged in a
# separate boolean column, then the anomalous values are turned into missing
# values (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: with pandas Copy-on-Write the in-place call only modifies a
# temporary object and would leave app_train unchanged (pandas 2.x warns about
# this pattern)
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Take the absolute value of the days-of-birth column (DAYS_BIRTH)
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [26]:
# Express the age in years (DAYS_BIRTH divided by 365)
app_train['YEARS_BIRTH'] = app_train['DAYS_BIRTH'].div(365)
# Discretize the age in years into 10 interval bins
app_train["YEARS_BIRTH_CATEGORIES"] = pd.cut(app_train["YEARS_BIRTH"], bins=10)
print(app_train["YEARS_BIRTH_CATEGORIES"])

0         (25.378, 30.238]
1         (44.819, 49.679]
2          (49.679, 54.54]
3          (49.679, 54.54]
4            (54.54, 59.4]
                ...       
307506    (25.378, 30.238]
307507       (54.54, 59.4]
307508    (39.959, 44.819]
307509    (30.238, 35.099]
307510    (44.819, 49.679]
Name: YEARS_BIRTH_CATEGORIES, Length: 307511, dtype: category
Categories (10, interval[float64]): [(20.469, 25.378] < (25.378, 30.238] < (30.238, 35.099] < (35.099, 39.959] ... (49.679, 54.54] < (54.54, 59.4] < (59.4, 64.26] < (64.26, 69.121]]


In [27]:
# Describe the bin label within each age bin (count / unique / top / freq),
# which shows how many rows fall into every bin
(
    app_train
    .groupby("YEARS_BIRTH_CATEGORIES")["YEARS_BIRTH_CATEGORIES"]
    .describe()
)

Unnamed: 0_level_0,count,unique,top,freq
YEARS_BIRTH_CATEGORIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(20.469, 25.378]",13679,1,"(20.469, 25.378]",13679
"(25.378, 30.238]",33127,1,"(25.378, 30.238]",33127
"(30.238, 35.099]",38430,1,"(30.238, 35.099]",38430
"(35.099, 39.959]",41758,1,"(35.099, 39.959]",41758
"(39.959, 44.819]",40350,1,"(39.959, 44.819]",40350
"(44.819, 49.679]",34311,1,"(44.819, 49.679]",34311
"(49.679, 54.54]",33544,1,"(49.679, 54.54]",33544
"(54.54, 59.4]",32650,1,"(54.54, 59.4]",32650
"(59.4, 64.26]",27685,1,"(59.4, 64.26]",27685
"(64.26, 69.121]",11977,1,"(64.26, 69.121]",11977


In [30]:
# Summary statistics of TARGET within each age bin, to see how the target
# value varies across the discretized age groups
(
    app_train
    .groupby("YEARS_BIRTH_CATEGORIES")['TARGET']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
YEARS_BIRTH_CATEGORIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(20.469, 25.378]",13679.0,0.122012,0.327311,0.0,0.0,0.0,0.0,1.0
"(25.378, 30.238]",33127.0,0.11139,0.314619,0.0,0.0,0.0,0.0,1.0
"(30.238, 35.099]",38430.0,0.1019,0.30252,0.0,0.0,0.0,0.0,1.0
"(35.099, 39.959]",41758.0,0.089468,0.285421,0.0,0.0,0.0,0.0,1.0
"(39.959, 44.819]",40350.0,0.078984,0.269717,0.0,0.0,0.0,0.0,1.0
"(44.819, 49.679]",34311.0,0.074437,0.262484,0.0,0.0,0.0,0.0,1.0
"(49.679, 54.54]",33544.0,0.067911,0.251597,0.0,0.0,0.0,0.0,1.0
"(54.54, 59.4]",32650.0,0.055926,0.229783,0.0,0.0,0.0,0.0,1.0
"(59.4, 64.26]",27685.0,0.052953,0.223943,0.0,0.0,0.0,0.0,1.0
"(64.26, 69.121]",11977.0,0.041997,0.200591,0.0,0.0,0.0,0.0,1.0
