# 作業 : (Kaggle)鐵達尼生存預測
***
- 分數以網站評分結果為準, 請同學實際將提交檔(*.csv)上傳試試看  
https://www.kaggle.com/c/titanic/submit

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀查堆疊泛化 (Stacking) 的寫法與效果

# [作業重點]
- 完成堆疊泛化的寫作, 看看提交結果, 想想看 : 分類與回歸的堆疊泛化, 是不是也與混合泛化一樣有所不同呢?(In[14])  
如果可能不同, 應該怎麼改寫會有較好的結果?  
- Hint : 請參考 mlxtrend 官方網站 StackingClassifier 的頁面說明 : Using Probabilities as Meta-Features
http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

data_path = 'data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio
Cabin,77.463713
Age,20.091673
Embarked,0.152788
Fare,0.076394


In [3]:
# 以下 In[3]~In[10] 只是鐵達尼預測中的一組特徵工程, 並以此組特徵工程跑參數, 若更換其他特徵工程, In[10]的參數需要重新跑
# Sex : 直接轉男 0 女 1
df["Sex"] = df["Sex"].map({"male": 0, "female":1})
# Fare : 用 log 去偏態, 0 則直接取 0
df["Fare"] = df["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
# Age : 缺值用中位數補
df["Age"] = df["Age"].fillna(df['Age'].median())

In [4]:
# Title 的 特徵工程 : 將各種頭銜按照類型分類, 最後取 One Hot
df_title = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]
df["Title"] = pd.Series(df_title)
df["Title"] = df["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df["Title"] = df["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
df["Title"] = df["Title"].astype(int)
df = pd.get_dummies(df, columns = ["Title"])

In [5]:
# 新建:家庭大小 (Fsize)特徵, 並依照大小分別建獨立欄位
df["Fsize"] = df["SibSp"] + df["Parch"] + 1
df['Single'] = df['Fsize'].map(lambda s: 1 if s == 1 else 0)
df['SmallF'] = df['Fsize'].map(lambda s: 1 if  s == 2  else 0)
df['MedF'] = df['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
df['LargeF'] = df['Fsize'].map(lambda s: 1 if s >= 5 else 0)

In [6]:
# Ticket : 如果不只是數字-取第一個空白之前的字串(去除'.'與'/'), 如果只是數字-設為'X', 最後再取 One Hot
Ticket = []
for i in list(df.Ticket):
    if not i.isdigit() :
        Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0])
    else:
        Ticket.append("X")        
df["Ticket"] = Ticket
df = pd.get_dummies(df, columns = ["Ticket"], prefix="T")

In [7]:
# Cabib 依照第一碼分類, 再取 One Hot
df["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in df['Cabin'] ])
df = pd.get_dummies(df, columns = ["Cabin"], prefix="Cabin")

In [8]:
# Embarked, Pclass 取 One Hot
df = pd.get_dummies(df, columns = ["Embarked"], prefix="Em")
df["Pclass"] = df["Pclass"].astype("category")
df = pd.get_dummies(df, columns = ["Pclass"], prefix="Pc")

# 捨棄 Name 欄位
df.drop(labels = ["Name"], axis = 1, inplace = True)

In [9]:
na_check(df)
df.head()

Unnamed: 0,Missing Ratio


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_0,Title_1,Title_2,Title_3,Fsize,...,Cabin_F,Cabin_G,Cabin_T,Cabin_X,Em_C,Em_Q,Em_S,Pc_1,Pc_2,Pc_3
0,0,22.0,1,0,1.981001,0,0,1,0,2,...,0,0,0,1,0,0,1,0,0,1
1,1,38.0,1,0,4.266662,0,1,0,0,2,...,0,0,0,0,1,0,0,1,0,0
2,1,26.0,0,0,2.070022,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,1
3,1,35.0,1,0,3.972177,0,1,0,0,2,...,0,0,0,0,0,0,1,1,0,0
4,0,35.0,0,0,2.085672,0,0,1,0,1,...,0,0,0,1,0,0,1,0,0,1


In [10]:
# 將資料最大最小化
df = MinMaxScaler().fit_transform(df)

# 將前述轉換完畢資料 df , 重新切成 train_X, test_X
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

# 使用三種模型 : 邏輯斯迴歸 / 梯度提升機 / 隨機森林, 參數使用 Random Search 尋找
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250, max_features=20,
                                  max_depth=6, learning_rate=0.03)
rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)

In [11]:
# 線性迴歸預測檔 (結果有部分隨機, 請以 Kaggle 計算的得分為準, 以下模型同理)
lr.fit(train_X, train_Y)
lr_pred = lr.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': lr_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_lr.csv', index=False) 

In [12]:
# 梯度提升機預測檔 
gdbt.fit(train_X, train_Y)
gdbt_pred = gdbt.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': gdbt_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_gdbt.csv', index=False)

In [13]:
# 隨機森林預測檔
rf.fit(train_X, train_Y)
rf_pred = rf.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'PassengerId': ids, 'Survived': rf_pred})
sub['Survived'] = sub['Survived'].map(lambda x:1 if x>0.5 else 0) 
sub.to_csv('titanic_rf.csv', index=False)

# 作業
* 分類預測的集成泛化, 也與回歸的很不一樣  
既然分類的 Blending 要變成機率, 才比較容易集成,
那麼分類的 Stacking 要讓第一層的模型輸出機率當特徵, 應該要怎麼寫呢?

In [19]:
from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.70, n_estimators=50, 
                                           max_features='sqrt', max_depth=4, learning_rate=0.3)
"""
Your Code Here
"""

lr = LogisticRegression()

stacking = StackingClassifier(classifiers=[lr_pred, gdbt_pred, rf_pred],
                              meta_classifier=lr)




In [20]:
stacking.fit(train_X, train_Y)
stacking_pred = stacking.predict(test_X)
sub = pd.DataFrame({'PassengerId': ids, 'Survived': stacking_pred})
sub.to_csv('titanic_stacking.csv', index=False)

TypeError: Cannot clone object 'array([0.08340213, 0.55613265, 0.5172207 , 0.42358866, 0.20582841,
       0.08574486, 0.5752319 , 0.8127357 , 0.75486182, 0.29563642,
       0.3079404 , 0.71764871, 0.52867301, 0.09034346, 0.74618298,
       0.87074111, 0.82933418, 0.11534468, 0.54398367, 0.67054224,
       0.20051811, 0.29630431, 0.86368803, 0.46395792, 0.55439108,
       0.20784936, 0.65092902, 0.15140481, 0.66888454, 0.10934535,
       0.18057658, 0.7732931 , 0.61256254, 0.09628689, 0.31884283,
       0.11932551, 0.19393326, 0.21359198, 0.35513984, 0.42456069,
       0.3773356 , 0.70601998, 0.06332581, 0.81499056, 0.83558465,
       0.29531926, 0.25743245, 0.40951285, 0.57892324, 0.62105188,
       0.86321054, 0.25567984, 0.93488359, 0.52347269, 0.43879018,
       0.01986074, 0.3132535 , 0.07103335, 0.30062153, 0.9749136 ,
       0.0800061 , 0.74871767, 0.10673939, 0.87652637, 0.08133186,
       0.954442  , 0.83002348, 0.22104246, 0.79034591, 0.09949578,
       0.25819965, 0.36116655, 0.19108814, 0.3058762 , 0.70996508,
       0.58992978, 0.06867363, 0.38790276, 0.79447694, 0.70639526,
       0.14636321, 0.14183416, 0.63417878, 0.06843918, 0.64776177,
       0.40573204, 0.25007648, 0.16520783, 0.7601056 , 0.23265405,
       0.17785695, 0.06825282, 0.77935624, 0.06867363, 0.37005963,
       0.0712702 , 0.48544036, 0.55417532, 0.65172192, 0.05623545,
       0.89564313, 0.14710413, 0.24036199, 0.07022889, 0.30055123,
       0.15521797, 0.43611869, 0.09118368, 0.06964005, 0.61692001,
       0.27804095, 0.69366851, 0.56794457, 0.72121099, 0.79851981,
       0.10325175, 0.10279897, 0.36540496, 0.57839954, 0.81163036,
       0.46879942, 0.0690594 , 0.58585043, 0.68129393, 0.24036199,
       0.83512696, 0.0743754 , 0.25548675, 0.67456745, 0.07270129,
       0.066308  , 0.28012751, 0.19547069, 0.36930988, 0.05515383,
       0.0723868 , 0.72317374, 0.2252838 , 0.22605915, 0.01736202,
       0.1780746 , 0.90992521, 0.23803565, 0.18681082, 0.21894865,
       0.05771652, 0.28291386, 0.34889784, 0.39068517, 0.24330563,
       0.75553598, 0.53548773, 0.02927092, 0.18198367, 0.0094723 ,
       0.03007115, 0.90027805, 0.20543474, 0.21894865, 0.8552261 ,
       0.24655249, 0.53855853, 0.40563843, 0.06699608, 0.76557183,
       0.84349071, 0.91546262, 0.03335522, 0.64949052, 0.21210601,
       0.06181058, 0.70181807, 0.31097363, 0.10484753, 0.01440618,
       0.48842426, 0.94777004, 0.75678394, 0.39279671, 0.49826344,
       0.57448594, 0.30171795, 0.95662835, 0.78561464, 0.9332129 ,
       0.24984499, 0.63594825, 0.04979937, 0.01701589, 0.143511  ,
       0.51711001, 0.25709367, 0.36735723, 0.85754002, 0.83092617,
       0.51494949, 0.50924948, 0.21790268, 0.5938997 , 0.62483066,
       0.27188172, 0.10842477, 0.26786709, 0.48013254, 0.17025216,
       0.71635597, 0.22723348, 0.17056049, 0.90268458, 0.12815791,
       0.14734526, 0.35850864, 0.17123382, 0.25334323, 0.01605614,
       0.86981953, 0.69418349, 0.22217883, 0.97464795, 0.06867363,
       0.43798118, 0.07586375, 0.22780795, 0.07543782, 0.57987052,
       0.298475  , 0.07332675, 0.24662856, 0.06498917, 0.55185925,
       0.76632078, 0.55792877, 0.05506493, 0.410272  , 0.24915409,
       0.36447919, 0.22615753, 0.47785107, 0.4098924 , 0.50438841,
       0.894365  , 0.79854293, 0.34875836, 0.06842645, 0.03162192,
       0.34752909, 0.88915951, 0.54791619, 0.63251059, 0.33904203,
       0.49681671, 0.31361848, 0.44700608, 0.07460544, 0.32926487,
       0.33783019, 0.4152647 , 0.53522903, 0.8390831 , 0.36049156,
       0.05249029, 0.61789916, 0.74882398, 0.4465069 , 0.6883955 ,
       0.06843918, 0.15714757, 0.06873623, 0.70454123, 0.46139231,
       0.3424894 , 0.09118368, 0.92968043, 0.27880202, 0.44736828,
       0.93177485, 0.6004423 , 0.12381241, 0.76030484, 0.38460797,
       0.20848658, 0.11224039, 0.24662856, 0.34823231, 0.4251759 ,
       0.05284024, 0.07517261, 0.25291867, 0.10484753, 0.30780912,
       0.66951946, 0.82925743, 0.2697452 , 0.64764283, 0.06306421,
       0.07042023, 0.55061335, 0.5430154 , 0.34323367, 0.53024896,
       0.32259248, 0.25133025, 0.0523833 , 0.70158192, 0.24662856,
       0.94254003, 0.73320824, 0.46736439, 0.20915676, 0.81848123,
       0.45758449, 0.66725355, 0.34034309, 0.22189401, 0.56420054,
       0.73274439, 0.66617925, 0.26110492, 0.41905516, 0.8694504 ,
       0.07022889, 0.10924359, 0.58940485, 0.68986147, 0.58675534,
       0.45796522, 0.45147437, 0.89965154, 0.53621814, 0.77786585,
       0.79945495, 0.34734764, 0.14088593, 0.31560105, 0.34022391,
       0.25128071, 0.20963486, 0.73873022, 0.20176585, 0.13689912,
       0.89296973, 0.39889322, 0.0082101 , 0.55628776, 0.06123556,
       0.78510482, 0.58940485, 0.6938271 , 0.81019563, 0.50272726,
       0.50716319, 0.29429315, 0.16906897, 0.14380174, 0.27451359,
       0.19806484, 0.95263033, 0.33670157, 0.40951285, 0.57468452,
       0.00648308, 0.55802648, 0.82801514, 0.0705683 , 0.62067493,
       0.02629949, 0.68286889, 0.71166235, 0.89986346, 0.83668086,
       0.4093131 , 0.70487362, 0.1190373 , 0.13613638, 0.85519656,
       0.92201803, 0.61137961, 0.23473783, 0.20571349, 0.01069518,
       0.40951285, 0.417711  , 0.21199475, 0.61685964, 0.16847566,
       0.4456717 , 0.6065232 , 0.47158411, 0.10047634, 0.0767934 ,
       0.51838956, 0.34599618, 0.09767569, 0.76146235, 0.04911924,
       0.53284369, 0.4213311 , 0.60146288, 0.11333853, 0.39834037,
       0.60859691, 0.12274475, 0.9274622 , 0.31438748, 0.69752813,
       0.27356444, 0.13373975, 0.91451099, 0.24649545, 0.73367025,
       0.24662856, 0.55784283, 0.69818775, 0.06044518, 0.61345711,
       0.32354698, 0.33761136, 0.47633094])' (type <class 'numpy.ndarray'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.