# Sprint アンサンブル学習

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
data_raw = pd.read_csv("/Users/morishuuya/Desktop/dataset/kaggle/HousePrice/train.csv")

In [3]:
train_data = data_raw.copy()

In [4]:
X = train_data.loc[:, ["GrLivArea", "YearBuilt"]]
t = train_data.loc[:, "SalePrice"]

In [5]:
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2)

# 【問題1】ブレンディングのスクラッチ実装
ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。精度があがるとは、検証用データに対する平均二乗誤差（MSE）が小さくなることを指します。

## 1回目

In [6]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
svr = SVR(gamma="scale")

In [7]:
lr.fit(X_train, t_train)
dt.fit(X_train, t_train)
svr.fit(X_train, t_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [8]:
y_lr = lr.predict(X_test)
y_dt = dt.predict(X_test)
y_svr = svr.predict(X_test)
blending_1 = (y_lr + y_dt + y_svr) / 3

In [9]:
print("LogisticRegression\n",mean_squared_error(t_test, y_lr))
print("DecisionTree\n", mean_squared_error(t_test, y_dt))
print("SVR\n", mean_squared_error(t_test, y_svr))
print("Blend\n",mean_squared_error(t_test, blending_1))

LogisticRegression
 2561363471.90957
DecisionTree
 2945653447.2063355
SVR
 7407184312.81181
Blend
 2579522278.771746


## 2回目

In [10]:
lr2 = LinearRegression(normalize = True)
dt2 = DecisionTreeRegressor(max_depth=5)
svr2 = SVR(gamma="scale", kernel="linear")

In [11]:
lr2.fit(X_train, t_train)
dt2.fit(X_train, t_train)
svr2.fit(X_train, t_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
y_lr2 = lr2.predict(X_test)
y_dt2 = dt2.predict(X_test)
y_svr2 = svr2.predict(X_test)
blending_2 = ((y_lr2) + (y_dt2) + (y_svr2)) / 3

In [13]:
print(mean_squared_error(t_test, y_lr2))
print(mean_squared_error(t_test, y_dt2))
print(mean_squared_error(t_test, y_svr2))
print(mean_squared_error(t_test, blending_2))

2561363471.9095693
2141235228.9211636
2588348328.6793923
2291204529.8539367


## 3回目

In [14]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
lr3_sc = LinearRegression()
dt3_sc = DecisionTreeRegressor(max_depth=3)
svr3_sc = SVR(gamma="scale", kernel="linear")

In [16]:
lr3 = LinearRegression()
dt3 = DecisionTreeRegressor(max_depth=3)
svr3 = SVR(gamma="scale", kernel="linear")

In [17]:
lr3_sc.fit(X_train_sc, t_train)
dt3_sc.fit(X_train_sc, t_train)
svr3_sc.fit(X_train_sc, t_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [18]:
lr3.fit(X_train, t_train)
dt3.fit(X_train, t_train)
svr3.fit(X_train, t_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [19]:
y_lr3_sc = lr3_sc.predict(X_test) 
y_dt3_sc = dt3_sc.predict(X_test) 
y_svr3_sc = svr3_sc.predict(X_test)

blending_3 = ((y_lr3_sc*0.0001) + (y_dt3_sc*0.9) + (y_svr3_sc*0.0009)) / 3

In [20]:
print(mean_squared_error(t_test, y_lr3_sc))
print(mean_squared_error(t_test, y_dt3_sc))
print(mean_squared_error(t_test, y_svr3_sc))

print(mean_squared_error(t_test, blending_3))

2.0457844510081444e+16
42345536250.3215
5533269334162.9795
11416854524.569424


# 【問題2】バギングのスクラッチ実装
バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [49]:
class datasplit:
    """
    データを上から分割し、リストに格納するクラス
    最後は余りを含めているので、最後だけ数が異なる可能性がある
    """
    def __init__(self, split_num):
        # 分割数を入力
        self.split_num = split_num
        
    def X(self, data):
        split_point =  int(np.round(data.shape[0]/self.split_num))
        data_list = []
        j = 1
        for i in range(self.split_num):
            if i ==(self.split_num-1):
                data_list.append(data[split_point*i:, :])
            else:
                data_list.append(data[split_point*i:split_point*j, :])
                j+= 1
        return data_list
    
    def t(self, data):
        split_point =  int(np.round(data.shape[0]/self.split_num))
        data_list = []
        j = 1
        for i in range(self.split_num):
            if i ==(self.split_num-1):
                data_list.append(data[split_point*i:])
            else:
                data_list.append(data[split_point*i:split_point*j])
                j+= 1
        return data_list
        

In [51]:
def make_model_list(model, num):
    """
    入力されたモデルを格納したリストを返す
    """
    model_list = []
    for i in range(num):
        model_list.append(model)
        
    return model_list

In [21]:
X_train_b, X_test_b, t_train_b, t_test_b = train_test_split(X, t, test_size=0.2, shuffle=True)

In [26]:
ds_baggin = datasplit(4)

In [83]:
#Xのデータを分割
X_baggin_list = ds_baggin.X(X_train_b.values)

In [84]:
#tのデータを分割
t_bagging_list = ds_baggin.t(t_train_b.values)

In [82]:
#モデルを作成
bagging_model_list = make_model_list( DecisionTreeRegressor(max_depth=5), 4)

In [41]:
bagging_pred_list = []
for i in range(4):
    bagging_model_list[i].fit(X_baggin_list[i], t_bagging_list[i])
    bagging_pred_list.append(bagging_model_list[i].predict(X_test_b))

In [46]:
baging_1 = (bagging_pred_list[0]+bagging_pred_list[1]+bagging_pred_list[2]+bagging_pred_list[3]) / 4

In [48]:
print(mean_squared_error(t_test_b, bagging_pred_list[0]))
print(mean_squared_error(t_test_b, bagging_pred_list[1]))
print(mean_squared_error(t_test_b,bagging_pred_list[2]))
print(mean_squared_error(t_test_b, bagging_pred_list[3]))

print(mean_squared_error(t_test_b, baging_1))

1923088782.7716286
3738272611.348503
5777680571.288019
1879587226.7646046
1884330742.2319853


# 【問題3】スタッキングのスクラッチ実装
スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

## クラス、関数

In [50]:
def dataconcate(data_list):
    """
    分割されたデータのリスト受け取り、それぞれ組み合わせたデータを返す
    """
    dataset_list = []
    for i in range(len(data_list)):
        for j in range(len(data_list)):
            if i==j or i>j:
                pass
            else:
                dataset_list.append(np.concatenate([data_list[i], data_list[j]], axis=0))
    return dataset_list

In [85]:
def training(model_list, X_list, X_dataset, t_dataset):
    """
    データの組み合わせごとに学習および、推定を行う
    学習後のモデルと推定値のリストを返す
    """
    predict_list = []
    for i in range(len(model_list)):
        model_list[i].fit(X_dataset[i], t_dataset[i])
        predict_list.append(model_list[i].predict(X_list[-i]))
    return model_list, predict_list

In [86]:
def predict(model_list, X_test):
    """
    モデルのリストを受け取り、テストデータで推定し、推定値のリストを返す
    """
    predict_list = []
    for i in range(len(model_list)):
        predict_list.append(model_list[i].predict(X_test))
    return predict_list

## training part

### データを分割

In [54]:
ds = datasplit(3)

In [56]:
X_list = ds.X(X_train.values)

In [57]:
t_list= ds.t(t_train.values)

### 分割データの組み合わせ

In [58]:
X_dataset = dataconcate(X_list)

In [59]:
t_dataset = dataconcate(t_list)

###  モデルのリストを作成

In [60]:
ML1_DT = make_model_list( DecisionTreeRegressor(max_depth=5), 3)
ML2_DT =make_model_list( DecisionTreeRegressor(max_depth=5), 3)
ML1_SVR = make_model_list(SVR(gamma="scale"), 3)
ML2_SVR = make_model_list(SVR(gamma="scale"), 3)

### 最初の学習

In [61]:
ML1_DT, predict1_DT = training(ML1_DT, X_list, X_dataset, t_dataset)
ML1_SVR, predict1_SVR = training(ML1_SVR, X_list, X_dataset, t_dataset)

### 最初のブレンドデータ

In [62]:
blend_data_DT = np.concatenate([predict1_DT[0], predict1_DT[1], predict1_DT[2]])
blend_data_SVR = np.concatenate([predict1_SVR[0], predict1_SVR[1], predict1_SVR[2]])

In [63]:
blend_data_1 = np.stack([blend_data_DT , blend_data_SVR], 1)
blend_data_1.shape

(1168, 2)

### ブレンドデータの分割

In [64]:
ds = datasplit(3)

In [65]:
X_list = ds.X(blend_data_1)

### 分割データの組み合わせ

In [66]:
X_dataset = dataconcate(X_list)

### 2回目の学習

In [67]:
ML2_DT, predict2_DT = training(ML2_DT, X_list, X_dataset, t_dataset)
ML2_SVR, predict2_SVR = training(ML2_SVR, X_list, X_dataset, t_dataset)

### 二つ目のブレンドデータ

In [68]:
blend_data_DT = np.concatenate([predict2_DT[0], predict2_DT[1], predict2_DT[2]])
blend_data_SVR = np.concatenate([predict2_SVR[0], predict2_SVR[1], predict2_SVR[2]])
blend_data_2 = np.stack([blend_data_DT , blend_data_SVR], 1)
blend_data_2.shape

(1168, 2)

### 最後の学習

In [69]:
last_SVR = SVR(gamma="scale")
last_SVR.fit(blend_data_2, t_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## predict part

### 最初の推定

In [71]:
pred_DT1 = predict(ML1_DT, X_test.values)
pred_SVR1 = predict(ML1_SVR , X_test.values)

### 最初のブレンドデータ

In [72]:
blend_test_DT1 = np.stack([pred_DT1[0], pred_DT1[1], pred_DT1[2]], 1)
blend_test_SVR1 = np.stack([pred_SVR1[0], pred_SVR1[1], pred_SVR1[2]], 1)

In [73]:
blend_test_DT1 = np.average(blend_test_DT1 , 1)
blend_test_SVR1 = np.average(blend_test_SVR1, 1)

In [74]:
blend_test1 = np.stack([blend_test_DT1, blend_test_SVR1], 1)

### 2回目の推定

In [75]:
pred_DT2 = predict(ML2_DT, blend_test1)
pred_SVR2 = predict(ML2_SVR, blend_test1)

### 二つ目のブレンドデータ

In [79]:
blend_test_DT2 = np.stack([pred_DT2[0], pred_DT2[1], pred_DT2[2]], 1)
blend_test_SVR2 = np.stack([pred_SVR2[0], pred_SVR2[1], pred_SVR2[2]], 1)
blend_test_DT2 = np.average(blend_test_DT2 , 1)
blend_test_SVR2 = np.average(blend_test_SVR2, 1)
blend_test2 = np.stack([blend_test_DT2, blend_test_SVR2], 1)

### 最後の推定

In [80]:
y = last_SVR.predict(blend_test2)

In [81]:
print(mean_squared_error(t_test, y))

7418121257.329934
