# Sprint アンサンブル学習

### データセット

In [2]:
path = '/Users/toyodasatomi/Desktop/DIC/data/train.csv'
df = pd.read_csv(path)

この中のtrain.csvをダウンロードし、目的変数としてSalePrice、説明変数として、GrLivAreaとYearBuiltを使います。

train.csvを学習用（train）8割、検証用（val）2割に分割してください。

In [4]:
display(df.head())

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
#特徴量選択
gy = df.loc[:,["GrLivArea","YearBuilt"]]
sp = df.loc[:,["SalePrice"]]

In [13]:
#テストデータに分割
#gy.info()
#sp.info()
X_train, X_test, y_train, y_test = train_test_split(gy,sp,test_size=0.2,shuffle = True)

In [75]:
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
y_train.shape
y_test.shape

(292,)

### インポート

In [1]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.metrics import mean_squared_error

In [57]:
from sklearn.model_selection import KFold

### scikit-learn

単一のモデルはスクラッチ実装ではなく、scikit-learnなどのライブラリの使用を推奨します。

sklearn.linear_model.LinearRegression — scikit-learn 0.21.3 documentation

sklearn.svm.SVR — scikit-learn 0.21.3 documentation

sklearn.tree.DecisionTreeRegressor — scikit-learn 0.21.3 documentation

## 【問題1】ブレンディングのスクラッチ実装

### 単純に1モデルずつで予測

#### 線形回帰

In [14]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [76]:
#学習
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [77]:
#予測
y_pred_lr= lr.predict(X_test)

In [78]:
lr.score(X_test,y_test)

0.6568307208382459

In [79]:
#評価
mean_squared_error(y_test,y_pred_lr)

2176501878.620357

#### SVM

In [23]:
from sklearn.svm import SVR

In [81]:
svr = SVR()
#学習
svr.fit(X_train,y_train)
#予測
y_pred_svm= svr.predict(X_test)
#評価
mean_squared_error(y_test,y_pred_svm)



6715165408.278716

#### 決定木

In [35]:
from sklearn.tree import DecisionTreeRegressor

In [82]:
dt = DecisionTreeRegressor()
#学習
dt.fit(X_train,y_train)
#予測
y_pred_dt= dt.predict(X_test)
#評価
mean_squared_error(y_test,y_pred_dt)

3068734108.6328006

####   単純な予測の結果
線形回帰(2176501878.620357)  
  →決定木(3160579974.5951295)  
      →SVM(6715165408.278716)  

### 3モデルの結果の平均で予測値出してみる

In [90]:
y_mean_pre = np.sum((y_pred_lr,y_pred_svm,y_pred_dt),axis=0)/3

In [93]:
y_mean_one = np.mean((y_pred_lr,y_pred_svm,y_pred_dt),axis=0)

In [96]:
#print(y_mean_pre)

In [97]:
#print(y_mean_one)

In [83]:
y_pred_lr.shape

(292,)

In [91]:
y_mean_pre.shape

(292,)

In [92]:
mean_squared_error(y_test,y_mean_pre)

2409067357.6095343

In [98]:
mean_squared_error(y_test,y_mean_one)

2409067357.6095343

**単純に3モデルをブレンドすると決定木、SVM単体よりはスコアは良くなったが、線形回帰単体のほうがスコアは良い**

### 2モデルの組み合わせで出してみる

In [100]:
#線形回帰と決定木
y_mean_Ir_dt = np.mean((y_pred_lr,y_pred_dt),axis=0)
mean_squared_error(y_test,y_mean_Ir_dt)

2154372195.3661146

In [101]:
#線形回帰とSVM
y_mean_Ir_svm = np.mean((y_pred_lr,y_pred_svm),axis=0)
mean_squared_error(y_test,y_mean_Ir_svm)

3269458099.1266503

In [102]:
#決定木とSVM
y_mean_svm_dt = np.mean((y_pred_svm,y_pred_dt),axis=0)
mean_squared_error(y_test,y_mean_svm_dt)

2986671609.011656

   **スコアの上位2モデルをブレンディングすると良いスコアになった**

### 重み変えて掛け合わせてみる

In [114]:
y_mean_change = np.sum((y_pred_lr*0.6,y_pred_svm*0.1,y_pred_dt*0.3),axis=0)
mean_squared_error(y_test,y_mean_change)

2049937417.2716074

In [115]:
y_mean_change = np.sum((y_pred_lr*0.6,y_pred_svm*0.2,y_pred_dt*0.2),axis=0)
mean_squared_error(y_test,y_mean_change)

2168418282.2034535

**重みを調整してブレンディングすることで、一番良いスコアが出た**

## 【問題2】バギングのスクラッチ実装

#### ブーストトラップサンプルデータセット

In [213]:
#1
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_train,y_train,test_size=0.2,shuffle = True)
#2
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train,y_train,test_size=0.2,shuffle = True)
#3
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_train,y_train,test_size=0.2,shuffle = True)
#4
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_train,y_train,test_size=0.2,shuffle = True)
#5
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_train,y_train,test_size=0.2,shuffle = True)

#### 学習と予測

In [226]:
#1
dt1 = DecisionTreeRegressor()
#学習
dt1.fit(X_train_1,y_train_1)
#予測
#y_pred_dt_1= dt1.predict(X_test_1)
y_pred_dt_1= dt1.predict(X_test)

#2
dt2 = DecisionTreeRegressor()
#学習
dt2.fit(X_train_2,y_train_2)
#予測
y_pred_dt_2= dt2.predict(X_test)

#3
dt3 = DecisionTreeRegressor()
#学習
dt3.fit(X_train_3,y_train_3)
#予測
y_pred_dt_3= dt3.predict(X_test)

#4
dt4 = DecisionTreeRegressor()
#学習
dt4.fit(X_train_4,y_train_4)
#予測
y_pred_dt_4= dt4.predict(X_test)

#5
dt5 = DecisionTreeRegressor()
#学習
dt5.fit(X_train_5,y_train_5)
#予測
y_pred_dt_5= dt5.predict(X_test)

#### 評価

In [227]:
mean_squared_error(y_test,y_pred_dt_1)

3405081071.7636986

In [228]:
mean_squared_error(y_test,y_pred_dt_2)

3285087372.8184934

In [229]:
mean_squared_error(y_test,y_pred_dt_3)

2455001658.5753427

In [230]:
mean_squared_error(y_test,y_pred_dt_4)

3436707170.495054

In [231]:
mean_squared_error(y_test,y_pred_dt_5)

2219298962.4836373

#### バギング

In [252]:
#決定木5種類
y_mean_dt55 = np.mean((y_pred_dt_1,y_pred_dt_2,y_pred_dt_3,y_pred_dt_4,y_pred_dt_5),axis=0)
mean_squared_error(y_test,y_mean_dt55)

2017192833.0241709

**決定木の組み合わせで一番良いスコアが出た**

## 【問題3】スタッキングのスクラッチ実装

#### データセット

X_train, X_test, y_train, y_test 

In [130]:
kf = KFold(n_splits=3, random_state=None, shuffle=True)

In [143]:
a,b,c=kf.split(X_train,y_train)

In [145]:
print(a[0].shape)

(778,)


#### インデックスからデータ作成

In [154]:
df_X_train=pd.DataFrame(X_train)
df_y_train=pd.DataFrame(y_train)

In [157]:
#[a]の分割
X_train_a = df_X_train.iloc[a[0]]
y_train_a = df_y_train.iloc[a[0]]
X_test_a = df_X_train.iloc[a[1]]
y_test_a = df_y_train.iloc[a[1]]

#[b]の分割
X_train_b = df_X_train.iloc[b[0]]
y_train_b = df_y_train.iloc[b[0]]
X_test_b = df_X_train.iloc[b[1]]
y_test_b = df_y_train.iloc[b[1]]

#[a]の分割
X_train_c = df_X_train.iloc[c[0]]
y_train_c = df_y_train.iloc[c[0]]
X_test_c = df_X_train.iloc[c[1]]
y_test_c = df_y_train.iloc[c[1]]

In [153]:
X_train_a = df_X_train.iloc[a[0]]
display(X_train_a.head())

Unnamed: 0,GrLivArea,YearBuilt
1004,1504,2005
1216,1902,1978
204,1284,1947
38,1057,1953
442,1573,1930


#### インスタンス作成

##### とりあえず、パラメータは指定せず

In [156]:
#線形回帰
lr_a = LinearRegression()
lr_b = LinearRegression()
lr_c = LinearRegression()

#SVM
svr_a = SVR()
svr_b = SVR()
svr_c = SVR()

#決定木
dt_a = DecisionTreeRegressor()
dt_b = DecisionTreeRegressor()
dt_c = DecisionTreeRegressor()

#### 学習

In [159]:
#学習
lr_a.fit(X_train_a,y_train_a)
lr_b.fit(X_train_b,y_train_b)
lr_c.fit(X_train_c,y_train_c)

svr_a.fit(X_train_a,y_train_a)
svr_b.fit(X_train_b,y_train_b)
svr_c.fit(X_train_c,y_train_c)

dt_a.fit(X_train_a,y_train_a)
dt_b.fit(X_train_b,y_train_b)
dt_c.fit(X_train_c,y_train_c)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

#### 予測

In [161]:
#線形回帰
y_pred_lr_a= lr_a.predict(X_test_a)
y_pred_lr_b= lr_a.predict(X_test_b)
y_pred_lr_c= lr_a.predict(X_test_c)

In [177]:
lr_abc = np.concatenate([y_pred_lr_a,y_pred_lr_b,y_pred_lr_c],axis = 0)

In [165]:
y_pred_lr_b.shape

(389, 1)

In [174]:
xxx.shape

(1168, 1)

In [176]:
X_train.shape

(1168, 2)

In [180]:
#SVM
y_pred_svr_a= svr_a.predict(X_test_a)
y_pred_svr_b= svr_a.predict(X_test_b)
y_pred_svr_c= svr_a.predict(X_test_c)

In [181]:
svr_abc = np.concatenate([y_pred_svr_a,y_pred_svr_b,y_pred_svr_c],axis = 0)

In [182]:
#決定木
y_pred_dt_a= dt_a.predict(X_test_a)
y_pred_dt_b= dt_a.predict(X_test_b)
y_pred_dt_c= dt_a.predict(X_test_c)

In [183]:
dt_abc = np.concatenate([y_pred_dt_a,y_pred_dt_b,y_pred_dt_c],axis = 0)

#### 予測結果を新たな変数に

In [196]:
new_abc = np.vstack((lr_abc_new,svr_abc,dt_abc))

In [204]:
#これが新しい変数
print(new_abc.T.shape)
new_train_abc=new_abc.T
print(new_train_abc.shape)
print(new_train_abc.shape)

(1168, 3)
(1168, 3)


In [211]:
#これが新しい変数
new_test_y_abc = np.vstack((y_test_a,y_test_b,y_test_c))
print(new_test_y_abc.shape)

(1168, 1)


In [186]:
lr_abc.shape

(1168, 1)

In [190]:
lr_abc_new = np.ravel(lr_abc)

In [191]:
lr_abc_new.shape

(1168,)

In [187]:
svr_abc.shape

(1168,)

In [188]:
dt_abc.shape

(1168,)

#### 新しい変数で学習

In [205]:
#インスタンス化
#線形回帰
lr_new = LinearRegression()
#SVM
svr_new = SVR()
#決定木
dt_new = DecisionTreeRegressor()

In [212]:
#学習 線形回帰
lr_new.fit(new_train_abc,new_test_y_abc)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [253]:
##学習 SVM
svr_new.fit(new_train_abc,new_test_y_abc)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [254]:
##学習 決定木
dt_new.fit(new_train_abc,new_test_y_abc)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

#### 推論

In [233]:
#各インスタンスで推論
#線形回帰
y_pred_lr_a= lr_a.predict(X_test)
y_pred_lr_b= lr_a.predict(X_test)
y_pred_lr_c= lr_a.predict(X_test)

#SVM
y_pred_svr_a= svr_a.predict(X_test)
y_pred_svr_b= svr_a.predict(X_test)
y_pred_svr_c= svr_a.predict(X_test)

#決定木
y_pred_dt_a= dt_a.predict(X_test)
y_pred_dt_b= dt_a.predict(X_test)
y_pred_dt_c= dt_a.predict(X_test)

In [235]:
#推論結果の平均取る
y_pred_lr_mean = np.mean((y_pred_lr_a,y_pred_lr_b,y_pred_lr_c),axis=0)
y_pred_svr_mean = np.mean((y_pred_svr_a,y_pred_svr_b,y_pred_svr_c),axis=0)
y_pred_dt_mean = np.mean((y_pred_dt_a,y_pred_dt_b,y_pred_dt_c),axis=0)


In [247]:
#新しい変数にする
new_test_abc = np.vstack((y_pred_lr_mean_new,y_pred_svr_mean,y_pred_dt_mean))

In [None]:
y_pred_lr_mean_new = np.ravel(y_pred_lr_mean)

In [248]:
print(new_test_abc.shape)

(3, 292)


In [249]:
#これ
y_pred_lr_mean_new_abc=new_test_abc.T
print(y_pred_lr_mean_new_abc.shape)

(292, 3)


In [255]:
#予測
y_pred_lr_new= lr_new.predict(y_pred_lr_mean_new_abc)

In [256]:
#評価
mean_squared_error(y_test,y_pred_lr_new)

2404945193.531452

**線形回帰単体実行と比べるとスコアが下がった**

In [257]:
#予測
y_pred_svr_new= svr_new.predict(y_pred_lr_mean_new_abc)

In [258]:
#評価
mean_squared_error(y_test,y_pred_svr_new)

6715175862.0

**SVMはスコアがかなり下がってしまった**

In [259]:
#予測
y_pred_dt_new= dt_new.predict(y_pred_lr_mean_new_abc)

In [260]:
#評価
mean_squared_error(y_test,y_pred_dt_new)

2776344418.8999243

**決定木もスコアが下がった**

#### モデルを2種類だけにしてみる
元のスコアが良い、決定木と線形回帰の組み合わせで

In [261]:
#学習用データ
new_abc_lr_dt = np.vstack((lr_abc_new,dt_abc))
new_train_abc_lr_dt=new_abc_lr_dt.T

In [None]:
#これが新しい変数
new_test_y_abc = np.vstack((y_test_a,y_test_b,y_test_c))
print(new_test_y_abc.shape)

In [262]:
#学習
#線形回帰
lr_new_2 = LinearRegression()

#学習 線形回帰
lr_new_2.fit(new_train_abc_lr_dt,new_test_y_abc)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [265]:
new_train_abc_lr_dt.shape

(1168, 2)

In [267]:
#予測用の新しい変数にする
new_test_abc_lr_dt = np.vstack((y_pred_lr_mean_new,y_pred_dt_mean))
#これ
y_pred_lr_mean_new_abc_lr_dt=new_test_abc_lr_dt.T

In [268]:
y_pred_lr_mean_new_abc_lr_dt.shape

(292, 2)

In [269]:
#予測
y_pred_lr_new_lr_dt= lr_new_2.predict(y_pred_lr_mean_new_abc_lr_dt)

In [270]:
#評価
mean_squared_error(y_test,y_pred_lr_new_lr_dt)

2422718760.9372406

**決定木とSVMで**

In [273]:
#学習用データ
new_abc_svr_dt = np.vstack((svr_abc,dt_abc))
new_train_abc_svr_dt=new_abc_svr_dt.T

In [274]:
#学習
#線形回帰
svr_new_2 = SVR()

#学習 線形回帰
svr_new_2.fit(new_train_abc_svr_dt,new_test_y_abc)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [276]:
#予測用の新しい変数にする
new_test_abc_svr_dt = np.vstack((y_pred_svr_mean,y_pred_dt_mean))
#これ
y_pred_lr_mean_new_abc_svr_dt=new_test_abc_svr_dt.T


In [277]:
#予測
y_pred_lr_new_svr_dt= svr_new_2.predict(y_pred_lr_mean_new_abc_svr_dt)
#評価
mean_squared_error(y_test,y_pred_lr_new_svr_dt)


6714897333.843539