# 事前準備

In [1]:
import pandas as pd
full_df = pd.read_csv('train.csv')
target = full_df['SalePrice']
train_df = full_df[['GrLivArea', 'YearBuilt']]

In [2]:
from sklearn.model_selection import train_test_split

X_80, X_20, y_80, y_20 = train_test_split(train_df.values, target.values, test_size=0.2, random_state=0)

# 【問題1】ブレンディングのスクラッチ実装
ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。精度があがるとは、検証データに対する平均二乗誤差（MSE）が小さくなることを指します。

## 単独モデル

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

reg_N = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_80, y_80)
mean_squared_error(reg_N.predict(X_20), y_20)

2942123469.9794154

## 3つのモデルの平均

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

reg_1 = LinearRegression().fit(X_80, y_80)
reg_2 = RandomForestRegressor().fit(X_80, y_80)
reg_3 = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_80, y_80)

pred_sum = (reg_1.predict(X_20) + reg_2.predict(X_20) + reg_3.predict(X_20)) / 3
mean_squared_error(pred_sum, y_20)

2353557550.3474035

## ３つのハイパーパラメータ

In [5]:
reg_1 = ElasticNet(alpha=1.0, l1_ratio=0.3).fit(X_80, y_80)
reg_2 = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_80, y_80)
reg_3 = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_80, y_80)

pred_sum = (reg_1.predict(X_20) + reg_2.predict(X_20) + reg_3.predict(X_20)) / 3
mean_squared_error(pred_sum, y_20)

2942113991.3477826

## 3つの前処理

In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# 対数変換
log_y_80 = np.log(y_80)
log_y_20 = np.log(y_20)
reg_1 = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_80, log_y_80)

# 正則化
scaler = StandardScaler()
scaler_X_80 = scaler.fit_transform(X_80)
scaler_X_20 = scaler.transform(X_20)
reg_2 = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(scaler_X_80, y_80)

# 01正則化
m_scaler = MinMaxScaler()
m_scaler_X_80 = scaler.fit_transform(X_80)
m_scaler_X_20 = scaler.transform(X_20)
reg_3 = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(m_scaler_X_80, y_80)

pred_sum = (np.exp(reg_1.predict(X_20)) + reg_2.predict(scaler_X_20) + reg_3.predict(m_scaler_X_20)) / 3
mean_squared_error(pred_sum, y_20)

3823407077.4509144

# 【問題2】バギングのスクラッチ実装
バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

# 単一モデルの場合

In [7]:
from sklearn.tree import DecisionTreeRegressor

reg_N = DecisionTreeRegressor(random_state=0).fit(X_80, y_80)
mean_squared_error(reg_N.predict(X_20), y_20)

3009170128.186454

# バギングの場合

In [8]:
from sklearn.model_selection import ShuffleSplit

n_splits=10

rs = ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=42)
sum_error = 0

# 全体の80%のデータを使用する
for train_index, test_index in rs.split(X_80):    
    reg_bag = DecisionTreeRegressor(random_state=0).fit(X_80[train_index], y_80[train_index])
    sum_error += mean_squared_error(reg_bag.predict(X_20), y_20)
sum_error / n_splits

2793127047.909028

# 【問題3】スタッキングのスクラッチ実装
スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

## データ分割

In [9]:
# 3分割
k = 3
train_X3, train_X, train_y3, train_y = train_test_split(X_80, y_80, test_size=(2/k), random_state=0)
train_X1, train_X2, train_y1, train_y2 = train_test_split(train_X, train_y, test_size=(1/ (k - 1)), random_state=0)

In [10]:
train_X2.shape

(390, 2)

In [11]:
train_X1.shape, train_X3.shape

((389, 2), (389, 2))

In [12]:
# 桁合わせ
train_X2 = np.delete(train_X2, 0 ,0)
train_y2 = np.delete(train_y2, 0 ,0)

In [13]:
train_X1.shape, train_X2.shape, train_X3.shape 

((389, 2), (389, 2), (389, 2))

In [14]:
# データを組み合わせる
train_X12 = np.concatenate([train_X1, train_X2])
train_X23 = np.concatenate([train_X2, train_X3])
train_X13 = np.concatenate([train_X1, train_X3])

train_y12 = np.concatenate([train_y1.reshape(-1, 1), train_y2.reshape(-1, 1)])
train_y23 = np.concatenate([train_y2.reshape(-1, 1), train_y3.reshape(-1, 1)])
train_y13 = np.concatenate([train_y1.reshape(-1, 1), train_y3.reshape(-1, 1)])

In [15]:
train_X13.shape

(778, 2)

## 学習

In [16]:
dtr_1 = DecisionTreeRegressor().fit(train_X23, train_y23)
dtr_2 = DecisionTreeRegressor().fit(train_X13, train_y13)
dtr_3 = DecisionTreeRegressor().fit(train_X12, train_y12)

dtr_pred_1 = dtr_1.predict(train_X1).reshape(-1, 1)
dtr_pred_2 = dtr_2.predict(train_X2).reshape(-1, 1)
dtr_pred_3 = dtr_3.predict(train_X3).reshape(-1, 1)

dtr_pred = np.concatenate([dtr_pred_1, dtr_pred_2, dtr_pred_3])

In [17]:
lr_1 = LinearRegression().fit(train_X23, train_y23)
lr_2 = LinearRegression().fit(train_X13, train_y13)
lr_3 = LinearRegression().fit(train_X12, train_y12)

lr_pred_1 = lr_1.predict(train_X1)
lr_pred_2 = lr_2.predict(train_X2)
lr_pred_3 = lr_3.predict(train_X3)

lr_pred = np.concatenate([lr_pred_1, lr_pred_2, lr_pred_3])

In [18]:
# 2次元の特徴量として使用、ブレンド
new_X_train = np.hstack([dtr_pred, lr_pred])

In [19]:
new_X_train.shape

(1167, 2)

In [20]:
# 最終学習
meta_model = LinearRegression().fit(new_X_train, np.concatenate([train_y1, train_y2, train_y3]))

## 推定

In [21]:
dtr_test_pred_1 = dtr_1.predict(X_20).reshape(-1, 1)
dtr_test_pred_2 = dtr_2.predict(X_20).reshape(-1, 1)
dtr_test_pred_3 = dtr_3.predict(X_20).reshape(-1, 1)

lr_test_pred_1 = lr_1.predict(X_20)
lr_test_pred_2 = lr_2.predict(X_20)
lr_test_pred_3 = lr_3.predict(X_20)

In [22]:
dtr_test_pred = (dtr_test_pred_1 + dtr_test_pred_2 + dtr_test_pred_3) / 3
lr_test_pred = (lr_test_pred_1 + lr_test_pred_2 + lr_test_pred_3) / 3

# 2次元の特徴量として使用、ブレンド
new_X_test = np.hstack([dtr_test_pred, lr_test_pred])

In [23]:
# 最終推定
meta_pred = meta_model.predict(new_X_test)

## 評価

In [24]:
reg_N = DecisionTreeRegressor(random_state=0).fit(X_80, y_80)
mean_squared_error(reg_N.predict(X_20), y_20)

3009170128.186454

In [25]:
mean_squared_error(meta_pred, y_20)

2415406187.090605