In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# 【問題1】ブレンディングのスクラッチ実装

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [5]:
from keras import models
from keras import layers
from keras import optimizers

Using TensorFlow backend.


In [0]:
df = pd.read_csv('/content/drive/My Drive/train.csv')

In [0]:
train = df[['GrLivArea', 'YearBuilt']]
test = np.log(df['SalePrice'])

In [0]:
X_train, X_val, y_train, y_val = train_test_split(train, test, train_size=0.8, random_state=0)

In [0]:
ss = StandardScaler()

In [10]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [0]:
X_train = ss.transform(X_train)
X_val = ss.transform(X_val)

In [0]:
def blending_mean_compute_mse(models, X_val, val):
    predict_array = np.zeros([X_val.shape[0], len(models)])
    for i, model in enumerate(models):
        y_pred = model.predict(X_val)
        predict_array[:, i] = y_pred.flatten()
    y_pred = np.mean(predict_array, axis=1)
    mse = mean_squared_error(y_val, y_pred)
    
    return mse

In [0]:
def compute_mse(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    
    return mse

## 線形回帰とSVMと決定木の結果を平均するブレンディングを行う

In [15]:
lr = LinearRegression()
compute_mse(lr, X_train, y_train, X_val, y_val)

0.05730354643047342

In [16]:
svr = SVR()
compute_mse(svr, X_train, y_train, X_val, y_val)

0.037334430784131276

In [17]:
dtr = DecisionTreeRegressor()
compute_mse(dtr, X_train, y_train, X_val, y_val)

0.07560855182971471

In [18]:
blending_mean_compute_mse([lr, svr], X_val, y_val)

0.04177057821620026

In [19]:
blending_mean_compute_mse([lr, dtr], X_val, y_val)

0.05344813212487635

In [20]:
blending_mean_compute_mse([svr, dtr], X_val, y_val)

0.04444868705452393

## 線形回帰と決定木をblendingすることでスコアが改善した。

## モデルが異なる方がよいとのことなのでニューラルネットワークを使う

In [0]:
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(2,)))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

In [24]:
model.fit(X_train, y_train, batch_size=2, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x7f5897d39a90>

In [0]:
y_pred_nn = model.predict(X_val)

In [26]:
mean_squared_error(y_val, y_pred_nn)

0.05252956149323265

In [27]:
blending_mean_compute_mse([lr, model], X_val, y_val)

0.054030767261114845

In [28]:
blending_mean_compute_mse([svr, model], X_val, y_val)

0.04014395906406692

In [29]:
blending_mean_compute_mse([dtr, model], X_val, y_val)

0.05206289136289062

## 決定木とニューラルネットワークでblendingすることでスコアが改善された。

## 決定木のrandom_stateを変えたものをblendingしてみる。

In [0]:
dtr_0 = DecisionTreeRegressor(random_state=0)
dtr_1 = DecisionTreeRegressor(random_state=1)
dtr_2 = DecisionTreeRegressor(random_state=2)

In [33]:
dtr_0.fit(X_train, y_train)
dtr_1.fit(X_train, y_train)
dtr_2.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2, splitter='best')

In [0]:
y_pred_dtr_0 = dtr_0.predict(X_val)
y_pred_dtr_1 = dtr_1.predict(X_val)
y_pred_dtr_2 = dtr_2.predict(X_val)

In [35]:
print(mean_squared_error(y_val, y_pred_dtr_0))
print(mean_squared_error(y_val, y_pred_dtr_1))
print(mean_squared_error(y_val, y_pred_dtr_2))

0.07500705427483274
0.07318031851080452
0.07581118123447245


In [36]:
blending_mean_compute_mse([dtr_0, dtr_1, dtr_2], X_val, y_val)

0.07357241926016397

## 悪い結果となった

## SVMのCを1,10,100に変えてblendingする

In [39]:
svr_rbf_C_1 = SVR(C=1)
svr_rbf_C_1.fit(X_train, y_train)
y_pred_svr_rbf_C_1 = svr_rbf_C_1.predict(X_val)
mean_squared_error(y_val, y_pred_svr_rbf_C_1)

0.037334430784131276

In [40]:
svr_rbf_C_10 = SVR(C=10)
svr_rbf_C_10.fit(X_train, y_train)
y_pred_svr_rbf_C_10 = svr_rbf_C_10.predict(X_val)
mean_squared_error(y_val, y_pred_svr_rbf_C_10)

0.0359508129963376

In [41]:
svr_rbf_C_100 = SVR(C=100)
svr_rbf_C_100.fit(X_train, y_train)
y_pred_svr_rbf_C_100 = svr_rbf_C_100.predict(X_val)
mean_squared_error(y_val, y_pred_svr_rbf_C_100)

0.03498448289133197

In [42]:
y_pred_svr_blending = (y_pred_svr_rbf_C_1 + y_pred_svr_rbf_C_10 + y_pred_svr_rbf_C_100)/3
mean_squared_error(y_val, y_pred_svr_blending)

0.035666337274877606

## 悪い結果となった

## ニューラルネットワークの層を増やしたものをblendingする。

In [0]:
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(2,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

In [47]:
model.fit(X_train, y_train, batch_size=2, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x7f5897c38668>

In [0]:
y_pred_nn_2 = model.predict(X_val)

In [49]:
print(mean_squared_error(y_val, y_pred_nn))
print(mean_squared_error(y_val, y_pred_nn_2))

0.05252956149323265
0.08875823898361623


In [0]:
y_pred_nn_blending = (y_pred_nn + y_pred_nn_2)/2

In [51]:
mean_squared_error(y_val, y_pred_nn_blending)

0.05995268036474417

## ４層と２層のニューラルネットワークをblendingすることでスコアが改善された。(再起動したらスコアが悪くなりました。)

# 【問題2】バギングのスクラッチ実装

## svmでtrainデータの７割ずつで学習を行い、最後に平均する。

In [55]:
X_tr, _, y_tr, _ = train_test_split(X_train, y_train, train_size=0.7, shuffle=True)
svr_0 = SVR()
svr_0.fit(X_tr, y_tr)
y_pred_svr_0 = svr_0.predict(X_val)
mean_squared_error(y_val, y_pred_svr_0)

0.03924901834813996

In [56]:
X_tr, _, y_tr, _ = train_test_split(X_train, y_train, train_size=0.7, shuffle=True)
svr_1 = SVR()
svr_1.fit(X_tr, y_tr)
y_pred_svr_1 = svr_1.predict(X_val)
mean_squared_error(y_val, y_pred_svr_1)

0.037514832344930304

In [57]:
X_tr, _, y_tr, _ = train_test_split(X_train, y_train, train_size=0.7, shuffle=True)
svr_2 = SVR()
svr_2.fit(X_tr, y_tr)
y_pred_svr_2 = svr_2.predict(X_val)
mean_squared_error(y_val, y_pred_svr_2)

0.03841772436371518

In [58]:
y_pred_svr_bagging = (y_pred_svr_0 + y_pred_svr_1 + y_pred_svr_2)/3
mean_squared_error(y_val, y_pred_svr_blending)

0.035666337274877606

## baggingすることで結果を改善することができた

# 【問題3】スタッキングのスクラッチ実装

## svrのハイパーパラメータを変更したものを使って予測値を計算し、それを使って、svmで推測する

In [0]:
X_train, X_val, y_train, y_val = train_test_split(train, test, train_size=0.8, random_state=0)
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_val = ss.transform(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [63]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_val)
mean_squared_error(y_val, y_pred_svr)

0.037334430784131276

In [0]:
kf = KFold(n_splits=3, shuffle=True, random_state=0)

In [0]:
svr_1 = SVR(C=1)
svr_10 = SVR(C=10)
model_list = list([svr_1, svr_10])
S_train = np.zeros([X_train.shape[0], len(model_list)])
S_test = np.zeros([y_val.shape[0], len(model_list)])
for i, model in enumerate(model_list):
  S_test_split = np.zeros([y_val.shape[0], 3]) 
  for j, (train_index, val_index) in enumerate(kf.split(X_train)):
    X_tr = X_train[train_index]
    y_tr = y_train[train_index]
    X_va = X_train[val_index]
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_va)
    S_train[val_index, i] = y_pred
    S_test_split[:, j] = model.predict(X_val)
  S_test[:, i] = np.mean(S_test_split, axis=1) 
  

In [66]:
svr_stacking = SVR()
svr_stacking.fit(S_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [0]:
y_pred_stacking = svr_stacking.predict(S_test)

In [68]:
mean_squared_error(y_val, y_pred_stacking)

0.03652644459019285

## stackingすることで0.0373から0.0365にスコアが改善された