## hold out

In [34]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

df = sns.load_dataset('tips')
y_col = 'tip'
X = df.drop(columns=[y_col])

# 数値のカラムのリストだけ抽出
numeric_cols = X.select_dtypes(include=np.number).columns.to_list()
X = pd.get_dummies(X, drop_first=True)
y = df[y_col]

# 学習データとテストデータの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 標準化(数値のカラムだけ)
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])


# テストデータの標準化
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# 線形回帰モデル学習
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# 精度評価（MSE）
mean_squared_error(y_test, y_pred) # np.mean(np.square(y_test - y_pred))


0.955080898861715

## LOOCV(Leave One Out Cross Validation)

In [46]:
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# データ準備
X = df['total_bill'].values.reshape(-1, 1)
y = df['tip']

loo = LeaveOneOut()
model = LinearRegression()

mse_list = []
# ジェネレータを返す
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # model学習
    model.fit(X_train, y_train)
    # テストデータの予測
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [49]:
print(np.mean(mse_list))
print(np.std(mse_list))

1.0675673489857438
2.0997944551776313


In [56]:
from sklearn.model_selection import cross_val_score
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')

print(-np.mean(scores))
print(-np.std(scores))

1.0675673489857438
-2.0997944551776313


## k-Fold CV(K-Fold Cross Validation)

In [66]:
from sklearn.model_selection import KFold

k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [65]:
print(np.mean(mse_list))
print(np.std(mse_list))

1.0802110883943914
0.16170100507039514


In [70]:
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
print(-np.mean(scores))
print(np.std(scores))

1.080211088394392
0.1617010050703952


### Repeated K-Fold

In [73]:
from sklearn.model_selection import RepeatedKFold
n_repeats = 3
cv = RepeatedKFold(n_splits=k, n_repeats=n_repeats, random_state=0)

scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
print(-np.mean(scores))
print(np.std(scores))

1.0746387233165984
0.26517178540898434


## Pipline

### Pipeline + KFold

In [83]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('scaler', StandardScaler()),('model', LinearRegression())])

cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=cv)

In [84]:
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])