In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor
from warnings import filterwarnings
import pandas as pd 
import numpy as np

In [11]:
filterwarnings('ignore')

# Prepare

In [12]:
# Data
df = pd.read_csv("../datas/Hitters.csv")
df = df.dropna()

# Dummie
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1)
X = pd.concat([X, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)

# Test Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

# Model

In [13]:
bag_model = BaggingRegressor(bootstrap_features = True).fit(X_train, y_train)

# Predict

In [17]:
y_pred = bag_model.predict(X_test)

for i in range(10):
    second_y_pred = bag_model.estimators_[i].predict(X_test)
    print(np.sqrt(mean_squared_error(y_test, second_y_pred)))

print(np.sqrt(mean_squared_error(y_test, y_pred)))

748.1108175424179
486.09713821611206
556.9767651783695
599.3956696894799
569.6958706001507
564.501921053244
625.6558321934285
617.9204335315618
626.1800747099129
626.7045225139752
350.60862480045023


# Model Tuning

In [15]:
# Cross Validition
bag_model = BaggingRegressor(bootstrap_features = True).fit(X_train, y_train)
bag_params = {"n_estimators":range(2,20)}
bag_model_cv = GridSearchCV(bag_model, bag_params, cv=10).fit(X_train, y_train)

# Tuned Model
bag_tuned = BaggingRegressor(bootstrap_features = True, 
                             n_estimators=pd.Series(bag_model_cv.best_params_)[0], 
                             random_state=45).fit(X_train, y_train)

# Test Error
y_pred = bag_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

350.70135653356607