In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
# Load the processed training split from disk.
strat_train_set = pd.read_csv("../../data/processed/train.csv")

In [3]:
# Split the training frame into the target column and the predictor columns.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [4]:
# Everything except the `ocean_proximity` column is passed to the imputer below.
housing_num = housing.drop(columns="ocean_proximity")

In [5]:
# Imputer configured with the "median" strategy; it is fitted in the next cell.
imputer = SimpleImputer(strategy="median")

In [6]:
# Fit the imputer on the training columns held in `housing_num`.
# Left as the last expression so the fitted estimator is echoed as cell output.
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [7]:
# Persist the fitted imputer to the artifacts directory.
# The context manager guarantees the file handle is flushed and closed
# (even if pickling raises) instead of leaving it to garbage collection.
with open('../../artifacts/imputer.pkl', 'wb') as imputer_file:
    pickle.dump(imputer, imputer_file)

In [8]:
# file = open('../../artifacts/imputer.pkl','rb')
# imputer = pickle.load(file)

In [9]:
# Apply the fitted imputer to the same columns it was fitted on.
X = imputer.transform(housing_num)

In [10]:
# Wrap the imputed values back into a DataFrame, reusing the column names of
# `housing_num` and the row index of `housing` so later joins line up by index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)

In [11]:
# Derived ratio columns: new column name -> (numerator column, denominator column).
# Dict order is the order in which the columns are appended to `housing_tr`.
ratio_features = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    housing_tr[ratio_name] = housing_tr[numerator] / housing_tr[denominator]

In [12]:
# One-hot encode `ocean_proximity` (first level dropped) and attach the
# indicator columns to the numeric features by index.
housing_cat = housing[["ocean_proximity"]]
ocean_dummies = pd.get_dummies(housing_cat, drop_first=True)
housing_prepared = housing_tr.join(ocean_dummies)

In [13]:
# housing_prepared

In [14]:
# Search space, expressed as two independent sub-grids.
# 3 x 4 = 12 combinations with the estimator's default `bootstrap` setting.
default_grid = {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]}
# 2 x 3 = 6 combinations with bootstrapping switched off.
no_bootstrap_grid = {
    "bootstrap": [False],
    "n_estimators": [3, 10],
    "max_features": [2, 3, 4],
}
param_grid = [default_grid, no_bootstrap_grid]

# Fixed seed so repeated runs build the same forests.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation over 18 candidates: (12 + 6) * 5 = 90 training rounds.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [15]:
# Hyperparameter combination the grid search reports as best.
best_param = grid_search.best_params_

In [16]:
# Estimator the grid search exposes as its best candidate; this is the object
# that gets pickled further down.
final_model = grid_search.best_estimator_

In [17]:
# NOTE(review): `get_params` is referenced here without being called, so the
# cell displays the bound-method repr (which embeds the estimator's repr)
# rather than a parameter dict. Use `final_model.get_params()` for the dict.
final_model.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)>

In [18]:
# Persist the selected model to the artifacts directory.
# The context manager guarantees the file handle is flushed and closed
# (even if pickling raises) instead of leaving it to garbage collection.
with open('../../artifacts/final_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [19]:
# file = open('../../artifacts/final_model.pkl','rb')
# final_model = pickle.load(file)