In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


training_data = pd.read_csv('data/housing-deployment-reg.csv')

In [3]:
training_data

Unnamed: 0,LotArea,TotalBsmtSF,BedroomAbvGr,GarageCars,SalePrice
0,8450,856,3,2,208500
1,9600,1262,3,2,181500
2,11250,920,3,2,223500
3,9550,756,3,3,140000
4,14260,1145,4,3,250000
...,...,...,...,...,...
1455,7917,953,3,2,175000
1456,13175,1542,3,2,210000
1457,9042,1152,4,1,266500
1458,9717,1078,2,1,142125


In [4]:

X = training_data.drop(columns=['SalePrice'])
y = training_data['SalePrice']


# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)



# building the pipeline
X_cat = X_train.select_dtypes(exclude="number").copy()
X_num = X_train.select_dtypes(include="number").copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

full_pipeline = make_pipeline(preprocessor, 
                              LinearRegression())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"]
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)
 

print(
        f"""
        MSE: {mean_squared_error(search.predict(X_test), y_test)}
        RMSE: {mean_squared_error(search.predict(X_test), y_test)**0.5}
        MAE: {mean_absolute_error(search.predict(X_test), y_test)}
        R2 Score: {r2_score(search.predict(X_test), y_test)}
        """
    )

Fitting 5 folds for each of 2 candidates, totalling 10 fits

        MSE: 2738168858.1664343
        RMSE: 52327.515306638
        MAE: 34609.50046546005
        R2 Score: 0.18682300351615422
        


In [6]:
# store the trained pipeline
import pickle
pickle.dump(search, 
            open(file='deploy_model/trained_pipe_lr.sav', mode='wb')
           )