In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import pickle

In [2]:
# Read the housing dataset (path is relative to the notebook's working directory).
house_price = pd.read_csv(filepath_or_buffer="Models/housing-deployment-reg.csv")

In [3]:
# Split the frame into the target column and the remaining predictor columns,
# then hold out 15% of the rows as a test set.
y = house_price["SalePrice"]
X = house_price.drop(columns="SalePrice")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=8,  # fixed seed so the same split is produced on every run
)

In [4]:
# Three-stage pipeline: impute missing values -> standardise -> KNN regressor.
# make_pipeline names each step after its lowercased class name
# ('simpleimputer', 'standardscaler', 'kneighborsregressor'); the search grid
# addresses hyperparameters through those names.
pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
    KNeighborsRegressor(),
)

In [5]:
# Hyperparameter grid for the search. Each keyword is '<step name>__<parameter>',
# where the step name is the one make_pipeline assigned to that stage.
pipe_params = dict(
    simpleimputer__strategy=['median', 'mean'],
    standardscaler__with_mean=[True, False],
    kneighborsregressor__n_neighbors=range(1, 50),  # k = 1 .. 49
    kneighborsregressor__weights=['uniform', 'distance'],
    kneighborsregressor__p=[1, 2],
    kneighborsregressor__algorithm=['ball_tree', 'kd_tree', 'brute'],
)

In [6]:
# Exhaustive search over every combination in pipe_params, scoring each one
# with 5-fold cross-validation on the training split only.
trained_pipe = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)
# Kept as the cell's last expression so the fitted search object is displayed.
trained_pipe.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('simpleimputer',
                                        SimpleImputer(strategy='median')),
                                       ('standardscaler', StandardScaler()),
                                       ('kneighborsregressor',
                                        KNeighborsRegressor())]),
             param_grid={'kneighborsregressor__algorithm': ['ball_tree',
                                                            'kd_tree',
                                                            'brute'],
                         'kneighborsregressor__n_neighbors': range(1, 50),
                         'kneighborsregressor__p': [1, 2],
                         'kneighborsregressor__weights': ['uniform',
                                                          'distance'],
                         'simpleimputer__strategy': ['median', 'mean'],
                         'standardscaler__with_mean': [True, False]})

In [8]:
# Show the hyperparameter combination the grid search selected as best.
trained_pipe.best_params_

{'kneighborsregressor__algorithm': 'ball_tree',
 'kneighborsregressor__n_neighbors': 8,
 'kneighborsregressor__p': 1,
 'kneighborsregressor__weights': 'distance',
 'simpleimputer__strategy': 'median',
 'standardscaler__with_mean': False}

In [9]:
# Show the cross-validated score achieved by the selected combination.
# NOTE(review): no `scoring` was passed to GridSearchCV, so this is presumably
# the estimator's default score (R^2 for a regressor) — confirm.
trained_pipe.best_score_

0.6578685128263719

In [16]:
# In-sample check: R^2 of the selected model on the same rows it was fitted on.
# This is an optimistic figure; the held-out test score is the one to trust.
y_train_pred = trained_pipe.predict(X_train)
r2 = r2_score(y_train, y_train_pred)
print(r2)

0.9997984008019197


In [17]:
# Out-of-sample check: R^2 of the selected model on the held-out test rows.
y_test_pred = trained_pipe.predict(X_test)
r2 = r2_score(y_test, y_test_pred)
print(r2)

0.7222877652002777


In [18]:
# Store the trained pipeline: serialise the fitted search object to disk.
# The `with` block guarantees the file is flushed and closed even if pickling
# raises, instead of leaving the handle open until garbage collection.
with open(file='Models/trained_pipe_knn.sav', mode='wb') as model_file:
    pickle.dump(trained_pipe, model_file)

In [19]:
# Load model: read the pickled search object back from disk.
# The `with` block closes the file handle as soon as loading finishes.
# SECURITY: unpickling can execute arbitrary code — only load files that were
# produced by a trusted source (here, the file written by this notebook).
with open('Models/trained_pipe_knn.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [24]:
# One made-up house, built as a single-row frame, used to try out the reloaded model.
# NOTE(review): the columns are assumed to match the training features
# (names and order) — confirm against the CSV.
new_house = pd.DataFrame([
    {
        'LotArea': 5000,
        'TotalBsmtSF': 1000,
        'BedroomAbvGr': 5,
        'GarageCars': 3,
    },
])

In [25]:
# Predict the sale price of the made-up house with the model reloaded from disk.
loaded_model.predict(X=new_house)

array([188572.43369144])