In [12]:
# load the housing data set into a DataFrame
import pandas as pd
# NOTE(review): the preview below shows an 'Unnamed: 0' column holding 0, 1, 2, ...
# which looks like a row index saved with the CSV - confirm whether it should be
# read as the index (index_col=0) rather than as a data column.
housing = pd.read_csv(filepath_or_buffer='housing-deployment-reg.csv')

In [13]:
# preview the first rows to check which columns were loaded
housing.head()

Unnamed: 0,LotArea,TotalBsmtSF,BedroomAbvGr,GarageCars,SalePrice
0,8450,856,3,2,208500
1,9600,1262,3,2,181500
2,11250,920,3,2,223500
3,9550,756,3,3,140000
4,14260,1145,4,3,250000


In [15]:

# imports used by this cell
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# train test split
# 'Unnamed: 0' holds the row number that was written into the CSV (0, 1, 2, ... in
# the preview). It says nothing about a house, and a distance-based model such as
# KNN would treat it like any other feature, so it is removed from X.
# errors='ignore' keeps the cell working when the column is not present.
X = (housing
     .drop(columns='SalePrice')
     .drop(columns='Unnamed: 0', errors='ignore'))
y = housing["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=8)

# pipeline: impute missing values -> scale -> k-nearest-neighbours regression
pipe = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        KNeighborsRegressor())

# parameter grid for pipeline (keys are '<lowercased step class name>__<parameter>')
# NOTE(review): 'algorithm' only selects how neighbours are searched and 'with_mean'
# only shifts every feature by a constant, so neither should change KNN predictions;
# they multiply the number of fits - consider trimming them from the grid.
pipe_params = {
    'simpleimputer__strategy': ['median', 'mean'],
    'standardscaler__with_mean': [True, False],
    'kneighborsregressor__n_neighbors': list(range(1, 20)),
    'kneighborsregressor__weights': ['uniform', 'distance'],
    'kneighborsregressor__p': [1, 2],
    'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute']}

# grid search with 5-fold cross-validation on the training data only
trained_pipe = GridSearchCV(pipe,
                            pipe_params,
                            cv=5)
trained_pipe.fit(X_train, y_train)

# R^2 score on the held-out test set (this is a regression, so not an accuracy)
y_pred = trained_pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(r2)

0.7223140457388908


In [21]:
# store the trained pipeline
import pickle

# NOTE(review): absolute, machine-specific path - this cell only runs on a machine
# that has this folder; consider a path relative to the project instead.
model_path = 'C:\\Users\\rahma\\OneDrive\\Desktop\\Bootcamp\\Section 7 - Supervised Machine learning\\models/trained_pipe_knn.sav'

# The with-block closes the file as soon as the dump finishes, so the pickle is
# flushed to disk instead of depending on the open handle being garbage-collected.
with open(file=model_path, mode='wb') as model_file:
    pickle.dump(trained_pipe, model_file)