In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Read the cleaned dataset from disk, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='dataset/cleaned_dataset.csv')
df.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,view,grade,sqft_above,sqft_basement,lat,sqft_living15
0,221900.0,3,1.0,1180,0,7,1180,0,47.5112,1340
1,538000.0,3,2.25,2570,0,7,2170,400,47.721,1690
2,180000.0,2,1.0,770,0,6,770,0,47.7379,2720
3,604000.0,4,3.0,1960,0,7,1050,910,47.5208,1360
4,510000.0,3,2.0,1680,0,8,1680,0,47.6168,1800


In [3]:
# Features are every column except the target; the target is the 'price' column.
X = df.drop(columns='price')
y = df.loc[:, 'price']

In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

# Report the size of each piece of the split.
print(f'X train shape {X_train.shape}')
print(f'X test shape {X_test.shape}')
print(f'y train shape {y_train.shape}')
print(f'y test shape {y_test.shape}')

X train shape (17290, 9)
X test shape (4323, 9)
y train shape (17290,)
y test shape (4323,)


In [5]:
# pipeline
# Each candidate is a two-step pipeline: standardise the features, then fit a regressor.
# The tree-based estimators use randomness internally, so they are seeded with the
# same value as the train/test split (101) to make repeated runs give the same result.
model1 = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
model2 = Pipeline([('scaler', StandardScaler()), ('model', DecisionTreeRegressor(random_state=101))])
model3 = Pipeline([('scaler', StandardScaler()), ('model', RandomForestRegressor(random_state=101))])
model4 = Pipeline([('scaler', StandardScaler()), ('model', GradientBoostingRegressor(random_state=101))])

In [6]:
# hyperparameter
# One search grid per pipeline. The 'model__' prefix addresses the pipeline step named 'model'.
param1 = dict(model__fit_intercept=[True, False])
param2 = dict(model__max_depth=list(range(2, 11)))
param3 = dict(model__n_estimators=list(range(100, 501, 100)), model__max_depth=[10, 20, 30])
# NOTE(review): depths of 50 and 100 look very deep for boosted trees - confirm this is intended.
param4 = dict(model__n_estimators=list(range(100, 501, 100)), model__max_depth=[10, 50, 100])

In [7]:
# gridsearch
# Wrap every pipeline in a 5-fold cross-validated grid search scored by R^2.
grid_model1 = GridSearchCV(estimator=model1, param_grid=param1, cv=5, scoring='r2')
grid_model2 = GridSearchCV(estimator=model2, param_grid=param2, cv=5, scoring='r2')
grid_model3 = GridSearchCV(estimator=model3, param_grid=param3, cv=5, scoring='r2')
grid_model4 = GridSearchCV(estimator=model4, param_grid=param4, cv=5, scoring='r2')

In [8]:
# Fit the four grid searches on the training split, one after another.
# train model 1
grid_model1.fit(X=X_train, y=y_train)
# train model 2
grid_model2.fit(X=X_train, y=y_train)
# train model 3
grid_model3.fit(X=X_train, y=y_train)
# train model 4 (last expression, so its fitted search object is the cell output)
grid_model4.fit(X=X_train, y=y_train)

In [None]:
# evaluate models 1-4
# For each fitted search, print the best cross-validation score and parameters,
# then the R^2 on the held-out test split. GridSearchCV.score applies the
# configured scorer ('r2') to the refitted best estimator, so the test figure is
# directly comparable with the cross-validation figure.
for idx, search in enumerate((grid_model1, grid_model2, grid_model3, grid_model4), start=1):
    print(f'Best score model {idx}: {search.best_score_}')
    print(f'Best param model {idx}: {search.best_params_}')
    print(f'Test score model {idx}: {search.score(X_test, y_test)}')

In [None]:
# save all model
joblib.dump(grid_model1, 'model/model1.joblib')
joblib.dump(grid_model2, 'model/model2.joblib')
joblib.dump(grid_model3, 'model/model3.joblib')
joblib.dump(grid_model4, 'model/model4.joblib')

In [None]:
# Show the feature column names as one comma-separated string.
", ".join(X_test.columns)

In [None]:
from joblib import load
def create_df(bedrooms, bathrooms, sqft_living, view, grade, sqft_above, sqft_basement, lat, sqft_living15):
    """Build a one-row DataFrame from individual feature values.

    Every argument becomes a column with the same name as the parameter,
    in the order of the signature, holding that single value.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly one row and nine columns.
    """
    features = (
        ('bedrooms', bedrooms),
        ('bathrooms', bathrooms),
        ('sqft_living', sqft_living),
        ('view', view),
        ('grade', grade),
        ('sqft_above', sqft_above),
        ('sqft_basement', sqft_basement),
        ('lat', lat),
        ('sqft_living15', sqft_living15),
    )
    # Wrap each scalar in a one-element list so pandas creates a single row.
    return pd.DataFrame({column: [value] for column, value in features})

def predict_price(df, model_path='model/model3.joblib'):
    """Predict with a saved model and return the prediction for the first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows passed unchanged to the loaded model's ``predict``.
    model_path : str, optional
        Path of the joblib file to load. Defaults to ``'model/model3.joblib'``,
        so existing single-argument calls behave as before.

    Returns
    -------
    The first element of ``model.predict(df)``.
    """
    # NOTE: joblib files are pickle-based; only load model files from a trusted source.
    model = load(model_path)
    return model.predict(df)[0]

# test
# Use a dedicated name instead of `df`, so the dataset DataFrame loaded at the
# top of the notebook is not overwritten and earlier cells can still be re-run.
sample_house = create_df(3, 2, 1180, 0, 7, 1180, 0, 47.5112, 1340)
predict_price(sample_house)