# Capstone Project - Predicting House Prices in KING COUNTY                                                     

### The purpose of this program is to prepare data readiness to make new predictions by retrieving model from the disk

### The data columns that require to make new predictions are

In [12]:
%matplotlib inline
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [13]:
# Load the HOUSE Dataset
sample = pd.read_csv("sample.csv")

In [14]:
sample.shape

(2000, 13)

In [15]:
sample.dtypes

cid                 int64
dayhours           object
price               int64
living_measure      int64
lot_measure         int64
coast               int64
sight               int64
ceil_measure        int64
yr_built            int64
zipcode             int64
lat               float64
long              float64
furnished           int64
dtype: object

# Data readiness to make new predictions

### Create new column representing Premium house if the cost of house (> 1128000)

In [16]:
sample['Premium_House']   = ''
rec_count = sample.shape[0]

for i in range(rec_count):
    if  (sample['price'][i] >= 1128000):
        sample['Premium_House'][i] = 1
    else:
        sample['Premium_House'][i] = 0

### Extract Year and Month from the column dayhours, add to the database and then drop the attribute dayhours

In [17]:
sample['dayhours']    = sample['dayhours'].astype(str)
sample['sold_year']   = sample['dayhours'].str[:4].astype(np.int64)
sample['sold_month']  = sample['dayhours'].str[4:6].astype(np.int64)
sample.drop('dayhours', axis = 1 , inplace= True)

### Calculate the age of the property and age after renovation before sold year

In [18]:
sample['house_age']   = (sample['sold_year'] - sample['yr_built']).astype(np.int64) 

### Check for duplicates based on the feature "cid"

In [19]:
sample= sample.drop_duplicates(['cid'])

In [20]:
sample = sample.drop(['cid', 'sold_year','sold_month'], axis=1) 
sample = sample.drop(['yr_built'], axis=1)
#sample = sample.drop(['yr_renovated'], axis=1)

In [21]:
# Considering only sight = 0 columns and dropping rest of them as model doesn't require sight = 1,2,3,4

Hot_encoding  = pd.get_dummies(sample['sight'])
Hot_encoding  = Hot_encoding[0]
sample        = sample.drop(['sight'], axis=1)
sample['sight 0'] = Hot_encoding[0].astype(np.int64)

In [22]:
# change the sequence of the columns
sample = sample[['house_age', 'Premium_House','living_measure', 'lot_measure', 'coast', 'ceil_measure', 
                   'zipcode', 'lat', 'long','furnished', 'sight 0','price']]

### break the sample data to predictor data and label 

In [23]:
sample_data,sample_label =sample.iloc[:,:-1],sample.iloc[:,-1]

# End of Data readiness to make new predictions

# Load the Model from the drive

In [24]:
from sklearn.externals import joblib
filename = 'finalized_model.sav'

In [25]:
loaded_model = joblib.load(filename)

In [26]:
Pred = loaded_model.predict(sample_data)

In [27]:
print("RMSE      : %.2f" % np.sqrt(mean_squared_error(sample_label, Pred))) 
print('R-Squared : %.2f' % r2_score(sample_label, Pred))  

RMSE      : 88012.55
R-Squared : 0.95


# End of Program