In [1]:
import pandas as pd
import numpy as np 
from geopy.distance import geodesic 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

In [17]:
# Read the test-set feature file and discard the 'Unnamed: 0' column that the
# CSV carries alongside the real features.
data = (
    pd.read_csv('../Phase-2-Project/Data/kc_house_data_test_features.csv')
    .drop(columns=['Unnamed: 0'])
)
data.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [18]:
# Load the pickled model object from model.pickle.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts from a trusted source.
with open("model.pickle", 'rb') as infile:
    model = pickle.load(infile)

In [19]:
# Show the intercept and the number of coefficients of the loaded model.
# A bare expression is only displayed when it is the last line of a cell, so
# the intercept is printed explicitly instead of being silently discarded.
print(model.intercept_)
print(len(model.coef_))

100


In [5]:
# Load the pickled auxiliary object from other_info.pickle; a later cell reads
# element 3 of it as the list of selected feature columns.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts from a trusted source.
with open("other_info.pickle", 'rb') as infile:
    other_info = pickle.load(infile)

In [6]:
# Upper caps: five standard deviations above the mean of each column.
# NOTE(review): these thresholds are computed from the frame being scored;
# confirm they should not instead be the thresholds used when the model was fit.
above_5std_living, above_5std_bathrooms, above_5std_bedrooms = (
    data[name].mean() + (5 * data[name].std())
    for name in ('sqft_living', 'bathrooms', 'bedrooms')
)

# Living area above the cap is replaced by the cap itself.
data['sqft_living'] = np.where(
    data['sqft_living'] > above_5std_living,
    above_5std_living,
    data['sqft_living'],
)

# Room counts above the cap are replaced by the cap rounded to a whole number,
# and a count of exactly 0 is replaced by .5 (presumably so the
# bedrooms/bathrooms ratio computed later stays finite -- TODO confirm).
for room_col, room_cap in (
    ('bathrooms', above_5std_bathrooms),
    ('bedrooms', above_5std_bedrooms),
):
    capped = np.where(data[room_col] > room_cap, int(round(room_cap)), data[room_col])
    data[room_col] = np.where(capped == 0, .5, capped)

In [7]:
# Pair each row's latitude and longitude into a (lat, long) tuple.
data['coordinates'] = list(zip(data['lat'], data['long']))

def distance_amazon(column):
    """Return the geodesic distance in miles from a point to the Amazon reference point.

    Parameters
    ----------
    column : a coordinate accepted by ``geodesic``; callers in this notebook
        pass a (lat, long) tuple.

    Returns
    -------
    The ``.miles`` value of the geodesic distance to (47.62246, -122.336775).
    """
    amazon_point = (47.62246, -122.336775)
    return geodesic(column, amazon_point).miles
def distance_tmobile(column):
    """Return the geodesic distance in miles from a point to the T-Mobile reference point.

    Parameters
    ----------
    column : a coordinate accepted by ``geodesic``; callers in this notebook
        pass a (lat, long) tuple.

    Returns
    -------
    The ``.miles`` value of the geodesic distance to (47.57879, -122.16547).
    """
    tmobile_point = (47.57879, -122.16547)
    return geodesic(column, tmobile_point).miles
def distance_sb(column):
    """Return the geodesic distance in miles from a point to the "sb" reference point.

    NOTE(review): "sb" presumably stands for Starbucks -- confirm.

    Parameters
    ----------
    column : a coordinate accepted by ``geodesic``; callers in this notebook
        pass a (lat, long) tuple.

    Returns
    -------
    The ``.miles`` value of the geodesic distance to (47.581002, -122.335898).
    """
    sb_point = (47.581002, -122.335898)
    return geodesic(column, sb_point).miles

# Add one distance-in-miles column per reference point, computed row by row
# from the (lat, long) tuples in 'coordinates'.
for feature_name, distance_fn in (
    ('dist_sb', distance_sb),
    ('dist_tmobile', distance_tmobile),
    ('dist_amazon', distance_amazon),
):
    data[feature_name] = data['coordinates'].map(distance_fn)

In [8]:
# Bedrooms per bathroom for every row.
data['bed_bath_ratio'] = data['bedrooms'] / data['bathrooms']
# Age relative to 2021, i.e. 2021 - yr_built (rsub reverses the operands).
data['age'] = data['yr_built'].rsub(2021)

In [9]:
# Columns fed to the model, and the subset of them that is one-hot encoded.
model_features = [
    'sqft_living', 'dist_amazon', 'dist_tmobile', 'grade',
    'waterfront', 'condition', 'zipcode', 'bathrooms',
]
categorical_features = ['grade', 'condition', 'zipcode', 'bathrooms']

model_df = data[model_features]
# drop_first=True omits the first level of each encoded column.
# NOTE(review): the levels (and so the dropped one) are derived from this frame
# alone; verify they line up with the columns the model was fitted on.
dummy_df = pd.get_dummies(model_df, columns=categorical_features, drop_first=True)
dummy_df

Unnamed: 0,sqft_living,dist_amazon,dist_tmobile,waterfront,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,...,bathrooms_3.5,bathrooms_3.75,bathrooms_4.0,bathrooms_4.25,bathrooms_4.5,bathrooms_4.75,bathrooms_5.0,bathrooms_5.25,bathrooms_5.5,bathrooms_6.0
0,2270.0,7.459291,9.655763,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2270.0,7.459291,9.655763,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1470.0,16.666389,8.129364,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1280.0,14.924836,12.152074,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2830.0,13.159606,6.499828,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4317,1530.0,5.326028,11.847779,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4318,2310.0,7.810530,10.326595,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4319,1020.0,2.621569,6.333584,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4320,1600.0,13.914581,5.451235,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Expand the dummy-encoded frame to every degree-2 term (original columns,
# squares and pairwise products), without a constant bias column.
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(dummy_df)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out and fall back only on older releases so the
# cell runs on either side of the removal; both yield names such as
# "a b" and "a^2" built from the input column names.
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = poly_2.get_feature_names_out(dummy_df.columns)
else:
    poly2_columns = poly_2.get_feature_names(dummy_df.columns)
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)
df_poly2.head()

Unnamed: 0,sqft_living,dist_amazon,dist_tmobile,waterfront,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,...,bathrooms_5.0^2,bathrooms_5.0 bathrooms_5.25,bathrooms_5.0 bathrooms_5.5,bathrooms_5.0 bathrooms_6.0,bathrooms_5.25^2,bathrooms_5.25 bathrooms_5.5,bathrooms_5.25 bathrooms_6.0,bathrooms_5.5^2,bathrooms_5.5 bathrooms_6.0,bathrooms_6.0^2
0,2270.0,7.459291,9.655763,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2270.0,7.459291,9.655763,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1470.0,16.666389,8.129364,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1280.0,14.924836,12.152074,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2830.0,13.159606,6.499828,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Element 3 of other_info, materialised as a list; it is used below to select
# columns of df_poly2 for prediction.
final_columns = [*other_info[3]]

In [12]:
# Number of selected columns; the recorded output (100) matches the number of
# model coefficients printed earlier.
len(final_columns)

100

In [21]:
# Restrict the polynomial frame to the selected columns, in that order, and
# run the loaded model on it.
selected_features = df_poly2.loc[:, final_columns]
final_answers = model.predict(selected_features)

In [22]:
# Wrap the predictions in a DataFrame; no column name is given, so the single
# column is labelled 0.
final = pd.DataFrame(final_answers)

In [23]:
# Summary statistics of the predictions.
# NOTE(review): the recorded output shows a negative minimum prediction;
# confirm whether that is acceptable for the submitted values.
final.describe()

Unnamed: 0,0
count,4322.0
mean,576346.8
std,341996.0
min,-12510.51
25%,374526.7
50%,502276.8
75%,667253.9
max,4063966.0


In [16]:
# Write the predictions to a CSV file in the working directory. to_csv is
# called with its defaults, so the row index is written alongside column 0.
final.to_csv('housing_preds_ryan_lewis.csv')