In [1]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)

import datetime

import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set(style="whitegrid")

In [2]:


import pickle

# Load the previously fitted regression model from disk.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# pickles produced by a trusted source.
with open('regression_model.pickle', 'rb') as model_file:
    lm = pickle.load(model_file)

# Load the pickled `features` object saved alongside the model.
with open('features.pickle', "rb") as features_file:
    features = pickle.load(features_file)



In [3]:
def comma_be_gone_into_int(x):
    """Convert a number written with thousands separators into an ``int``.

    Parameters
    ----------
    x : str or int-like
        A string such as ``"1,234"``. Values that are not strings (for
        example integers that the CSV reader already parsed) are passed
        straight to ``int``.

    Returns
    -------
    int
        The numeric value with every comma removed.

    Raises
    ------
    ValueError
        If the text left after removing commas is not a valid integer.
    """
    # Only strings carry separators; calling .replace on an already-numeric
    # value would raise AttributeError.
    if isinstance(x, str):
        x = x.replace(',', '')
    return int(x)

def new_string(number):
    """Return the ``date`` value of one row of ``df`` formatted with dashes.

    Reads the module-level ``df``. The ``"T000000"`` suffix is removed from
    the row's ``date`` string and dashes are inserted after the 4th and 6th
    remaining characters (presumably turning ``YYYYMMDD`` into
    ``YYYY-MM-DD`` -- confirm against the input file).

    Parameters
    ----------
    number : int
        Zero-based row position within ``df``.

    Returns
    -------
    str
        The reformatted date string.
    """
    # The caller passes positions from range(len(df)), so look the row up by
    # position rather than by index label; a label lookup only works when the
    # index happens to be exactly 0..n-1.
    compact_date = df.date.iloc[number].replace("T000000", '')

    return compact_date[:4] + "-" + compact_date[4:6] + "-" + compact_date[6:]

def into_time(string):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    the expected format.
    """
    date_format = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(string, date_format)
    return parsed

In [4]:
# Read the holdout feature set that the pickled model will score.
df = pd.read_csv('kc_house_data_test_features.csv')

In [5]:


# Promote the CSV's unnamed leading column to the row index.
df = df.rename(columns={'Unnamed: 0': 'index'}).set_index('index')

# Per-zipcode income / population lookup table. Both numeric columns are run
# through comma_be_gone_into_int to obtain integers.
zip_money_df = pd.read_csv('Seatle_Income_Population.csv')
zip_money_df['income'] = zip_money_df['income'].map(comma_be_gone_into_int)
zip_money_df['population'] = zip_money_df['population'].map(comma_be_gone_into_int)

zip_codes = zip_money_df['zipcode'].values
zip_money_dict = dict(zip(zip_codes, zip_money_df['income'].values))
zip_population_dict = dict(zip(zip_codes, zip_money_df['population'].values))

# Fallback values for zipcodes that are missing from the lookup table.
# NOTE(review): the income mean skips the last three entries of the table
# while the population mean uses every entry -- confirm this is intentional.
average_money = round(np.mean(list(zip_money_dict.values())[:-3]))
average_population = round(np.mean(list(zip_population_dict.values())))

# Attach income and population to every house via its zipcode.
income = [zip_money_dict.get(code, average_money) for code in df['zipcode'].values]
df['income'] = np.array(income)

population = [zip_population_dict.get(code, average_population) for code in df['zipcode'].values]
df['population'] = np.array(population)

# Parse each row's sale date, then express the sale year as a distance from 2021.
df['sold_time'] = [into_time(new_string(position)) for position in range(len(df))]
sale_years = pd.DatetimeIndex(df['sold_time']).year
df['years_ago_sold'] = abs(sale_years - 2021)



In [6]:
# One-hot encode zipcode, view and condition, dropping the first level of each.
# NOTE(review): the dummy columns are derived from the values present in this
# file only; verify they line up with the columns the model was trained on.
df = pd.get_dummies(df, columns=['zipcode', 'view',  'condition'], drop_first=True)

In [7]:
def last_touched(row):
    """Add a ``yr_updated`` entry to ``row`` and return the same ``row``.

    ``yr_updated`` is 2016 minus the later of ``row['yr_built']`` and
    ``row['yr_renovated']``. The row is modified in place.

    NOTE(review): the reference year here is 2016 whereas other cells in this
    notebook subtract from 2021 -- confirm which one is intended.
    """
    most_recent_work = max(row['yr_built'], row['yr_renovated'])
    row['yr_updated'] = 2016 - most_recent_work
    return row

In [8]:

# Remove the raw date/identifier/coordinate columns before prediction.
unused_columns = ['sold_time', 'date', 'id', 'lat', 'long']
df = df.drop(columns=unused_columns)

In [9]:
# Years between construction and renovation; 0 when yr_renovated is 0.
# Computed column-wise so it does not depend on the index labels being
# exactly 0..n-1 (the previous per-row loop looked rows up by label using
# positional integers).
renovation_gap = df["yr_renovated"] - df["yr_built"]
yr_renovated_minus_yr_built = renovation_gap.where(df["yr_renovated"] != 0, 0).tolist()

df["renovated_after"] = yr_renovated_minus_yr_built

In [10]:
# Replace the construction year with its absolute distance from 2021.
# This overwrites yr_built, so re-running the cell changes the values again.
df['yr_built'] = (df['yr_built'] - 2021).abs()

Statistical analysis showed that the condition variable is categorical and has an effect on mean prices. I used one-hot encoding and dropped one of the dummy variables to avoid multicollinearity.

The model is pickled and ready to try on the holdout data. 

In [11]:
# Number of entries in the pickled `features` object.
len(features)

93

In [12]:
# Columns of the prepared frame, for comparison against `features`.
df.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'grade', 'sqft_above', 'sqft_basement', 'yr_built',
       'yr_renovated', 'sqft_living15', 'sqft_lot15', 'income', 'population',
       'years_ago_sold', 'zipcode_98002', 'zipcode_98003', 'zipcode_98004',
       'zipcode_98005', 'zipcode_98006', 'zipcode_98007', 'zipcode_98008',
       'zipcode_98010', 'zipcode_98011', 'zipcode_98014', 'zipcode_98019',
       'zipcode_98022', 'zipcode_98023', 'zipcode_98024', 'zipcode_98027',
       'zipcode_98028', 'zipcode_98029', 'zipcode_98030', 'zipcode_98031',
       'zipcode_98032', 'zipcode_98033', 'zipcode_98034', 'zipcode_98038',
       'zipcode_98039', 'zipcode_98040', 'zipcode_98042', 'zipcode_98045',
       'zipcode_98052', 'zipcode_98053', 'zipcode_98055', 'zipcode_98056',
       'zipcode_98058', 'zipcode_98059', 'zipcode_98065', 'zipcode_98070',
       'zipcode_98072', 'zipcode_98074', 'zipcode_98075', 'zipcode_98077',
       'zipcode_98092', 'zipc

In [14]:

# Score the prepared holdout frame with the loaded model.
# NOTE(review): `features` is loaded earlier but the visible cells never use it
# to select or order df's columns -- confirm df matches the training columns.
final_answer = lm.predict(df)

In [15]:
# Wrap the predictions in a DataFrame so they can be written out as CSV.
final_answer_df = pd.DataFrame(final_answer)

In [16]:
# Write the predictions to disk (the default integer index is included).
final_answer_df.to_csv('housing_preds_Silverman.csv')