In [58]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)

import datetime

import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set(style="whitegrid")

In [59]:
# Read the test-feature CSV and promote its unnamed first column to the row index
# (named 'index'), building the frame in a single chained expression.
df = (
    pd.read_csv('kc_house_data_test_features.csv')
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index('index')
)

In [60]:
def comma_be_gone_into_int(x):
    """Convert a number written with thousands separators into an int.

    Parameters
    ----------
    x : str or int
        A value such as ``"1,234,567"``. Values that are already integers
        (e.g. when the CSV parser found no separators in the column) are
        accepted as well.

    Returns
    -------
    int
        The numeric value with every comma removed.

    Raises
    ------
    ValueError
        If the text left after removing commas is not a valid integer.
    """
    # str() lets already-numeric cells through instead of failing on a
    # missing .replace attribute.
    return int(str(x).replace(',', ''))

def new_string(number):
    """Return the sale date of the row at position ``number`` as ``YYYY-MM-DD``.

    Reads the module-level ``df`` and takes its ``date`` value, strips the
    ``"T000000"`` suffix and inserts dashes after the 4th and 6th characters.

    Parameters
    ----------
    number : int
        Zero-based row position in ``df`` (callers iterate ``range(len(df))``).

    Returns
    -------
    str
        The reformatted date string.
    """
    # .iloc makes the lookup positional; plain df.date[number] would be a
    # label lookup against the frame's custom index.
    raw_date = df.date.iloc[number].replace("T000000", '')

    return raw_date[:4] + "-" + raw_date[4:6] + "-" + raw_date[6:]

def into_time(string):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    that layout.
    """
    date_layout = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(string, date_layout)
    return parsed

In [61]:
# Year that sale dates and build years are measured against.
REFERENCE_YEAR = 2021

# --- Zip-code lookup table: income and population ---------------------------
zip_money_df = pd.read_csv('Seatle_Income_Population.csv')

# Both columns are passed through comma_be_gone_into_int (commas stripped, cast to int).
for count_column in ('income', 'population'):
    zip_money_df[count_column] = zip_money_df[count_column].map(comma_be_gone_into_int)

zip_money_dict = dict(zip(zip_money_df.zipcode.values, zip_money_df.income.values))
zip_population_dict = dict(zip(zip_money_df.zipcode.values, zip_money_df.population.values))

# Fallbacks for zip codes that are missing from the lookup table.
# NOTE(review): the income average skips the last three table entries while the
# population average uses all of them; the reason is not visible here - confirm.
average_money = round(np.mean(list(zip_money_dict.values())[:-3]))
average_population = round(np.mean(list(zip_population_dict.values())))

df['income'] = np.array(
    [zip_money_dict.get(zipcode, average_money) for zipcode in df.zipcode.values]
)
df['population'] = np.array(
    [zip_population_dict.get(zipcode, average_population) for zipcode in df.zipcode.values]
)

# --- Sale date -> distance in years from REFERENCE_YEAR ---------------------
# Only the year is kept, so the parsed timestamps never need to become a column.
sold_time = [into_time(new_string(position)) for position in range(len(df))]
df['years_ago_sold'] = abs(pd.DatetimeIndex(sold_time).year - REFERENCE_YEAR)

df = df.drop(['date', 'id', 'lat', 'long', "waterfront", 'grade', 'view', 'zipcode'], axis = 1)

# --- Renovation gap and building age ----------------------------------------
# 0 when yr_renovated is 0, otherwise yr_renovated - yr_built. Computed
# column-wise, and before yr_built is overwritten below.
df["renovated_after"] = np.where(df.yr_renovated == 0, 0, df.yr_renovated - df.yr_built)

df['yr_built'] = abs(df.yr_built - REFERENCE_YEAR)

# --- One-hot encode `condition` ---------------------------------------------
# reindex guarantees all five columns exist even when a level is absent from this
# data set (plain column selection would raise KeyError); the explicit dtype keeps
# the zero-filled columns numeric.
condition_levels = [1, 2, 3, 4, 5]
df_condition = pd.get_dummies(df.condition, dtype='uint8').reindex(
    columns=condition_levels, fill_value=0
)
df_condition.columns = ['Condition_' + str(level) for level in condition_levels]

# Dummy columns go first so the column order stays: conditions, then the rest.
df = pd.concat([df_condition, df], axis = 1)

# Drop one dummy level (and the raw column) to avoid multicollinearity.
df = df.drop(["Condition_3", 'condition'], axis = 1)

delete up to here

In [62]:
# Expand every column of df into degree-2 polynomial / interaction terms (no bias column).
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(df)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = list(poly_2.get_feature_names_out(df.columns))
else:
    poly2_columns = poly_2.get_feature_names(df.columns)

df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)

df_poly2.columns


Index(['Condition_1', 'Condition_2', 'Condition_4', 'Condition_5', 'bedrooms',
       'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above',
       ...
       'income^2', 'income population', 'income years_ago_sold',
       'income renovated_after', 'population^2', 'population years_ago_sold',
       'population renovated_after', 'years_ago_sold^2',
       'years_ago_sold renovated_after', 'renovated_after^2'],
      dtype='object', length=209)

In [63]:
# Sanity check on the expanded frame: (number of rows, number of polynomial columns).
df_poly2.shape


(4323, 209)

In [64]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so only point this at artifacts that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Names of the columns to select from the polynomial feature frame.
features = _load_pickle('features.pickle')

# Previously fitted regression model.
lm = _load_pickle('regression_model.pickle')

# Previously fitted scaler.
scaler2 = _load_pickle('scaler.pickle')

In [65]:
#df_poly2 = df_poly2.drop(set(list(df_poly2.columns)) - set(list(features)),axis = 1)

In [67]:
# Select the columns listed in the pickled `features` and pass them through the
# pickled scaler; transform() is used, so the scaler is not re-fit on this data.
# NOTE(review): despite its name, `df_to_fit` is whatever transform() returns
# (presumably a NumPy array, not a DataFrame) and is only used for prediction.
df_to_fit = scaler2.transform(df_poly2[features])

In [69]:
# Predict from the scaled features.
# NOTE(review): the recorded output is on the order of 1e9, while predicting on the
# unscaled df_poly2[features] further down averages roughly 560k. Confirm whether
# `lm` was trained on scaled or unscaled inputs before trusting either result.
lm.predict(df_to_fit)

array([-2.15879878e+09, -2.15879878e+09,  1.15751537e+09, ...,
        3.42376180e+09,  2.83118467e+09,  3.42376180e+09])

In [70]:
# Inspect the fitted coefficients.
# NOTE(review): in the recorded output several coefficients come in +/- pairs of
# almost identical magnitude (e.g. about 7.397e8 and 8.136e9). Presumably this is
# collinearity among the sqft_living / sqft_above / sqft_basement interaction
# terms - verify against the training notebook.
lm.coef_

array([-8.58965155e+04, -1.15272543e+08, -6.51619815e+07,  4.60882222e+06,
        1.10904893e+06, -1.10908031e+06, -1.10911444e+06,  3.65940437e+04,
       -8.32664279e+04, -3.79208345e+06,  3.79222125e+06,  3.79221365e+06,
       -7.39716997e+08,  5.23636462e+07,  8.13603670e+09,  7.39716998e+08,
       -5.23636462e+07, -8.13603670e+09,  7.39716997e+08, -5.23636462e+07,
       -8.13603670e+09])

In [71]:
# Average prediction over all rows, computed on the selected columns as they are
# in df_poly2.
# NOTE(review): these inputs are NOT passed through scaler2, unlike the earlier
# lm.predict(df_to_fit) call - confirm which input form the model expects.
np.mean(lm.predict(df_poly2[features]))

560432.3268041454

In [72]:
# Show the column names loaded from features.pickle, i.e. the columns selected
# from df_poly2 before scaling and prediction.
list(features)

['Condition_5',
 'Condition_1 Condition_5',
 'Condition_1 renovated_after',
 'Condition_2 Condition_4',
 'Condition_2 sqft_living',
 'Condition_2 sqft_above',
 'Condition_2 sqft_basement',
 'Condition_4^2',
 'Condition_5^2',
 'Condition_5 sqft_living',
 'Condition_5 sqft_above',
 'Condition_5 sqft_basement',
 'sqft_living sqft_basement',
 'sqft_living yr_renovated',
 'sqft_living population',
 'sqft_above sqft_basement',
 'sqft_above yr_renovated',
 'sqft_above population',
 'sqft_basement^2',
 'sqft_basement yr_renovated',
 'sqft_basement population']