In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols

from matplotlib import pyplot as plt

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

import seaborn as sns

import pickle

In [2]:
# Load the hold-out feature set; the first CSV column becomes the row index.
house_final = pd.read_csv(
    'kc_house_data_test_features.csv',
    index_col=0,
)

# Adding distance-to-water and last-renovation features to each house

In [3]:
# Load the pickled `water_lat` object from the working directory; it is passed to
# water_distances() further down (presumably latitudes of water reference points — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('water_lat', 'rb') as handle:
    water_lat = pickle.load(handle)

In [4]:
# Load the pickled `water_long` object from the working directory; it is passed to
# water_distances() further down (presumably longitudes of water reference points — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('water_long', 'rb') as handle:
    water_long = pickle.load(handle)

In [5]:
from functions import water_distances, last_change

In [6]:
# Add a `last_change` feature computed by functions.last_change from the full frame.
# NOTE(review): presumably derived from yr_built / yr_renovated — confirm in functions.py.
house_final['last_change'] = last_change(house_final)

In [7]:
# Add a `water_distance` feature computed by functions.water_distances from the frame
# and the two pickled water coordinate objects.
# NOTE(review): units and distance definition are not visible here — confirm in functions.py.
house_final['water_distance'] = water_distances(house_final, water_lat, water_long)

# Replacing the bedroom count of the one house listed with 33 bedrooms with 3, the average for a house with 1.75 baths

In [8]:
# Treat a bedroom count of 33 as a data-entry error and overwrite it with 3;
# every other value is kept as is.
is_bad_bedroom_count = house_final['bedrooms'] == 33
house_final['bedrooms'] = house_final['bedrooms'].mask(is_bad_bedroom_count, 3)

# Replacing basement sqft with a 0/1 indicator of whether the house has a basement

In [9]:
# Binary flag: 1 when the house has any basement area, 0 otherwise.
house_final['basement'] = (house_final['sqft_basement'] > 0).astype(int)

# Creating dummies for zipcode and grade (view dummies are currently commented out) and removing the dummy columns deemed unnecessary by the feature selection

In [10]:
# One-hot encode zipcode and grade. drop_first=True omits the first level of each
# variable, so the set of generated columns depends on which zipcodes/grades occur
# in this particular file.
# NOTE(review): confirm the resulting columns match the ones the pickled model was
# trained on; a hold-out file with different categories would yield different columns.
zip_dummy = pd.get_dummies(house_final['zipcode'], prefix = 'zip', drop_first = True)
#view_dummy = pd.get_dummies(house_final['view'], prefix = 'view', drop_first = True)
grade_dummy = pd.get_dummies(house_final['grade'], prefix = 'grade', drop_first = True)

In [11]:
# Discard the grade indicators that feature selection ruled out; reassigning
# instead of dropping in place keeps the step explicit.
grade_dummy = grade_dummy.drop(columns=['grade_4', 'grade_5', 'grade_8'])

In [12]:
#view_dummy.drop(columns = ['view_1'], axis=1, inplace = True)

In [13]:
# Keep only the zip_98004 indicator (as a one-column DataFrame); all other
# zipcode dummies are discarded.
zip_dummy = zip_dummy.loc[:, ['zip_98004']]

# Concatenate the dataframe with the remaining dummy columns

In [14]:
# Append the retained zipcode and grade indicator columns to the feature frame,
# aligning rows on the shared index.
house_final_w_dummies = pd.concat(
    (house_final, zip_dummy, grade_dummy),
    axis='columns',
)

# Dropping all the remaining columns that are not needed

In [15]:
# Remove the source columns the notebook no longer needs (categoricals already
# encoded as dummies, identifiers, and features excluded from the model).
house_final_w_dummies = house_final_w_dummies.drop(
    columns=[
        'view', 'condition', 'grade', 'zipcode', 'sqft_lot',
        'id', 'date', 'sqft_above', 'sqft_basement',
        'sqft_living15', 'sqft_lot15', 'yr_built', 'yr_renovated',
        'lat', 'long', 'waterfront', 'bathrooms',
        'bedrooms', 'floors',
    ]
)

# Capping extreme values (more than 5 standard deviations from the mean) in the dataframe

In [17]:
def ext_values(df, extreme_cols, n_std=5):
    """Cap extreme values at ``n_std`` standard deviations from the column mean.

    For every column in ``extreme_cols``, values above ``mean + n_std * std``
    are set to that upper bound and values below ``mean - n_std * std`` are set
    to that lower bound. (Previously, low outliers were also replaced with the
    *upper* bound, flipping them to the opposite extreme.)

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    extreme_cols : iterable of column labels
        Columns to cap. Each must support ``mean()`` and ``std()``.
    n_std : float, default 5
        Half-width of the allowed band, in sample standard deviations.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns capped. Missing values are
        left untouched, and a column whose std is NaN is not clipped at all.
    """
    new_df = df.copy()
    for col in extreme_cols:
        mean = new_df[col].mean()
        spread = n_std * new_df[col].std()
        # Vectorised, symmetric cap; NaN bounds are ignored by clip().
        new_df[col] = new_df[col].clip(lower=mean - spread, upper=mean + spread)
    return new_df

In [18]:
# Cap outliers in every column of the feature frame (dummy columns included,
# since all columns are passed).
house_final_w_dummies = ext_values(
    df=house_final_w_dummies,
    extreme_cols=house_final_w_dummies.columns,
)

# Importing scaler and regression from pickle

In [26]:
# Load the pickled scaler (presumably fitted on the training data — confirm).
# NOTE(review): the recorded output shows FileNotFoundError, i.e. 'final_scaler' was not
# in the working directory, so `final_scaler` is never defined by this cell in this run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('final_scaler', 'rb') as handle:
    final_scaler = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'final_scaler'

In [27]:
# Load the pickled `final_lasso` model.
# NOTE(review): the recorded output shows FileNotFoundError, i.e. 'final_lasso' was not
# in the working directory, so this model is unavailable in this run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('final_lasso', 'rb') as handle:
    final_lasso = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'final_lasso'

In [None]:
# Load the pickled `final_rfe` object. This cell has no execution count (it was not run)
# and `final_rfe` is not used in the cells visible in this notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('final_rfe', 'rb') as handle:
    final_rfe = pickle.load(handle)

In [16]:
# Load the pickled `final_lasso_lc` model; this is the model used for the predictions
# in the "Making Predictions" section.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('final_lasso_lc', 'rb') as handle:
    final_lasso_lc = pickle.load(handle)

# Scaling the new data using previous scale

In [19]:
# scaler = StandardScaler()
# scaler.fit(final_scaler)
# house_final_w_dummies_scaled = pd.DataFrame(data=scaler.transform(house_final_w_dummies))

NameError: name 'final_scaler' is not defined

# Making Predictions

In [20]:
from sklearn.linear_model import Lasso

In [21]:
# Predict with the loaded model, then exponentiate the model output to obtain
# prices (presumably the model was fitted on log-price — confirm).
# NOTE(review): the scaling cell is commented out, so the features are passed unscaled.
raw_predictions = final_lasso_lc.predict(house_final_w_dummies)
price_predictions = np.exp(raw_predictions)

In [22]:
# Display the predicted prices (NumPy abbreviates the long array in the output).
price_predictions

array([476542.33526406, 476542.33526406, 348527.92014364, ...,
       255177.14066192, 364109.52140856, 255177.10788226])

In [23]:
# Wrap the prediction array in a single-column DataFrame (column label 0).
price_dataframe = pd.DataFrame(data=price_predictions)

In [24]:
# Give the prediction column a meaningful label; reassign rather than rename in place.
price_dataframe = price_dataframe.rename(columns={0: 'Price'})

In [25]:
# Write the predictions as CSV text to a file named 'price_predictions'
# (no extension); the row index is written as the first column.
price_dataframe.to_csv(path_or_buf='price_predictions')

In [36]:
# Work on an independent copy: plain assignment only creates a second name for the
# same DataFrame, so adding a column to `house_stuff` would also add it to
# `house_final_w_dummies` and change the model's input if the predict cell is re-run.
house_stuff = house_final_w_dummies.copy()

In [37]:
# Attach the predicted prices as a `price` column so they can be summarised with describe().
house_stuff['price'] = price_predictions

In [38]:
# Summary statistics of the predicted prices.
house_stuff['price'].describe()

count    4.323000e+03
mean     4.827032e+05
std      2.653439e+05
min      1.838044e+05
25%      3.200372e+05
50%      4.046868e+05
75%      5.661158e+05
max      3.053808e+06
Name: price, dtype: float64

In [None]:
# final_lasso is a better predictor than final_lasso_lc (with the last_change column/variable)

# Fitting the model

In [26]:
# Template cell: create a fresh StandardScaler and fit it on `features`.
# NOTE(review): `features` is not defined in the visible cells, and the recorded output
# shows this cell raising NameError. The transformed array returned by fit_transform is
# also discarded; only the fitted scaler would be kept.
final_scaler = StandardScaler()
final_scaler.fit_transform(features)

NameError: name 'features' is not defined

In [None]:
# Template cell: fit a plain LinearRegression on `features` / `target`.
# NOTE(review): neither `features` nor `target` is defined in the visible cells and this
# cell has no execution count, so it was not run as part of the recorded session.
final_model = LinearRegression()
final_model.fit(features, target)

In [None]:
# holdout = pd.read_csv('whatever')

In [None]:
# transformed_holdout = final_scaler.transform(holdout)
# final_answers = final_model.predict(transformed_holdout)
# final_answers.to_csv('housing_preds_AHSR')