# Regression Models

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

from sklearn.metrics import r2_score

In [2]:
# Load master_df.csv into `df`, the working DataFrame used by the cells below.
# The path is relative to the notebook's working directory.
df = pd.read_csv("master_df.csv")

In [4]:
# Display the frame without the `date` column. The result is not assigned, so
# `df` itself still contains `date` (it appears again in the later df.info() output).
# NOTE(review): assign the result back to `df` if the column was meant to be removed.
df.drop(columns='date')

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode,lat,long,sqft_living15,sqft_lot15,basement,year,Renovated,age_when_sold,month
0,221900.0,3,1.00,1180,5650,1.0,0,0,3,7,...,98178,47.5112,-122.257,1340,5650,0,2014,0,59,10
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,...,98125,47.7210,-122.319,1690,7639,1,2014,1,63,12
2,180000.0,2,1.00,770,10000,1.0,0,0,3,6,...,98028,47.7379,-122.233,2720,8062,0,2015,0,82,2
3,604000.0,4,3.00,1960,5000,1.0,0,0,5,7,...,98136,47.5208,-122.393,1360,5000,1,2014,0,49,12
4,510000.0,3,2.00,1680,8080,1.0,0,0,3,8,...,98074,47.6168,-122.045,1800,7503,0,2015,0,28,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,360000.0,3,2.50,1530,1131,3.0,0,0,3,8,...,98103,47.6993,-122.346,1530,1509,0,2014,0,5,5
21593,400000.0,4,2.50,2310,5813,2.0,0,0,3,8,...,98146,47.5107,-122.362,1830,7200,0,2015,0,1,2
21594,402101.0,2,0.75,1020,1350,2.0,0,0,3,7,...,98144,47.5944,-122.299,1020,2007,0,2014,0,5,6
21595,400000.0,3,2.50,1600,2388,2.0,0,0,3,8,...,98027,47.5345,-122.069,1410,1287,0,2015,0,11,1


## Encode categorical variables as dummies

In [7]:
# Columns treated as categories and replaced by one indicator column per level.
categorial_cols = ['view', 'condition', 'grade']

# One non-mutating call instead of a drop(inplace=True)/join loop: each listed
# column is removed and its indicator columns are appended at the end, in list
# order, named "<column>#<level>" (e.g. "view#0", "grade#12").
# NOTE(review): drop_first=False keeps every level, so each group of dummies
# sums to 1; if the model includes an intercept, a reference level has to be
# dropped somewhere downstream — confirm.
df = pd.get_dummies(df, columns=categorial_cols, prefix_sep='#', drop_first=False)

In [9]:
# Print the column names, non-null counts and dtypes of `df` at this point in the notebook.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21597 non-null  object 
 1   price          21597 non-null  float64
 2   bedrooms       21597 non-null  int64  
 3   bathrooms      21597 non-null  float64
 4   sqft_living    21597 non-null  int64  
 5   sqft_lot       21597 non-null  int64  
 6   waterfront     21597 non-null  int64  
 7   sqft_above     21597 non-null  int64  
 8   sqft_basement  21597 non-null  float64
 9   yr_built       21597 non-null  int64  
 10  yr_renovated   21597 non-null  int64  
 11  zipcode        21597 non-null  int64  
 12  lat            21597 non-null  float64
 13  long           21597 non-null  float64
 14  sqft_living15  21597 non-null  int64  
 15  sqft_lot15     21597 non-null  int64  
 16  basement       21597 non-null  int64  
 17  year           21597 non-null  int64  
 18  Renova

### Zipcodes are also related to price. However, encoding all zipcodes would add 70 dummy variables. Instead, we will only encode the 6 most expensive zipcodes.

In [10]:
# Zipcodes that get their own indicator column. The list is hard-coded; the
# markdown above this cell describes them as the most expensive zipcodes.
zipcodes_to_encode = ['98004', '98102', '98109', '98112', '98039', '98040']

# Build indicators for every zipcode, name them "zipcode#<code>", then keep only
# the selected ones (in the order listed above).
# The dummies keep df's own index on purpose: DataFrame.join aligns on the
# index, so resetting it here would misalign rows whenever df does not have a
# default 0..n-1 index.
dummies_zipcodes = pd.get_dummies(df['zipcode'], drop_first=False).add_prefix('zipcode#')
dummies_zipcodes = dummies_zipcodes[[f'zipcode#{code}' for code in zipcodes_to_encode]]

# Replace the raw zipcode column with the selected indicators without mutating
# the previous frame in place.
df = df.drop(columns='zipcode').join(dummies_zipcodes)

df.dtypes

date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
waterfront         int64
sqft_above         int64
sqft_basement    float64
yr_built           int64
yr_renovated       int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
basement           int64
year               int64
Renovated          int64
age_when_sold      int64
month              int64
view#0             uint8
view#1             uint8
view#2             uint8
view#3             uint8
view#4             uint8
condition#1        uint8
condition#2        uint8
condition#3        uint8
condition#4        uint8
condition#5        uint8
grade#3            uint8
grade#4            uint8
grade#5            uint8
grade#6            uint8
grade#7            uint8
grade#8            uint8
grade#9            uint8
grade#10           uint8
grade#11           uint8
grade#12           uint8
