In [29]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw real-estate listings from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="data/RealEstate.csv")

In [3]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

Unnamed: 0,MLS,Location,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Status
0,132842,Arroyo Grande,795000.0,3,3,2371,335.3,Short Sale
1,134364,Paso Robles,399000.0,4,3,2818,141.59,Short Sale
2,135141,Paso Robles,545000.0,4,3,3032,179.75,Short Sale
3,135712,Morro Bay,909000.0,4,4,3540,256.78,Short Sale
4,136282,Santa Maria-Orcutt,109900.0,3,1,1249,87.99,Short Sale


In [4]:
# Inspect column dtypes: Location and Status come back as object columns,
# every other column is numeric (int64 / float64).
df.dtypes

MLS              int64
Location        object
Price          float64
Bedrooms         int64
Bathrooms        int64
Size             int64
Price/SQ.Ft    float64
Status          object
dtype: object

In [8]:
# Build the regression frame: Location and MLS are excluded from the model.
# `drop` returns a new frame, so `df` keeps the raw data.
df_reg = df.drop(columns=['Location', 'MLS'])
df_reg.head()

Unnamed: 0,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Status
0,795000.0,3,3,2371,335.3,Short Sale
1,399000.0,4,3,2818,141.59,Short Sale
2,545000.0,4,3,3032,179.75,Short Sale
3,909000.0,4,4,3540,256.78,Short Sale
4,109900.0,3,1,1249,87.99,Short Sale


In [11]:
# Fit the standardizer on the raw numeric columns.
# The statistics are learned from `df` rather than `df_reg`: `df_reg` is
# overwritten with standardized values by the scaling cell further down, so
# re-running this cell against `df_reg` would silently fit on already-scaled
# data. `df` is not mutated anywhere in the visible notebook, which keeps this
# cell safe to re-run.
# NOTE(review): the scaler is fit on the full dataset before the estimators
# run their internal cross-validation, so validation folds influence the
# scaling statistics — consider a Pipeline if that matters here.
scaler = StandardScaler()
scaler.fit(df[['Bedrooms', 'Bathrooms', 'Size', 'Price/SQ.Ft']])

StandardScaler()

In [16]:
# Standardize the numeric predictors stored in `df_reg`.
# The inputs are read from the untouched `df` (same rows, same order as
# `df_reg`) instead of from `df_reg` itself, so re-running this cell does not
# scale already-scaled values a second time.
numeric_cols = ['Bedrooms', 'Bathrooms', 'Size', 'Price/SQ.Ft']
df_reg[numeric_cols] = scaler.transform(df[numeric_cols])

In [18]:
# Confirm the four numeric predictors are now standardized; Price and Status are unchanged.
df_reg.head()

Unnamed: 0,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Status
0,795000.0,-0.166186,0.761236,0.752016,1.062259,Short Sale
1,399000.0,1.003104,0.761236,1.297769,-0.622053,Short Sale
2,545000.0,1.003104,0.761236,1.559046,-0.290251,Short Sale
3,909000.0,1.003104,1.943194,2.179275,0.379526,Short Sale
4,109900.0,-0.166186,-1.602681,-0.617859,-1.088105,Short Sale


In [23]:
# Count listings per Status category to see which levels need indicator columns.
df['Status'].value_counts()

Short Sale     516
Foreclosure    162
Regular        103
Name: Status, dtype: int64

In [24]:
# Hand-rolled one-hot encoding of Status: one 0/1 indicator column per category,
# added in the order Short Sale, Foreclosure, Regular.
# NOTE(review): all three levels are kept, so the indicators always sum to 1;
# confirm that is intended given the models fit an intercept.
for status_label in ('Short Sale', 'Foreclosure', 'Regular'):
    df_reg[status_label] = np.where(df_reg['Status'] == status_label, 1, 0)

In [28]:
# The indicator columns now carry the Status information, so the original
# string column is removed to leave a fully numeric modelling frame.
df_base = df_reg.drop(columns='Status')
df_base.head()

Unnamed: 0,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Short Sale,Foreclosure,Regular
0,795000.0,-0.166186,0.761236,0.752016,1.062259,1,0,0
1,399000.0,1.003104,0.761236,1.297769,-0.622053,1,0,0
2,545000.0,1.003104,0.761236,1.559046,-0.290251,1,0,0
3,909000.0,1.003104,1.943194,2.179275,0.379526,1,0,0
4,109900.0,-0.166186,-1.602681,-0.617859,-1.088105,1,0,0


## RidgeCV

In [32]:
# Candidate regularization strengths for RidgeCV: the integers 1 through 80.
ALPHAS = list(range(1, 81))

# Target vector (kept as a Series) and feature matrix (plain ndarray holding
# every column of df_base except Price).
# NOTE(review): in the rows displayed above, Price/SQ.Ft equals Price / Size
# (e.g. 795000 / 2371 ≈ 335.3), i.e. this feature is derived from the target.
# Keeping it in X likely inflates the reported fit — confirm it is intended.
y = df_base['Price']
X = df_base.drop(columns='Price').values

In [39]:
# Seed NumPy's global RNG before fitting.
# NOTE(review): it is unclear whether RidgeCV with an integer `cv` consumes any
# randomness; the seed may be unnecessary — confirm.
np.random.seed(2)

# Ridge regression with 5-fold cross-validation over the ALPHAS grid.
ridge = RidgeCV(
    alphas=ALPHAS,
    cv=5,
).fit(X, y)

In [40]:
# Score on the same X, y the model was fitted on: this is an in-sample figure,
# not an estimate of out-of-sample performance.
ridge.score(X, y)

0.8215343875805965

In [41]:
# Show the estimator's configuration (the alpha grid, cv, intercept, ...).
# These are the settings passed in, not the alpha chosen by cross-validation.
ridge.get_params(deep=True)

{'alpha_per_target': False,
 'alphas': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
        69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'normalize': 'deprecated',
 'scoring': None,
 'store_cv_values': False}

In [42]:
# Tabulate the fitted ridge coefficients, labelled by the feature columns
# (every column of df_base except Price, in the same order used to build X).
pd.DataFrame(
    data=ridge.coef_,
    index=df_base.drop(columns='Price').columns,
    columns=['Coefficients'],
)

Unnamed: 0,Coefficients
Bedrooms,-2908.766969
Bathrooms,34274.206082
Size,170002.304313
Price/SQ.Ft,194162.529842
Short Sale,-18415.59855
Foreclosure,-5440.992199
Regular,23856.590749


## LassoCV

In [54]:
# Candidate regularization strengths for LassoCV: the integers 1 through 3000.
ALPHAS_LASSO = list(range(1, 3001))

# Lasso regression with 5-fold cross-validation over the ALPHAS_LASSO grid.
lasso = LassoCV(
    alphas=ALPHAS_LASSO,
    cv=5,
    random_state=3,
).fit(X, y)

# Score on the same data used for fitting (in-sample).
lasso.score(X, y)

0.8286896843312724

In [55]:
# Tabulate the fitted lasso coefficients, labelled by the feature columns
# (every column of df_base except Price, in the same order used to build X).
pd.DataFrame(
    data=lasso.coef_,
    index=df_base.drop(columns='Price').columns,
    columns=['Coefficients'],
)

Unnamed: 0,Coefficients
Bedrooms,-9358.291413
Bathrooms,13093.302635
Size,205584.225793
Price/SQ.Ft,211297.238128
Short Sale,-23426.590085
Foreclosure,-12898.624929
Regular,15675.613628


In [57]:
# Display the target Series (Price) to check its length and value range.
y

0       795000.0
1       399000.0
2       545000.0
3       909000.0
4       109900.0
         ...    
776     319900.0
777     495000.0
778     372000.0
779     589000.0
780    1100000.0
Name: Price, Length: 781, dtype: float64