In [174]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

sns.set()

In [175]:
nyc = pd.read_csv('nyc-rolling-sales_clean.csv')

In [176]:
#let's set the target variable as log price. Setting the target as log sale price rather than just sale price will normalize the skewed data and help work around the linear regression's sensitivity to outliers.
y = np.log(df['SALE PRICE'])

In [260]:
#set the features that we think are relevant; in this case: zip code, gross sq feet, year built, and sale month
X = df.drop(['SALE PRICE', 'BUILDING CLASS CATEGORY', 'TAX CLASS AT PRESENT', 'BUILDING CLASS AT PRESENT', 'BOROUGH', 'NEIGHBORHOOD', 'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET', 'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE', 'SALE DATE', 'sale_year'], axis=1)

In [261]:
#we'll need to turn sale month and zip code into categorical variables if we're going to use them
df['sale_month']=df['sale_month'].astype("object")
df['ZIP CODE']=df['ZIP CODE'].astype("object")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30236 entries, 0 to 30235
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   BOROUGH                         30236 non-null  object        
 1   NEIGHBORHOOD                    30236 non-null  object        
 2   BUILDING CLASS CATEGORY         30236 non-null  object        
 3   TAX CLASS AT PRESENT            30236 non-null  object        
 4   BUILDING CLASS AT PRESENT       30236 non-null  object        
 5   ZIP CODE                        30236 non-null  object        
 6   RESIDENTIAL UNITS               30236 non-null  int64         
 7   COMMERCIAL UNITS                30236 non-null  int64         
 8   TOTAL UNITS                     30236 non-null  int64         
 9   LAND SQUARE FEET                30236 non-null  float64       
 10  GROSS SQUARE FEET               30236 non-null  float64       
 11  YE

In [262]:
#turn the categorical variables into dummy variables
pd.get_dummies(drop_first=True, data=X)

Unnamed: 0,GROSS SQUARE FEET,YEAR BUILT,target,ZIP CODE_10001,ZIP CODE_10002,ZIP CODE_10003,ZIP CODE_10005,ZIP CODE_10009,ZIP CODE_10010,ZIP CODE_10011,...,sale_month_3,sale_month_4,sale_month_5,sale_month_6,sale_month_7,sale_month_8,sale_month_9,sale_month_10,sale_month_11,sale_month_12
0,6794.0,1913,15.185745,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,4226.0,1920,14.976421,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3360.0,1910,15.009433,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3586.0,1899,15.123843,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,6330.0,1901,14.275363,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30231,2160.0,1994,13.381646,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
30232,2575.0,1998,13.017003,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30233,2377.0,1998,13.217674,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
30234,1496.0,1925,13.038982,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [263]:
#we now have 184 features!
#now we need to split the data into the training and test sets
rand_state=1000
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rand_state)

In [264]:
#check to make sure that we have 70% in the training set
np.round(len(X_train)/len(X),3)

0.7

In [265]:
#Train the model
reg_model = LinearRegression()

In [266]:
reg_model.fit(X_train, y_train)

LinearRegression()

In [267]:
y_hat_test = reg_model.predict(X_test)

In [268]:
df_predictions = pd.DataFrame({'y_test':y_test, 'y_hat_test':y_hat_test, 'resid': y_test - y_hat_test})
df_predictions.head()

Unnamed: 0,y_test,y_hat_test,resid
3767,14.077875,14.077875,0.0
24270,13.199324,13.199324,3.552714e-15
27718,13.384728,13.384728,1.776357e-15
6232,12.873902,12.873902,1.776357e-15
11161,13.71015,13.71015,1.776357e-15


In [269]:
#Run the MSE and RMSE tests
MSE_test = np.mean(np.square(df_predictions['resid']))
np.round(MSE_test,3)

0.0

In [270]:
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)

0.0

In [271]:
from sklearn.model_selection import cross_val_score
import sklearn.metrics
my_estimator = LinearRegression()

In [272]:
#Run the 5 fold cross validation
NMSE = cross_val_score(estimator=my_estimator, X=X_train, y=y_train, cv=5, scoring="neg_mean_squared_error")
MSE= -NMSE
MSE
RMSE = np.sqrt(MSE)
RMSE
RMSE_CV5 = np.mean(RMSE)
np.round(RMSE_CV,3)

0.0

In [273]:
#Run the 10 fold cross validation
NMSE = cross_val_score(estimator=my_estimator, X=X_train, y=y_train, cv=10, scoring="neg_mean_squared_error")
MSE= -NMSE
MSE
RMSE = np.sqrt(MSE)
RMSE
RMSE_CV10 = np.mean(RMSE)
np.round(RMSE_CV,3)

0.0

In [274]:
RMSE_test

1.0771800702033546e-14

In [275]:
RMSE_CV5

6.8246258017403295e-15

In [276]:
RMSE_CV10

6.8471758435103325e-15

In [277]:
#the RMSE_CV5 is the smallest, but RMSE_CV10 is not much further behind since both are so close to zero.