In [1]:

# IMPORT GENERIC PACKAGES
import numpy as np # numerical calc package
import pandas as pd # holds data
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # pretty plotting

In [2]:
# Plotting config: white seaborn style with a 20x10 default figure size
plot_rc = {'figure.figsize': (20, 10)}
sns.set(style='white', rc=plot_rc)

from sklearn.linear_model import LinearRegression # linear regression package
from sklearn.model_selection import train_test_split # split dataset
from sklearn.metrics import mean_squared_error as mse # Measurement metric

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression



In [3]:
# Load the sales CSV into a DataFrame, using the first CSV column as the index
csv_path = 'nyc-rolling-sales.csv'
data = pd.read_csv(csv_path, index_col=0)

In [4]:
# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [5]:
# List the column labels of the loaded DataFrame
data.columns

Index(['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'EASE-MENT',
       'BUILDING CLASS AT PRESENT', 'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE',
       'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE'],
      dtype='object')

In [6]:
# Keep only a handful of columns for a quick first look at the data
columns = [
    'BOROUGH',
    'SALE PRICE',
    'COMMERCIAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
]
subset_data = data[columns]

In [7]:
# Get the dimensions of the subset as a (rows, columns) tuple
subset_data.shape

(84548, 5)

In [8]:
# Show the first 5 rows of the subset
subset_data.head(n=5)

Unnamed: 0,BOROUGH,SALE PRICE,COMMERCIAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET
4,1,6625000,0,1633,6440
5,1,-,3,4616,18690
6,1,-,1,2212,7803
7,1,3936272,0,2272,6794
8,1,8000000,0,2369,4615


In [9]:
# Convert SALE PRICE to a numeric column: entries that cannot be parsed
# become NaN (errors='coerce') and are then replaced with 0
sale_price_numeric = pd.to_numeric(data['SALE PRICE'], errors='coerce')
data['SALE PRICE'] = sale_price_numeric.fillna(0)

In [10]:
# Convert both square-footage columns to numeric; unparseable entries become NaN
for sqft_col in ('GROSS SQUARE FEET', 'LAND SQUARE FEET'):
    data[sqft_col] = pd.to_numeric(data[sqft_col], errors='coerce')

In [11]:
# Parse SALE DATE into datetimes; values that cannot be parsed become NaT
sale_dates = pd.to_datetime(data['SALE DATE'], errors='coerce')
data['SALE DATE'] = sale_dates

In [12]:
# Remove the 5th and 95th percentile tails of SALE PRICE.
# Rows are kept only when the price is strictly positive AND lies between the
# 5th and 95th percentiles (both inclusive). The `> zero` test is kept on top
# of the percentile bound because the 5th percentile itself can be 0 when many
# prices are 0.
zero = 0
fifth = data['SALE PRICE'].quantile(0.05)        # lower cut-off (was read from the '15%' label and never used)
ninetyfifth = data['SALE PRICE'].quantile(0.95)  # upper cut-off
data = data[(data['SALE PRICE'] > zero) &
            (data['SALE PRICE'] >= fifth) &
            (data['SALE PRICE'] <= ninetyfifth)].copy()

In [13]:

# Handle missing values by dropping every row that contains a NaN (for now)
data = data.dropna()

# Summary statistics of the numeric columns that remain
data.describe()



Unnamed: 0,BOROUGH,BLOCK,LOT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,SALE PRICE
count,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0,36128.0
mean,3.424297,5284.63311,227.366364,10952.996097,1.735662,0.145649,1.887705,2897.62,2278.095,1850.794481,1.357424,694216.8
std,0.924368,3625.421333,485.371196,1018.3254,11.859228,11.924159,16.893532,29398.87,22821.13,422.482484,0.677031,508370.9
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,3.0,2376.0,21.0,10467.0,1.0,0.0,1.0,1317.0,828.0,1920.0,1.0,375000.0
50%,3.0,4832.0,45.0,11219.0,1.0,0.0,1.0,2175.0,1505.0,1935.0,1.0,579000.0
75%,4.0,7380.0,96.0,11361.0,2.0,0.0,2.0,3125.0,2232.0,1963.0,2.0,880000.0
max,5.0,16319.0,7501.0,11694.0,1844.0,2261.0,2261.0,4228300.0,3750565.0,2017.0,4.0,3000000.0


In [16]:
# Define Features (predictor columns fed to the model)
features = [
    'BOROUGH', 'COMMERCIAL UNITS', 'LAND SQUARE FEET', 'GROSS SQUARE FEET',
    'RESIDENTIAL UNITS', 'LOT', 'BLOCK', 'ZIP CODE',
]

# Set X (predictors)
X = data[features]

# Set y (target: the sale price)
y = data['SALE PRICE']

# Train-test split: hold out 40% of the rows for evaluation (seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Initialize model. A random forest is stochastic, so it is seeded as well;
# otherwise the score and predictions change on every run of the notebook.
model = RandomForestRegressor(random_state=42)

# Fit Model
model.fit(X_train, y_train)

# Calculate the R-squared on the held-out test set. It is stored and printed
# because a bare expression in the middle of a cell is silently discarded.
r_squared = model.score(X_test, y_test)
print(f'R-squared on the test set: {r_squared:.3f}')

# Predictions for the test set, reused later for the error metric
y_predicted = model.predict(X_test)




In [17]:

# We input a new (made-up) property record into the model to predict its sale price

# Sample: one row, labelled with the same columns (and column order) used for
# training. Passing a bare list to a model fitted on a DataFrame makes
# scikit-learn warn about missing feature names and hides which value belongs
# to which column.
new_data = pd.DataFrame([[2, 2, 10000, 1, 1, 1, 1, 1]], columns=features)
new_prediction = model.predict(new_data)
print(f'Predicted sale price for the sample: {new_prediction[0]:,.0f}')

"""### Model Error"""

# Root mean squared error; mean_squared_error expects (y_true, y_pred)
rmse = np.sqrt(mse(y_test, y_predicted))
print(f'RMSE on the test set: {rmse:,.0f}')

"""The root mean squared error (RMSE) printed above is expressed in dollars, the same unit as SALE PRICE, and measures how far the predicted sale prices typically deviate from the actual sale prices in the test set. Judge whether it is high by comparing it with the spread of SALE PRICE shown by data.describe(). We can further improve on this model by doing feature engineering but for now, this will do."""


'The root mean squared error (RMSE) printed above is expressed in dollars, the same unit as SALE PRICE, and measures how far the predicted sale prices typically deviate from the actual sale prices in the test set. Judge whether it is high by comparing it with the spread of SALE PRICE shown by data.describe(). We can further improve on this model by doing feature engineering but for now, this will do.'