# Using Multiple Linear Regression to Predict NYC Rent Prices

This project aims to find out whether I am overpaying for my East Village apartment...

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 7]
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns

In [1]:
# Load the StreetEasy Manhattan rentals dataset from Kaggle:
# https://www.kaggle.com/zohaib30/streeteasy-dataset?select=manhattan.csv
streeteasy = pd.read_csv("../input/streeteasy-dataset/manhattan.csv")

# pd.read_csv already returns a DataFrame, so wrapping it in pd.DataFrame(...)
# again is redundant; `df` is simply the working name for the loaded table.
df = streeteasy

In [1]:
# Pick the label and the predictors for the regression model.

# Label: rent, kept as a one-column DataFrame (double brackets) rather than a Series
y = df[['rent']]

# Predictors: the 14 listing attributes the model learns from
feature_columns = [
  'bedrooms',
  'bathrooms',
  'size_sqft',
  'min_to_subway',
  'floor',
  'building_age_yrs',
  'no_fee',
  'has_roofdeck',
  'has_washer_dryer',
  'has_doorman',
  'has_elevator',
  'has_dishwasher',
  'has_patio',
  'has_gym',
]
x = df[feature_columns]

# Hold out 20% of the rows for testing and train on the other 80%;
# the fixed random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=6,
)

# Sanity-check the split sizes: features first, then labels
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

In [1]:
# Build the multiple linear regression model and fit it on the training split
# in one step (fit returns the fitted estimator itself).
mlr = LinearRegression().fit(x_train, y_train)

# Predicted rents for the held-out test rows, used later for evaluation
y_predict = mlr.predict(x_test)

## Let's test the model on the apartment!

In [1]:
# List the feature names (and their order) that the model expects as input
feature_names = x.columns
print(feature_names)

In [1]:
# Set values for our 14 features, in the same order as the columns of 'x'.

# for example, my apartment has 3 bedrooms, 1.5 bathrooms, 820 sqft, 3 min to subway... and so on
shaneys_apartment = [[3, 1.5, 820, 3, 2, 10, 1, 1, 0, 0, 0, 1, 1, 0]]

# The model was fitted on a DataFrame with named columns, so wrap the values in a
# one-row DataFrame carrying the same column names. This makes the value-to-feature
# mapping explicit and avoids scikit-learn's "X does not have valid feature names" warning.
apartment_features = pd.DataFrame(shaneys_apartment, columns=x.columns)

# The target was a one-column DataFrame, so the prediction is a 2-D array of shape (1, 1)
predict = mlr.predict(apartment_features)

# Extract the single value as a plain Python float before formatting; applying
# %f to the array itself relies on implicit array-to-scalar conversion, which
# recent NumPy versions deprecate for arrays with ndim > 0.
print("Predicted rent: $%.2f" % predict.item())

The model predicts a rent of $4212.44. That is pretty close to the actual rent - not a bad prediction!

In [1]:
# Fitted coefficients of the regression, one weight per input feature.
# Each weight is expressed per unit of its own feature, so compare magnitudes with care.
coefficients = mlr.coef_
print(coefficients)

In [1]:
# Scatter plot of bedrooms against rent, to eyeball how the two relate
fig, ax = plt.subplots()
ax.scatter(df[['bedrooms']], df[['rent']], alpha=0.4)
ax.set_xlabel("Bedrooms")
ax.set_ylabel("Rent")
ax.set_title("Bedrooms vs Rent")
plt.show()

In [1]:
# Define a nice diverging color map for our heatmap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Correlations are only defined for numeric data. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises if any are present, so
# restrict the frame to numeric/boolean columns explicitly before correlating.
# NOTE(review): assumes the dataset may contain text columns - confirm against the CSV.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()

sns.heatmap(
        data=correlation_matrix,  # pairwise correlations between the numeric columns
        linewidths=0.2,  # the width of lines separating the matrix squares
        square=True,   # enforce 1:1 ratios among correlation cells
        cmap=cmap,  # use the color map we defined above
        vmax=1,  # define the max of our correlation scale
        vmin=-1, # define the min of our correlation scale
        center=0,  # the value at which the color map is centered
        cbar_kws={"shrink": .75}  # shrink the color bar a bit
    )

# keep y-axis tick labels horizontal for legibility
plt.yticks(rotation=0)

# show the heatmap
plt.show()

In [1]:
# Score the fitted model on both splits; a large gap between the two
# numbers would point to overfitting.
train_score = mlr.score(x_train, y_train)
test_score = mlr.score(x_test, y_test)

# Each label goes on its own line, followed by its score on the next line
print("Train score:", train_score, sep="\n")
print("Test score:", test_score, sep="\n")

In [1]:
# Residual analysis: plot the prediction error against the predicted value to
# check how well the multiple linear regression model fits the test data.
# Residuals here are (predicted - actual), so points above zero are over-predictions.
residuals = y_predict - y_test

plt.scatter(y_predict, residuals, alpha=0.4)
# Dashed reference line at zero error makes over/under-prediction easy to see
plt.axhline(0, color="gray", linestyle="--", linewidth=1)
plt.xlabel("Predicted rent")
plt.ylabel("Residual (predicted - actual)")
plt.title('Residual Analysis')

plt.show()