In [1]:
import pandas as pd
import cufflinks as cf
import sklearn
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error

# Hold-out switch read by the regression cells: a falsy value (0) means the
# models are fit and scored on every row instead of a train/test split.
TEST_SIZE = 0

# Switch cufflinks to offline plotting so the .iplot() calls below render locally.
cf.go_offline()

# read_excel already yields a DataFrame, so no extra wrapping is needed.
df = pd.read_excel('apartments.xlsx')
display(df)

# Rebind instead of mutating in place; these two columns are not used afterwards.
df = df.drop(columns=['furnished', 'move_in_date'])

Unnamed: 0,ad_id,unit_type,bedrooms,bathrooms,hydro,heat,water,internet,cable,num_utils,...,Fridge,washer_drier,dishwasher,a_c,num_appliances,outdoor_space,smoking,mins_guelph,mins_waterloo,price
0,1494496767,basement,1,1,1,1,1,1,0,4,...,1,0,0,1,2,1,0,10,28,1200
1,1495630096,duplex,1,1,0,1,1,0,0,2,...,1,1,0,1,3,1,0,20,19,1300
2,1494306061,"shared, basement",1,1,0,0,0,0,0,0,...,1,1,0,1,3,1,1,5,37,900
3,1495004948,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1550
4,1493993007,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1675
5,1494785007,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1725
6,1390040368,apartment,1,1,0,1,1,0,0,2,...,1,1,0,0,2,1,0,25,15,1385
7,1494556820,apartment,1,1,1,1,1,0,0,3,...,1,0,0,0,1,1,0,3,31,1450
8,1494393272,apartment,1,1,1,1,1,0,0,3,...,0,1,0,1,2,1,0,23,24,1100
9,1485930218,apartment,1,1,0,1,1,0,0,2,...,1,1,0,0,2,1,0,11,26,1360


### Note that the original data did not have num_utils, num_appliances and num_extras. They are composite attributes that I made as follows:

#### num_utils counts hydro, heat, water, internet, cable TV
#### num_appliances counts fridge, washer & dryer, A/C, dishwasher
#### num_extras counts pool, gym, elevator

In [2]:
def summary(df):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. May mix numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``null`` (missing count),
        ``null_avg`` (missing fraction), ``dtypes``, ``count`` (non-missing
        count), ``mean``, ``median``, ``min`` and ``max``. ``mean`` and
        ``median`` are NaN for non-numeric columns.
    """
    ret = {}
    ret['null'] = df.isnull().sum()
    ret['null_avg'] = df.isnull().mean()
    ret['dtypes'] = df.dtypes
    ret['count'] = df.count()
    # numeric_only=True: without it pandas >= 2.0 raises TypeError as soon as
    # the frame contains a string column (older versions silently skipped it).
    ret['mean'] = df.mean(numeric_only=True)
    ret['median'] = df.median(numeric_only=True)
    ret['min'] = df.min()
    ret['max'] = df.max()
    # Series with differing indexes are aligned on the union of column names.
    return pd.DataFrame(ret)

# Per-column overview table, followed by how many columns carry each dtype.
display(df.pipe(summary))
print(df.dtypes.value_counts())


Unnamed: 0,null,null_avg,dtypes,count,mean,median,min,max
Fridge,0,0.0,int64,54,0.8333333,1.0,0,1
a_c,0,0.0,int64,54,0.3518519,0.0,0,1
ad_id,0,0.0,int64,54,1481105000.0,1491098000.0,1321009006,1495630096
agreement,0,0.0,int64,54,11.59259,12.0,1,12
bathrooms,0,0.0,int64,54,1.111111,1.0,1,3
bedrooms,0,0.0,int64,54,1.703704,1.0,1,6
cable,0,0.0,int64,54,0.01851852,0.0,0,1
days_until_move_in,0,0.0,int64,54,1.685185,0.0,-99,50
dishwasher,0,0.0,int64,54,0.2777778,0.0,0,1
elevator,0,0.0,int64,54,0.4074074,0.0,0,1


int64      27
float64     1
object      1
dtype: int64


## Change non-categorical numeric fields to float and drop the fields that were folded into composite attributes

In [3]:
# Indicator columns that are already summarised by the composite
# num_utils / num_extras / num_appliances attributes.
composite_sources = [
    'hydro', 'heat', 'water', 'internet', 'cable',
    'gym', 'pool', 'elevator',
    'Fridge', 'washer_drier', 'dishwasher', 'a_c',
]
df = df.drop(columns=composite_sources)

# Numeric (non-categorical) fields that should be float64.
change_int_to_float = ['bathrooms', 'bedrooms', 'num_appliances', 'num_extras',
                       'num_utils', 'mins_guelph', 'mins_waterloo',
                       'price', 'size']

# One astype call with a column -> dtype mapping instead of a per-column loop.
df = df.astype({feat: 'float64' for feat in change_int_to_float})

display(summary(df))

Unnamed: 0,null,null_avg,dtypes,count,mean,median,min,max
ad_id,0,0.0,int64,54,1481105000.0,1491098000.0,1321009006,1495630096
agreement,0,0.0,int64,54,11.59259,12.0,1,12
bathrooms,0,0.0,float64,54,1.111111,1.0,1,3
bedrooms,0,0.0,float64,54,1.703704,1.0,1,6
days_until_move_in,0,0.0,int64,54,1.685185,0.0,-99,50
mins_guelph,0,0.0,float64,54,16.94444,14.5,3,54
mins_waterloo,0,0.0,float64,54,25.25926,28.0,4,51
num_appliances,0,0.0,float64,54,2.111111,2.0,0,4
num_extras,0,0.0,float64,54,0.7222222,0.0,0,3
num_utils,0,0.0,float64,54,1.907407,2.0,0,5


## Originally, about half of the ads did not specify 'size'. Most of the missing values could be found on the realtor websites, but some are still missing (because I could not find them on the websites)
### To fix this, I tried mean imputation

In [4]:
def impute_size(row, fill_value=None):
    """Fill a missing ``size`` entry of a single row.

    Intended for ``DataFrame.apply(..., axis=1)``, which passes one row at a time.

    Parameters
    ----------
    row : pandas.Series or other mapping with a ``'size'`` key
        The row to inspect; it is modified and returned.
    fill_value : float, optional
        Value written when ``size`` is missing. When omitted, the mean of the
        notebook-level ``df['size']`` is used (the original behaviour). Passing
        a precomputed value avoids recomputing that mean for every row.

    Returns
    -------
    The same ``row`` object, with ``size`` filled if it was missing.
    """
    # pd.isna recognises NaN, None and pd.NA; comparing str(value) to 'nan'
    # only caught float NaN.
    if pd.isna(row['size']):
        row['size'] = df['size'].mean() if fill_value is None else fill_value

    return row

# axis='columns' (same as axis=1) hands impute_size one row at a time.
df = df.apply(impute_size, axis='columns')

display(df.pipe(summary))

Unnamed: 0,null,null_avg,dtypes,count,mean,median,min,max
ad_id,0,0.0,int64,54,1481105000.0,1491098000.0,1321009006,1495630096
agreement,0,0.0,int64,54,11.59259,12.0,1,12
bathrooms,0,0.0,float64,54,1.111111,1.0,1,3
bedrooms,0,0.0,float64,54,1.703704,1.0,1,6
days_until_move_in,0,0.0,int64,54,1.685185,0.0,-99,50
mins_guelph,0,0.0,float64,54,16.94444,14.5,3,54
mins_waterloo,0,0.0,float64,54,25.25926,28.0,4,51
num_appliances,0,0.0,float64,54,2.111111,2.0,0,4
num_extras,0,0.0,float64,54,0.7222222,0.0,0,3
num_utils,0,0.0,float64,54,1.907407,2.0,0,5


## Scatter plots between selected attributes

In [5]:
import plotly.express as px

# Attributes left out of the pairwise scatter plots.
drop_columns = ['bathrooms', 'parking_spots', 'agreement', 'days_until_move_in',
                'pets', 'outdoor_space', 'smoking']

scatter_df = df.drop(columns=drop_columns)

# ad_id and unit_type only annotate the points (hover text / colour), so they
# are never used as an axis. Computed once instead of on every iteration.
axis_columns = scatter_df.drop(columns=['ad_id', 'unit_type']).columns

for i, ca in enumerate(axis_columns):
    # Starting the inner scan at position i visits each unordered pair once.
    for cb in axis_columns[i:]:
        if ca == cb:
            continue

        # De-duplicate because ca or cb may itself be 'price'.
        needed = list(set(['ad_id', 'unit_type', ca, cb, 'price']))
        fig = px.scatter(
            scatter_df.loc[:, needed],
            x=ca,
            y=cb,
            color='unit_type',
            title='{} vs {}'.format(ca, cb),
            size='price',
            hover_data=['ad_id'],
        )
        fig.show()



## Plotting price and size, we can see there is a clear relationship between both attributes

In [6]:
# Histogram of listing prices.
df['price'].iplot(
    kind='histogram',
    xTitle='Price of Apartment',
    title='Frequencies of Apartment Prices',
)
# Histogram of listing sizes (single-column frame, as before).
df.loc[:, ['size']].iplot(
    kind='histogram',
    xTitle='Size of Apartment (SqFt)',
    title='Frequencies of Apartment Sizes in Sqft',
)

## Let's look at the bedrooms and the different utilities and extras

In [7]:
# Overlay the three composite amenity counts in one histogram.
amenity_counts = ['num_utils', 'num_extras', 'num_appliances']
df.loc[:, amenity_counts].iplot(
    kind='histogram',
    title='Number of Amenities by Groups',
)

In [8]:
# Title and axis label added so the figure stands on its own, like the other
# histograms in this notebook.
df[['bedrooms']].iplot(kind='histogram',
                       xTitle='Number of Bedrooms',
                       title='Frequencies of Bedroom Counts')

In [9]:
# Overlay the two commute-time distributions in one histogram.
commute_columns = ['mins_guelph', 'mins_waterloo']
df.loc[:, commute_columns].iplot(
    kind='histogram',
    xTitle='Distances in minutes to Univ. Guelph (Orange) /Waterloo (Blue)',
    title='Frequencies of Distances to Respective Universities',
)

In [10]:
import matplotlib.pyplot as plt 

# Figure size for the heatmap.
layout1 = cf.Layout(
    height=900,
    width=1000
)

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() no longer skips
# string columns such as unit_type and raises instead.
df.select_dtypes(include='number').corr().iplot(kind='heatmap',
                                                colorscale="PuRd",
                                                title="Feature Correlation Matrix",
                                                layout=layout1)

## Simple Regression, Baseline Model

In [11]:
# scatter plot

train = df[df['size'] > 0]
fig = train.iplot(asFigure=True, 
                  x='size', 
                  y='price',
                  mode='markers',
                  title='Simple Scatter Plot to Visualize Usefulness of Linear Regression')
fig.show()

In [12]:
regressor = LinearRegression()

# Single predictor (size) and target (price), both kept as one-column frames.
X_all, y_all = train.loc[:, ['size']], train.loc[:, ['price']]

if TEST_SIZE:
    # Use the configured hold-out size. Previously TEST_SIZE only acted as an
    # on/off flag and a hard-coded 0.15 was used regardless of its value.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=TEST_SIZE,
        random_state=123
    )
else:
    # No hold-out: fit and evaluate on the same rows.
    X_train, y_train = X_all, y_all
    X_test, y_test = X_train, y_train

regressor.fit(X_train, y_train)

# Intercept of the fitted line:
print(regressor.intercept_)
# Slope of the fitted line:
print(regressor.coef_)

[471.35167255]
[[1.28173856]]


In [13]:
# Predictions for the evaluation rows, flattened to a plain list.
y_pred = list(regressor.predict(X_test).reshape(-1))

# Evaluation rows side by side: predictor, true price, predicted price.
test = pd.DataFrame({'x': X_test['size'],
                     'y_test': y_test['price'],
                     'y_pred': y_pred})

fig = go.Figure()

fig.update_layout(title='Regression Line Plotted vs Training Points')
# draw training points
fig.add_trace(go.Scatter(x=train['size'],
                         y=train['price'],
                         mode='markers',
                         name='Points'))

# draw regression line
fig.add_trace(go.Scatter(x=test['x'], y=test['y_pred'], mode='lines', name='Regression Line'))

# Score predictions against the targets of the same rows. The previous
# comparison with train['price'] only matched while no hold-out split was used.
MSE = mean_squared_error(test['y_test'], test['y_pred'])
RMS = MSE**(1/2)
print('MSE', MSE, '\nRMS', RMS)

fig.show()

MSE 30987.17909171746 
RMS 176.03175591840656


## Add More Features, Multiple Linear Regression

In [14]:
mult_regressor = LinearRegression()
features = ['ad_id', 'num_appliances', 'num_utils', 'num_extras', 'size']

# ad_id is an identifier, not a predictor, so it is removed before fitting.
X_all = train.loc[:, features].drop(columns=['ad_id'])
y_all = train.loc[:, ['price']]

if TEST_SIZE:
    # Use the configured hold-out size. Previously TEST_SIZE only acted as an
    # on/off flag and a hard-coded 0.15 was used regardless of its value.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=TEST_SIZE,
        random_state=123
    )
else:
    # No hold-out: fit and evaluate on the same rows.
    X_train, y_train = X_all, y_all
    X_test, y_test = X_train, y_train

mult_regressor.fit(X_train, y_train)
# Intercept of the fitted model:
print(mult_regressor.intercept_)
# One coefficient per feature, in the column order of X_train:
print(mult_regressor.coef_)

[455.05170261]
[[  2.937434   -29.95831417  80.7221918    1.29229989]]


In [15]:
# Predictions for the evaluation rows, flattened to a plain list.
y_pred = list(mult_regressor.predict(X_test).reshape(-1))

# Evaluation rows side by side, indexed like X_test.
mult_test = pd.DataFrame({'x': X_test['size'],
                          'y_test': y_test['price'],
                          'y_pred': y_pred})

fig = go.Figure()

fig.update_layout(title='Multiple Regression Predictions (Red) Plotted vs Ground Truths (Blue)',
                  width=1800, height=600)

# Plot against the index of the evaluation rows. Using train.index only lined
# up with the y-values while no hold-out split was taken.
fig.add_trace(go.Scatter(x=mult_test.index,
                         y=mult_test['y_test'],
                         mode='markers+lines',
                         name='ground_truths'))

fig.add_trace(go.Scatter(x=mult_test.index,
                         y=mult_test['y_pred'],
                         mode='markers+lines',
                         name='predictions'))

fig.show()

MSE = mean_squared_error(y_test, y_pred)
RMS = MSE**(1/2)
print('MSE', MSE, '\nRMS', RMS)

MSE 23184.791401158272 
RMS 152.2655292610848


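# While the difference is not enormous, the mean squared error is lower when using the multiple regression than when using the simple linear regression. Note that with TEST_SIZE = 0 both models are scored on the same data they were fit on, where adding features to a least-squares fit cannot increase the error.
## Note that the line between the prediction points means nothing; it merely gives the graph more structure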