In [2]:
import pandas as pd
import cufflinks as cf
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error

# Hold-out switch read by the regression cells: a falsy value (0) makes them
# fit and evaluate on the same rows; a truthy value makes them split off a
# test set with train_test_split.
TEST_SIZE = 0

cf.go_offline()

# read_excel already returns a DataFrame, so it is used directly.
df = pd.read_excel('apartments.xlsx')
display(df)

# Remove the listing id and two columns that the visible cells never read.
df = df.drop(columns=['ad_id', 'furnished', 'move_in_date'])
print(df.columns)

Unnamed: 0,ad_id,unit_type,bedrooms,bathrooms,hydro,heat,water,internet,cable,num_utils,...,Fridge,washer_drier,dishwasher,a_c,num_appliances,outdoor_space,smoking,mins_guelph,mins_waterloo,price
0,1494496767,basement,1,1,1,1,1,1,0,4,...,1,0,0,1,2,1,0,10,28,1200
1,1495630096,duplex,1,1,0,1,1,0,0,2,...,1,1,0,1,3,1,0,20,19,1300
2,1494306061,"shared, basement",1,1,0,0,0,0,0,0,...,1,1,0,1,3,1,1,5,37,900
3,1495004948,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1550
4,1493993007,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1675
5,1494785007,apartment,1,1,0,1,1,0,0,2,...,0,0,0,0,0,1,0,8,29,1725
6,1390040368,apartment,1,1,0,1,1,0,0,2,...,1,1,0,0,2,1,0,25,15,1385
7,1494556820,apartment,1,1,1,1,1,0,0,3,...,1,0,0,0,1,1,0,3,31,1450
8,1494393272,apartment,1,1,1,1,1,0,0,3,...,0,1,0,1,2,1,0,23,24,1100
9,1485930218,apartment,1,1,0,1,1,0,0,2,...,1,1,0,0,2,1,0,11,26,1360


Index(['unit_type', 'bedrooms', 'bathrooms', 'hydro', 'heat', 'water',
       'internet', 'cable', 'num_utils', 'gym', 'pool', 'elevator',
       'num_extras', 'parking_spots', 'agreement', 'days_until_move_in',
       'pets', 'size', 'Fridge', 'washer_drier', 'dishwasher', 'a_c',
       'num_appliances', 'outdoor_space', 'smoking', 'mins_guelph',
       'mins_waterloo', 'price'],
      dtype='object')


In [None]:
# Columns left out of the scatter matrix; everything else in df is plotted
# pairwise.
drop_columns = [
    'unit_type', 'bedrooms', 'num_utils',
    'num_extras', 'parking_spots', 'pets',
    'size', 'num_appliances', 'price',
]
df.drop(columns=drop_columns).scatter_matrix()

## Plotting price and size shows a clear relationship between the two attributes

In [None]:
# Distribution of listing prices.
df['price'].iplot(
    kind='histogram',
    xTitle='Price of Apartment',
    title='frequencies of apartment prices',
)

# Distribution of apartment sizes.
df[['size']].iplot(
    kind='histogram',
    xTitle='Size of Apartment (SqFt)',
    title='frequencies of apartment sizes in sqft',
)

## Let's look at the bedrooms and the different utilities and extras

In [None]:
# Overlaid histograms of the three per-listing amenity counts.
df[['num_utils', 'num_extras', 'num_appliances']].iplot(
    kind='histogram',
    title='Number of Amenities by Groups',
)

# Legend explaining which amenities each count column aggregates.
print(
    'Utils are hydro, heat, water, internet, tv, \n'
    'appliances are fridge, washer&dryer, a/c, dishwasher \n'
    'extras are pool, gym, elevator'
)

In [None]:
# Distribution of bedroom counts; titled and labelled like the other
# histograms so the figure can be read on its own.
df[['bedrooms']].iplot(kind='histogram',
                       xTitle='Number of Bedrooms',
                       title='Frequencies of Bedroom Counts')

In [None]:
# Overlaid histograms of the two travel-time columns.
df[['mins_guelph', 'mins_waterloo']].iplot(
    kind='histogram',
    xTitle='Distances in minutes to Univ. Guelph (Orange) /Waterloo (Blue)',
    title='Frequencies of Distances to Respective Universities',
)

In [None]:
import matplotlib.pyplot as plt 

# Explicit canvas size so the heatmap cells stay readable.
layout1 = cf.Layout(
    height=900,
    width=1000
)

# Correlate numeric columns only: df still contains text columns such as
# 'unit_type', and pandas >= 2.0 raises on non-numeric data in corr() instead
# of silently skipping it. Selecting by dtype works on old and new pandas.
df.select_dtypes(include='number').corr().iplot(kind='heatmap',
                                                colorscale="PuRd",
                                                title="Feature Correlation Matrix",
                                                layout=layout1)

## Simple Regression, Baseline Model

In [None]:
# Keep only listings with a positive size; later cells model on this subset.
train = df.loc[df['size'] > 0]

# Price against size, one marker per listing.
fig = train.iplot(
    x='size',
    y='price',
    mode='markers',
    title='Simple Scatter Plot to Visualize Usefulness of Linear Regression',
    asFigure=True,
)
fig.show()

In [None]:
# Baseline model: linear regression of price on size alone.
regressor = LinearRegression()
if TEST_SIZE:
    # Use the configured hold-out size rather than a hard-coded 0.15, so that
    # changing TEST_SIZE actually changes the split.
    X_train, X_test, y_train, y_test = train_test_split(
        train.loc[:, ['size']],
        train.loc[:, ['price']],
        test_size=TEST_SIZE,
        random_state=123
    )
else:
    # No hold-out requested: fit and evaluate on the same rows.
    X_train, y_train = train.loc[:, ['size']], train.loc[:, ['price']]
    X_test, y_test = X_train, y_train

regressor.fit(X_train, y_train)

# Fitted intercept, then the coefficient for 'size'.
print(regressor.intercept_)
print(regressor.coef_)

In [None]:
# Predictions for the evaluation rows, flattened from (n, 1) to a 1-D list.
y_pred = list(regressor.predict(X_test).reshape(-1))

# Evaluation rows side by side: size, actual price, predicted price.
test = pd.DataFrame({'x': X_test['size'],
                     'y_test': y_test['price'],
                     'y_pred': y_pred})

fig = go.Figure()

fig.update_layout(title='Regression Line Plotted vs Training Points')
# draw training points
fig.add_trace(go.Scatter(x=train['size'],
                         y=train['price'],
                         mode='markers',
                         name='Points'))

# draw regression line
fig.add_trace(go.Scatter(x=test['x'], y=test['y_pred'], mode='lines', name='Regression Line'))

# Score the predictions against the targets of the same rows. Comparing with
# train['price'] only lines up when no split is made; with a real hold-out
# the two inputs have different lengths and mean_squared_error raises.
MSE = mean_squared_error(test['y_test'], test['y_pred'])
RMS = MSE ** 0.5
print('MSE', MSE, '\nRMS', RMS)

fig.show()

## Add More Features, Multiple Linear Regression

In [None]:
# Multiple linear regression on the amenity counts plus size.
mult_regressor = LinearRegression()
features = ['num_appliances', 'num_utils', 'num_extras', 'size']
if TEST_SIZE:
    # Use the configured hold-out size rather than a hard-coded 0.15, so that
    # changing TEST_SIZE actually changes the split.
    X_train, X_test, y_train, y_test = train_test_split(
        train.loc[:, features],
        train.loc[:, ['price']],
        test_size=TEST_SIZE,
        random_state=123
    )
else:
    # No hold-out requested: fit and evaluate on the same rows.
    X_train, y_train = train.loc[:, features], train.loc[:, ['price']]
    X_test, y_test = X_train, y_train

mult_regressor.fit(X_train, y_train)

# Fitted intercept, then one coefficient per entry of `features`.
print(mult_regressor.intercept_)
print(mult_regressor.coef_)

In [None]:
# Flatten the model output into a plain 1-D list of predictions.
y_pred = list(mult_regressor.predict(X_test).ravel())

# Evaluation rows side by side: size, actual price, predicted price.
mult_test = pd.DataFrame({
    'x': X_test['size'],
    'y_test': y_test['price'],
    'y_pred': y_pred,
})

fig = go.Figure()
fig.update_layout(title='Multiple Regression Predictions (Red) Plotted vs Ground Truths (Blue)')

# Actual prices against size.
fig.add_trace(
    go.Scatter(x=X_test['size'], y=y_test['price'], mode='markers', name='ground_truths')
)

# Predicted prices against the same sizes.
fig.add_trace(
    go.Scatter(x=X_test['size'], y=y_pred, mode='markers', name='predictions')
)

fig.show()

# Error of the predictions on the evaluation rows.
MSE = mean_squared_error(y_test, y_pred)
RMS = MSE ** (1 / 2)
print('MSE', MSE, '\nRMS', RMS)