In [20]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [21]:
# fetch data 

house_price_data = pd.read_csv('house_price_subset.csv')

house_price_data.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
3,9550,7,5,1717,1,3,3,Abnorml,140000
4,14260,8,5,2198,2,4,3,Normal,250000


In [22]:
# subset data to only include columns for smokers

filtered_data = house_price_data[house_price_data['SaleCondition'] == 'Normal']

filtered_data.head()

len(filtered_data)

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
4,14260,8,5,2198,2,4,3,Normal,250000
5,14115,5,5,1362,1,1,2,Normal,143000


1198

In [23]:
house_price_data.isna().sum()

LotArea          0
OverallQual      0
OverallCond      0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
GarageCars       0
SaleCondition    0
SalePrice        0
dtype: int64

In [24]:
Y = house_price_data['SalePrice']

In [25]:
X = house_price_data.drop(['SaleCondition', 'SalePrice'], axis=1)

In [26]:
# This is just to show how we can do test train split like we did for excel in class
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 42)

# Mostly we will do one of the following int he course
# # X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25)
# # Pick a random_state as below and keep using the same number (example 35) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=42)

X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
1023,3182,7,5,1504,2,2,2
810,10140,6,6,1309,1,3,2
1384,9060,6,5,1258,1,2,1
626,12342,5,5,1422,1,3,1
813,9750,6,6,1442,1,4,1
...,...,...,...,...,...,...,...
1095,9317,6,5,1314,2,3,2
1130,7804,4,3,1981,2,4,2
1294,8172,5,7,864,1,2,2
860,7642,7,8,1426,1,3,1


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
892,8414,6,8,1068,1,3,1
1105,12256,8,5,2622,2,3,2
413,8960,5,6,1028,1,2,2
522,5000,6,7,1664,2,3,2
1036,12898,9,5,1620,2,2,3
...,...,...,...,...,...,...,...
988,12046,6,6,2030,2,4,2
243,10762,6,6,1217,1,3,1
1342,9375,8,5,2169,2,3,2
1057,29959,7,6,1850,2,3,2


1023    191000
810     181000
1384    105000
626     139900
813     157900
         ...  
1095    176432
1130    135000
1294    115000
860     189950
1126    174000
Name: SalePrice, Length: 1095, dtype: int64

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
988     195000
243     120000
1342    228500
1057    248000
1418    124000
Name: SalePrice, Length: 365, dtype: int64

In [29]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) # When extending to multiple features remove .array.reshape(-1, 1)

# The following gives the R-square score
model.score(X_train, y_train) # When extending to multiple features remove .array.reshape(-1, 1)

# This is the coefficient Beta_1 (or slope of the Simple Linear Regression line)
model.coef_

# This is the coefficient Beta_0
model.intercept_

0.7460238966632944

array([ 7.61037889e-01,  2.60066063e+04,  1.44005633e+03,  4.96953053e+01,
        4.17479701e+03, -8.70312125e+03,  2.07865169e+04])

-88466.30978377056

In [30]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_charges'])
# When extending to multiple features remove .array.reshape(-1, 1)
test_output.head()

Unnamed: 0,pred_charges
892,137423.69
1105,290228.46
413,136454.32
522,187965.16
1036,296418.59


In [31]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_charges'] - test_output['charges']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_charges,SalePrice
892,137423.69,154500
1105,290228.46,325000
413,136454.32,115000
522,187965.16,159000
1036,296418.59,315500


KeyError: 'charges'

#### Visualize data

In [32]:
# define function to import viz libraries
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [33]:
plot_data = []
plot_data.append(go.Scatter(x= X_train, y= y_train, name = 'Train data actual', mode = 'markers'))
plot_data.append(go.Scatter(x= X_train, y= model.predict(X_train.array.reshape(-1, 1)), name = 'Train data predicted', mode = 'markers'))
plot_data.append(go.Scatter(x= X_test, y= y_test, name = 'Test data actual', mode = 'markers'))
plot_data.append(go.Scatter(x= X_test, y= model.predict(X_test.array.reshape(-1, 1)), name = 'Test data predicted', mode = 'markers'))
# When extending to multiple features remove .array.reshape(-1, 1) in above (but remember 2-d we cannot draw)

layout = go.Layout(xaxis = dict(title='bmi'), yaxis = dict(title= 'charges'), 
                   title = 'Plot of predicted and actual')
fig = go.Figure(data= plot_data, layout=layout)
plotly.offline.iplot(fig)


AttributeError: 'DataFrame' object has no attribute 'array'