# Regression Week 2: Multiple Regression (Interpretation)

In [1]:
import graphlab as gl

### Load in house sales data

In [3]:
# Load the house sales data into an SFrame from the 'kc_house_data.gl/' directory
# (a relative path, so it is resolved against the current working directory).
sales = gl.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1486894331.log


This non-commercial license of GraphLab Create for academic use is assigned to sridharreddyyedla@gmail.com and will expire on January 29, 2018.


In [4]:
# Split the sales data into train/test pieces; 0.8 is the fraction passed to
# random_split and the fixed seed keeps the split reproducible across runs.
train_data, test_data = sales.random_split(0.8, seed=0)

### Learning a multiple regression model using GraphLab's built-in functions

In [6]:
# Baseline model: linear regression of price on three raw features,
# fitted on the training split with no validation set supplied.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = gl.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None,
)

In [7]:
# Pull the fitted coefficient table out of the model and print it
# (one row per term, including the intercept).
example_weight_summary = example_model.get("coefficients")
print(example_weight_summary)

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 87910.0724924  |  7873.3381434 |
| sqft_living |  None | 315.403440552  | 3.45570032585 |
|   bedrooms  |  None | -65080.2155528 | 2717.45685442 |
|  bathrooms  |  None | 6944.02019265  | 3923.11493144 |
+-------------+-------+----------------+---------------+
[4 rows x 4 columns]



In [8]:
# Predict prices for every training row, then show the first prediction
# as a quick sanity check.
example_predictions = example_model.predict(train_data)
print(example_predictions[0])

271789.505878


### Build a function to compute RSS

In [40]:
def get_residual_sum_of_squares(model,data,outcome):
    """Return the residual sum of squares (RSS) of `model` on `data`.

    Parameters
    ----------
    model : object exposing ``predict(data)``.
    data : rows to predict on; passed straight through to ``model.predict``.
    outcome : observed target values, aligned element-wise with the
        predictions. Must support element-wise ``-`` and ``*`` and ``.sum()``.

    Returns
    -------
    The sum over all rows of ``(outcome - prediction) ** 2``.
    """
    # Residual = observed value minus the model's prediction, per row.
    errors = outcome - model.predict(data)
    # Square element-wise, then collapse to a single number.
    squared_errors = errors * errors
    return squared_errors.sum()

### Computing RSS on test data

In [10]:
# RSS of the baseline model on the held-out test split.
rss_example_test = get_residual_sum_of_squares(
    example_model,
    test_data,
    test_data['price'],
)
print(rss_example_test)

2.7376153833e+14


# Create some new features

In [11]:
# using log transformation
from math import log

Create 4 new features in both the TRAIN and TEST data:
* bedrooms_squared = bedrooms*bedrooms
* bed_bath_rooms = bedrooms*bathrooms
* log_sqft_living = log(sqft_living)
* lat_plus_long = lat + long

In [12]:
# bedrooms_squared = bedrooms * bedrooms, computed with element-wise column
# multiplication (the same form used for bed_bath_rooms) instead of calling a
# Python lambda once per row.
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
test_data['bedrooms_squared'] = test_data['bedrooms'] * test_data['bedrooms']

In [20]:
# Interaction feature: element-wise product of the bedrooms and bathrooms
# columns, added to both splits so the models can be scored on test data too.
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']

In [22]:
# Natural log of living area, applied row by row with math.log
# (imported earlier in the notebook), for both splits.
train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda sqft: log(sqft))
test_data['log_sqft_living'] = test_data['sqft_living'].apply(lambda sqft: log(sqft))

In [25]:
# Element-wise sum of the lat and long columns, added to both splits.
train_data['lat_plus_long'] = train_data['lat'] + train_data['long']
test_data['lat_plus_long'] = test_data['lat'] + test_data['long']

#### What is the mean (arithmetic average) value of each of the 4 new features on TEST data?

In [27]:
# Mean of each engineered feature on the test split, printed one per line
# in the order the features were created.
print(test_data['bedrooms_squared'].mean())
print(test_data['bed_bath_rooms'].mean())
print(test_data['log_sqft_living'].mean())
print(test_data['lat_plus_long'].mean())

12.4466777016
7.50390163159
7.55027467965
-74.6533349722


## Learning multiple models

In [28]:
# Nested feature sets: each list extends the previous one, so model 2 is
# model 1 plus one feature and model 3 is model 2 plus three more.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = model_1_features + ['bed_bath_rooms']
model_3_features = model_2_features + [
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]

In [29]:
# Fit model 1 on the training split using model_1_features; no validation set,
# and verbose=False is passed to create().
model_1 = gl.linear_regression.create(
    train_data,
    target='price',
    features=model_1_features,
    validation_set=None,
    verbose=False,
)

In [30]:
# Fit model 2 on the training split using model_2_features; no validation set,
# and verbose=False is passed to create().
model_2 = gl.linear_regression.create(
    train_data,
    target='price',
    features=model_2_features,
    validation_set=None,
    verbose=False,
)

In [31]:
# Fit model 3 on the training split using model_3_features; no validation set,
# and verbose=False is passed to create().
model_3 = gl.linear_regression.create(
    train_data,
    target='price',
    features=model_3_features,
    validation_set=None,
    verbose=False,
)

### Comparing coefficients

In [39]:
# Show model_1's fitted coefficient table; as the cell's last expression it is
# rendered as the cell output.
model_1.get('coefficients')

name,index,value,stderr
(intercept),,-56140675.7444,1649985.42028
sqft_living,,310.263325778,3.18882960408
bedrooms,,-59577.1160682,2487.27977322
bathrooms,,13811.8405418,3593.54213297
lat,,629865.789485,13120.7100323
long,,-214790.285186,13284.2851607


In [38]:
# Show model_2's fitted coefficient table; as the cell's last expression it is
# rendered as the cell output.
model_2.get('coefficients')

name,index,value,stderr
(intercept),,-54410676.1152,1650405.16541
sqft_living,,304.449298057,3.20217535637
bedrooms,,-116366.043231,4805.54966546
bathrooms,,-77972.3305135,7565.05991091
lat,,625433.834953,13058.3530972
long,,-203958.60296,13268.1283711
bed_bath_rooms,,26961.6249092,1956.36561555


### Comparing RSS of 3 models on training data

In [41]:
# Training-set RSS for model 1.
model_1_RSS = get_residual_sum_of_squares(
    model_1,
    train_data,
    train_data['price'],
)
print("model 1 RSS " + str(model_1_RSS))

model 1 RSS 9.71328233544e+14


In [42]:
# Training-set RSS for model 2.
model_2_RSS = get_residual_sum_of_squares(
    model_2,
    train_data,
    train_data['price'],
)
print("model 2 RSS " + str(model_2_RSS))

model 2 RSS 9.61592067856e+14


In [43]:
# Training-set RSS for model 3.
model_3_RSS = get_residual_sum_of_squares(
    model_3,
    train_data,
    train_data['price'],
)
print("model 3 RSS " + str(model_3_RSS))

model 3 RSS 9.05276314555e+14


### Comparing RSS of 3 models on test data

In [44]:
# Test-set RSS for model 1.
# NOTE: this rebinds model_1_RSS, replacing the training-set value that was
# stored under the same name earlier in the notebook.
model_1_RSS = get_residual_sum_of_squares(
    model_1,
    test_data,
    test_data['price'],
)
print("model 1 RSS " + str(model_1_RSS))

model 1 RSS 2.26568089093e+14


In [46]:
# Test-set RSS for model 2.
# NOTE: this rebinds model_2_RSS, replacing the training-set value that was
# stored under the same name earlier in the notebook.
model_2_RSS = get_residual_sum_of_squares(
    model_2,
    test_data,
    test_data['price'],
)
print("model 2 RSS " + str(model_2_RSS))

model 2 RSS 2.24368799994e+14


In [47]:
# Test-set RSS for model 3.
# NOTE: this rebinds model_3_RSS, replacing the training-set value that was
# stored under the same name earlier in the notebook.
model_3_RSS = get_residual_sum_of_squares(
    model_3,
    test_data,
    test_data['price'],
)
print("model 3 RSS " + str(model_3_RSS))

model 3 RSS 2.51829318952e+14
