In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import timeit

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load housing.csv into dataframe object
housing = load_data("dataset/housing.csv")

#have a look at the data
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
#create an index on the data
housing = housing.reset_index()

#have a look at the data
housing.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
#create a column income_cat for better dividing the data into training and test set
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

#have a look at the data
housing.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5.0
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5.0
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5.0
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4.0
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3.0


In [6]:
# Split data into training/test set using StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 3, test_size = 0.2, random_state = 42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [7]:
#drop income_cat from training and test set
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

In [8]:
#drop median_house_value from training set
housing = strat_train_set.drop("median_house_value",axis=1)

#store median_house_value in housing_labels
housing_labels = strat_train_set["median_house_value"].copy()

In [9]:
#store housing data in housing_num excluding ocean_proximity column as it's values are not integer
housing_num = housing.drop("ocean_proximity",axis=1)

In [10]:
#Fill missing values with median of the data
imputer = Imputer(strategy = "median")
#Fit to data, then transform it
imputer.fit(housing_num)
X = imputer.transform(housing_num)

#have a look at the transformed array
X

array([[ 7.1750e+03, -1.1819e+02,  3.4050e+01, ...,  2.6000e+02,
         2.4375e+00,  2.0000e+00],
       [ 1.4713e+04, -1.1704e+02,  3.2800e+01, ...,  3.5000e+02,
         5.7416e+00,  4.0000e+00],
       [ 9.7300e+03, -1.2174e+02,  3.6790e+01, ...,  6.1100e+02,
         4.3814e+00,  3.0000e+00],
       ...,
       [ 1.6034e+04, -1.2246e+02,  3.7720e+01, ...,  3.9900e+02,
         3.3208e+00,  3.0000e+00],
       [ 8.6230e+03, -1.1838e+02,  3.3880e+01, ...,  2.7000e+02,
         4.8611e+00,  4.0000e+00],
       [ 1.4072e+04, -1.1711e+02,  3.2760e+01, ...,  5.1800e+02,
         2.2409e+00,  2.0000e+00]])

In [11]:
#Convert transformed array X into pandas dataframe
housing_tr = pd.DataFrame(X, columns = housing_num.columns)

In [12]:
#use encoding to convert ocean_proximity data into numerical values
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)

In [13]:
#Convert encoded data into pandas dataframe
housing_tr_1 = pd.DataFrame(housing_cat_encoded, columns = ["ocean_proximity"])

In [14]:
#Scaling the features
scalar = StandardScaler()
scalar.fit(housing_tr)
housing_tr = scalar.transform(housing_tr)

#Convert into dataframe
housing_tr = pd.DataFrame(housing_tr,columns=housing_num.columns)
#have a look at the data
housing_tr.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
0,-0.527959,0.684384,-0.738029,1.464498,-0.626342,-0.650005,-0.204092,-0.625833,-0.753784,-0.954456
1,0.739448,1.258295,-1.323678,-0.2884,-0.059509,-0.456875,-0.314209,-0.390613,0.985693,0.942051
2,-0.098373,-1.087254,0.545714,-1.005495,0.556133,0.198812,0.325518,0.291527,0.269602,-0.006202
3,0.160388,0.779204,-0.859844,-0.846141,0.481538,0.375251,0.343871,0.398683,-0.113819,-0.006202
4,0.071108,0.824119,-0.948863,0.667726,-0.251063,-0.063463,-0.658543,-0.28607,0.650443,0.942051


In [15]:
#add ocean_proximity data into housing_tr
housing_tr = housing_tr.join(housing_tr_1, lsuffix='_housing_tr', rsuffix='_housing_tr_1')

#have a look at the data
housing_tr.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat,ocean_proximity
0,-0.527959,0.684384,-0.738029,1.464498,-0.626342,-0.650005,-0.204092,-0.625833,-0.753784,-0.954456,0
1,0.739448,1.258295,-1.323678,-0.2884,-0.059509,-0.456875,-0.314209,-0.390613,0.985693,0.942051,0
2,-0.098373,-1.087254,0.545714,-1.005495,0.556133,0.198812,0.325518,0.291527,0.269602,-0.006202,0
3,0.160388,0.779204,-0.859844,-0.846141,0.481538,0.375251,0.343871,0.398683,-0.113819,-0.006202,0
4,0.071108,0.824119,-0.948863,0.667726,-0.251063,-0.063463,-0.658543,-0.28607,0.650443,0.942051,0


In [16]:
#get size of the dataset
data_size = housing_tr['latitude'].size
#create a matrix of 1's with size as data_size
ones_matrix = np.ones(data_size)
ones_matrix_1 = ones_matrix.reshape((data_size,1))

In [17]:
#Convert dataframe into matrix
housing_matrix = housing_tr.as_matrix()

#Create data matrix(X_matrix) by appending matrix of 1's
X_matrix = np.append(ones_matrix_1,housing_matrix,axis=1)

#Create label matrix(Y_matrix) from housing_labels
Y_matrix = housing_labels.as_matrix()

In [18]:
#store current time int start_time
start_time = timeit.default_timer()

#Ridge Regression
theta = np.ones(X_matrix.shape[1])
m = len(Y_matrix)
alpha = 0.02
regC = 0.2
num_iter = 3000

for i in range(num_iter):
    cost_elem = np.dot(X_matrix, theta) - Y_matrix
    derivative = np.dot(cost_elem, X_matrix)  
    derivative = derivative + (regC * theta)
    theta = theta - 2 * ((alpha / m) * derivative)
    
#Find training time
end_time = timeit.default_timer()
training_time = end_time - start_time

print("Theta: ", theta)
print("\nTraining Time: ", training_time, "seconds")

Theta:  [ 2.06732415e+05  3.33237617e+03 -8.18105560e+04 -8.68226312e+04
  1.56142948e+04 -1.93670666e+04  3.68937333e+04 -4.30326461e+04
  3.04863137e+04  6.57940884e+04  1.20815240e+04  8.60304606e+01]

Training Time:  1.6476080020001973 seconds


In [19]:
#Calculate error as difference of actual label(Y_matrix) and predicted label(Y_predict)
Y_predict = np.dot(X_matrix, theta)
error = Y_predict - Y_matrix
#Calculate mean square error
mean_squared_error = np.dot(error.transpose(), error) / data_size
print("Mean Squared Error: ", mean_squared_error)
#Calculate root_mean_square_error
root_mean_squared_error = np.sqrt(mean_squared_error)
print("Root Mean Squared Error: ", root_mean_squared_error)

Mean Squared Error:  4815099696.484726
Root Mean Squared Error:  69390.91940942075
