In [227]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('darkgrid')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


## (1) Info of Data

In [228]:
df = pd.read_csv("housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [229]:
df.shape

(20640, 10)

Above shown the data size: we have 20640 data points and 10 features in the form of column

In [230]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Above shown the information about data set like number of values it contain and data type here all column contain 20640 values except total_bedrooms column which contain 20433 values (it contain null values). All values contain float64 datatypes except ocean_proximity that is object type

In [231]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

Above shown the different features of our dataset are:

Longitude
Latitude
Median Age of the House
Total Rooms in the block
Total Bedrooms in the block
Population of the block
Number of Households
Ocean Proximity

## (2) Compute the Correlation between each feature and the target median_house_value

In [232]:
df.corr()['median_house_value']

longitude            -0.045967
latitude             -0.144160
housing_median_age    0.105623
total_rooms           0.134153
total_bedrooms        0.049686
population           -0.024650
households            0.065843
median_income         0.688075
median_house_value    1.000000
Name: median_house_value, dtype: float64

Above shown the correlation between each feature and the target median_house_value.
And we found out that median_income has the strongest correlation to target median_house_value.

## (3) Actions to do to prepare the data set for leaning algorithm

1. Format data to make it consistent
2. Reduce data
3. Complete data cleaning
4. Create new features out of existing ones

## (5) Extract the extra three features and prove the correlation

In [233]:
df['avgRooms'] = df['total_rooms'] / df['households']
df['avgBedrooms'] = df['total_bedrooms'] / df['households']
df['pop_per_household'] = df['population'] / df['households']

Above we add three extra features to the data set.
Number of Rooms per House
Number of Bedrooms per House
Number of people per Household

In [234]:
df.median_house_value.corr(df.avgRooms)

0.1519482897414577

In [235]:
df.median_house_value.corr(df.avgBedrooms)

-0.04673948984474207

In [236]:
df.median_house_value.corr(df.pop_per_household)

-0.023737412956134345

## (4) Clean amd Filter the Data: (Drop duplicates and fill up null)

In [237]:
df = df.drop_duplicates() 
df.duplicated().values.any()

False

In [238]:
df = df.fillna(method="ffill")
pd.isnull(df).any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
avgRooms              False
avgBedrooms           False
pop_per_household     False
dtype: bool

## Train and Test data split

In [239]:
dum = pd.get_dummies(df.ocean_proximity)

In [240]:
merged_df = pd.concat([df, dum], axis = 'columns')
merged_df = merged_df.drop(['ocean_proximity', 'ISLAND'], axis= 'columns')
merged_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,avgRooms,avgBedrooms,pop_per_household,<1H OCEAN,INLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,1.02381,2.555556,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,0.97188,2.109842,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,1.073446,2.80226,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,1.073059,2.547945,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,1.081081,2.181467,0,0,1,0


In [241]:
prices = merged_df['median_house_value'].values.reshape(-1,1)
features = merged_df.drop('median_house_value', axis = 1)

In [242]:
x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2)

## (6) Linear Regression

In [243]:
regr = LinearRegression()
regr.fit(x_train, y_train)


LinearRegression()

In [244]:
my_train_predict = regr.predict(x_train)
my_test_predict = regr.predict(x_test)

In [245]:
r1 = regr.score(x_train, y_train)
print("R square for train:", r1)

R square for train: 0.6491784865203749


In [246]:
r2 = regr.score(x_test, y_test)
print("R square for test:", r2)

R square for test: 0.6419971579389958


In [247]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_train, my_train_predict)
rmse = np.sqrt(mse)
print("RMSE for train:",rmse)

RMSE for train: 68038.23990034299


In [248]:
mse2 = mean_squared_error(y_test, my_test_predict)
rmse2 = np.sqrt(mse2)
print("RMSE for test:",rmse2)

RMSE for test: 70273.31293041215


## (7) Grid Search & Randomized Search

In [249]:
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

param_grid = {'alpha': sp_rand()}
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(x_train, y_train)
print("lambda:",rsearch.best_estimator_.alpha)

lambda: 0.02469150343115767


In [259]:
rr = Ridge(alpha = rsearch.best_estimator_.alpha)
result1 = rr.fit(x_train, y_train)
r_predict = result1.predict(x_test)
print("RMSE for randomized test",np.sqrt(mean_squared_error(y_test, r_predict)))

RMSE for randomized test 70273.70387095469


In [251]:
# Grid Search for Algorithm Tuning
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# load the diabetes datasets
dataset = datasets.load_diabetes()
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(dataset.data, dataset.target)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])})
0.4823231384163484
0.0001


In [260]:
model = Ridge(alpha = grid.best_estimator_.alpha)
result2 = model.fit(x_train, y_train)
result_predict = result2.predict(x_test)

print("RMSE for grid test",np.sqrt(mean_squared_error(y_test, result_predict)))

RMSE for grid test 70273.31445377544


From above randomized Search and Grid Search, I found both of those model perform worse than linear regression. It might because that random search will yield high variance during conputing and grid search create subsamples of the data repeatedly.

## (8) DecisionTreeRegressor

In [253]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

DecisionTreeRegressor()

In [254]:
pred_tree = tree_reg.predict(x_train)
mse_tree = mean_squared_error(y_train, pred_tree)
rmse_tree = np.sqrt(mse_tree)
print("RSME for decision tree", rmse_tree)

RSME for decision tree 0.0


In [255]:
pred_tree2 = tree_reg.predict(x_test)
mse_tree2 = mean_squared_error(y_test, pred_tree2)
rmse_tree2 = np.sqrt(mse_tree2)
print("RSME for decision tree", rmse_tree2)

RSME for decision tree 71942.86984244449


From above, I found that DecisionTreeRegressor yield a higher RSME than linear regression, meaning the decision Tree model performs worse than Linear Regression model.