# California Housing Price Prediction
#### Author: Sayorn Chin
#### Date: 2021-07-19

### Objective

The project aims at building a model of housing prices to predict median house values in California using the provided dataset. This model should learn from the data and be able to predict the median housing price in any district, given all the other metrics.



### Import required libraries

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Data handling

In [145]:
## Assign the data frame as housing
housing = pd.read_excel('/Users/schinlfc/data-science-Python/california_housing_price_prediction/data/housing.xlsx')

In [146]:
## View the first 5 rows
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [147]:
## Get the number of rows and columns
housing.shape

(20640, 10)

In [148]:
## Get data types for each column
housing.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

In [149]:
## Get basic summary statistics for float and integer column types
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [150]:
## Check for the sum of missing values for each column
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [151]:
## Fill the missing values of the 'total_bedrooms' column with its mean value
housing['total_bedrooms'].fillna((housing['total_bedrooms'].mean()), inplace=True)

In [152]:
## Check whether the missing values are filled
housing.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [153]:
## Convert categorical column 'ocean_proximity' in the dataset to numerical data
housing = pd.get_dummies(housing, columns=['ocean_proximity'])

In [154]:
## Get name of all columns
print(housing.columns)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype='object')


In [155]:
## Rename columns

# Strip any white space
housing = housing.rename(columns=lambda x: x.strip())

# Define a dictionary of columns we want to rename
col_map = {'ocean_proximity_<1H OCEAN': '1h_ocean', 
           'ocean_proximity_INLAND': 'inland',
           'ocean_proximity_ISLAND': 'island',
           'ocean_proximity_NEAR BAY': 'near_bay',
           'ocean_proximity_NEAR OCEAN': 'near_ocean'}
# Rename columns with inplace=True
housing.rename(columns=col_map, inplace=True)

In [156]:
## Check to see if columns are successfully renamed
print(housing.columns)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', '1h_ocean', 'inland', 'island', 'near_bay',
       'near_ocean'],
      dtype='object')


In [157]:
## Check the data type of each column
housing.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
1h_ocean                uint8
inland                  uint8
island                  uint8
near_bay                uint8
near_ocean              uint8
dtype: object

### Modeling

In [158]:
## Split the data into 80% training dataset and 20% test dataset
housing_train, housing_test = train_test_split(housing, test_size=0.2, random_state=100)

In [159]:
## Check to see if the if the split is correct
print(f"The number of rows of the train dataset are: {housing_train.shape[0]}\nThe number of rows of the test dataset are: {housing_test.shape[0]}")

The number of rows of the train dataset are: 16512
The number of rows of the test dataset are: 4128


In [160]:
## Standardize training and test datasets

# Note: Standardize features by removing the mean and scaling to unit variance
# The standard score of a sample x is calculated as:
# z = (x - u) / s
# where u is the mean of the training samples or zero if with_mean=False, 
# and s is the standard deviation of the training samples or one if with_std=False.

# Define a scaler object
scaler = StandardScaler()

In [161]:
# Standardize training dataset
scaler.fit(housing_train)
housing_train_standardized = scaler.transform(housing_train)
print(housing_train_standardized)

[[-1.55444193  1.63677864 -1.24008206 ... -0.01556621 -0.35650943
  -0.3900092 ]
 [-0.62304675 -0.14864428 -1.39861471 ... -0.01556621 -0.35650943
   2.56404211]
 [-1.36019374  2.2677527  -0.84375045 ... -0.01556621 -0.35650943
  -0.3900092 ]
 ...
 [ 1.17997494 -0.77961835 -2.03274529 ... -0.01556621 -0.35650943
  -0.3900092 ]
 [ 1.24472434 -1.35450583  0.50377704 ... -0.01556621 -0.35650943
   2.56404211]
 [ 0.63707615 -0.88244375  1.53423923 ... -0.01556621 -0.35650943
  -0.3900092 ]]


In [162]:
# Standardize testing dataset
housing_test_standardized = scaler.fit_transform(housing_test)
print(housing_test_standardized)

[[ 0.72117114 -0.84730103  0.59221256 ... -0.01556621 -0.34011127
  -0.36188245]
 [-1.37861235  1.04325901  1.87604277 ... -0.01556621  2.94021425
  -0.36188245]
 [ 0.20755504 -0.11182131 -1.25329337 ... -0.01556621 -0.34011127
  -0.36188245]
 ...
 [ 1.59733979 -0.86144487 -1.81496909 ... -0.01556621 -0.34011127
  -0.36188245]
 [-0.66357816  0.89239138  1.87604277 ... -0.01556621 -0.34011127
  -0.36188245]
 [ 1.06358188 -0.78601106  0.51197317 ... -0.01556621 -0.34011127
  -0.36188245]]
