In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the California housing CSV from the Kaggle input directory into `df`.
df = pd.read_csv('../input/california-housing-prices/housing.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summarise columns: dtype and non-null count per column.
df.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
df['ocean_proximity'].value_counts()

# Using OneHotEncoder to deal with Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Leave the encoder's output format at its default and densify explicitly.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing either spelling ties the notebook to a version range.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded = ohe.fit_transform(df[['ocean_proximity']])
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()

# Name each indicator column after its category (instead of 0..n-1) and carry
# over the source index. String names matter downstream: recent scikit-learn
# refuses to fit on a frame whose column names mix str and int.
df_ohe = pd.DataFrame(encoded, columns=ohe.categories_[0], index=df.index)

Give the one-hot-encoded frame the same index as the original DataFrame so that the two line up row for row when concatenated.

In [None]:
# Give the encoded frame the same row labels as `df` so the column-wise concat
# below pairs rows one-to-one.
df_ohe.index = df.index

# Replace the raw categorical column with its encoded indicator columns.
df_num = df.drop(columns='ocean_proximity')
df2 = pd.concat((df_num, df_ohe), axis='columns')
df2.head()

# One hot encoded

In [None]:
# Discard every row containing at least one missing value, then show the
# per-column null counts for the result.
data = df2.dropna(how='any')
data.isna().sum()

In [None]:
# (rows, columns) remaining after dropping rows with missing values.
data.shape

# Plots

In [None]:
# Pairwise correlation between all columns of the cleaned frame.
data.corr()

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Annotated heatmap of the correlation matrix, with the column names as tick labels.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    data.corr(),
    annot=True,
    annot_kws={'size': 9},
    xticklabels=data.columns,
    yticklabels=data.columns,
    ax=ax,
)


In [None]:
# One histogram per column of the cleaned frame.
data.hist(figsize=(20, 20))

Use `median_house_value` as the target and all remaining columns as features.

In [None]:
# Target: the median house value. Features: every other column.
y = data['median_house_value']
X = data.drop('median_house_value', axis=1)

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
train_x, text_x, train_y, test_y = train_test_split(X, y, random_state=0, test_size=0.25)

In [None]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=0)

model.fit(train_x, train_y)

In [None]:
predic_y = model.predict(text_x)

In [None]:
from sklearn.metrics import mean_absolute_error as mae
# Mean absolute error of the baseline decision tree on the held-out targets.
print(mae(test_y, predic_y))

# get_mae

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def get_mae(max_leaf_nodes, train_x, test_x, train_y, test_y):
    """Fit a decision tree limited to `max_leaf_nodes` leaves and score it.

    The tree is trained on (`train_x`, `train_y`) with a fixed seed
    (random_state=0) and the mean absolute error of its predictions for
    `test_x` against `test_y` is returned.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    predictions = tree.fit(train_x, train_y).predict(test_x)
    return mean_absolute_error(test_y, predictions)

# Re-split with a 20% hold-out; these names are reused by the cells that follow.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the MAE of a DecisionTreeRegressor for several leaf-node limits.
for max_leaf_nodes in (5, 50, 500, 5000):
    op_mae = get_mae(max_leaf_nodes, train_x, test_x, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, op_mae))

Got the least mean absolute error for max_leaf_nodes = 500
# Random Forest
Reusing `train_x`, `test_x`, `train_y` and `test_y` from the `train_test_split` above.<br>


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed, trained and
# scored on the current train/test split.
model2 = RandomForestRegressor(random_state=1).fit(train_x, train_y)
preds_y2 = model2.predict(test_x)
print(mean_absolute_error(test_y, preds_y2))

In [None]:
def get_mae_rf(n_estimators, train_x, test_x, train_y, test_y):
    """Fit a random forest with `n_estimators` trees and score it.

    The forest is trained on (`train_x`, `train_y`) with a fixed seed
    (random_state=1) and the mean absolute error of its predictions for
    `test_x` against `test_y` is returned.
    """
    forest = RandomForestRegressor(random_state=1, n_estimators=n_estimators)
    predictions = forest.fit(train_x, train_y).predict(test_x)
    return mean_absolute_error(test_y, predictions)

# 80/20 split with a fixed shuffle, stored under its own names for this sweep.
train_x5, test_x5, train_y5, test_y5 = train_test_split(X, y, random_state=0, test_size=0.2)

# getting mae for different number of estimators in a RandomForestRegressor
for n_estimators in [5, 50, 500]:
    op_mae = get_mae_rf(n_estimators, train_x5, test_x5, train_y5, test_y5)
    # The swept value is the number of trees, so label it as n_estimators
    # (the label used to read "Max leaf nodes", which this loop does not vary).
    print("n_estimators: %d  \t\t Mean Absolute Error:  %d" % (n_estimators, op_mae))

So far the RandomForestRegressor performs much better than the decision trees and LinearRegression: it achieves a lower MAE. The best result came at around 500 estimators, and going up to 5000 made little difference. The figures below were recorded from a run that also tried 5000 estimators; the value varied is `n_estimators` (the number of trees), not the maximum number of leaf nodes.<br>
n_estimators: 5  		 Mean Absolute Error:  36111<br>
n_estimators: 50  		 Mean Absolute Error:  32317<br>
n_estimators: 500  		 Mean Absolute Error:  31866<br>
n_estimators: 5000  		 Mean Absolute Error:  31841<br>


In [None]:
from sklearn.metrics import r2_score
# R^2 score of the default random forest's predictions (preds_y2) on the test targets.
print(r2_score(test_y, preds_y2))

The random forest achieved an even lower MAE than the decision trees.
# Using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, fitted on the current training split.
model3 = LinearRegression()
model3.fit(train_x, train_y)

In [None]:
# Linear-regression predictions for the held-out features.
preds_y3 = model3.predict(test_x)

In [None]:
# Mean absolute error of the linear model on the held-out targets.
print(mean_absolute_error(test_y, preds_y3))

In [None]:
# Mean of the true test targets, for comparison with the mean prediction.
test_y.mean()

In [None]:
# Mean of the linear model's predictions on the test split.
preds_y3.mean()

# Geo Plots
The dataset description says this data comes from the 1990 California census. Nothing on the maps should look odd, since housing locations change little over such a period, but keep in mind that the plots reflect the 1990 census and some things may have changed since then.

In [None]:
import geopandas as gpd

In [None]:
# Build one point geometry per row from the longitude/latitude columns.
# - A copy of `df` is passed so that attaching the geometry column cannot leak
#   back into `df` (NOTE(review): some pandas/geopandas combinations share the
#   underlying data with the input frame; the copy makes this safe either way).
# - The CRS is given as the authority string 'EPSG:4326' in the constructor;
#   the {'init': 'epsg:4326'} dict form is deprecated and emits a FutureWarning.
data_geodf = gpd.GeoDataFrame(
    df.copy(),
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs='EPSG:4326',
)

data_geodf.head()

A map showing the state of California

In [None]:
import folium
from folium import Choropleth, Circle, Marker

# Centre point used for the maps in this section.
latitude, longitude = 36.7783, -119.4179

# Base map centred on that point.
m1 = folium.Map(zoom_start=5, location=[latitude, longitude])
m1

Plotting the houses near the bay

In [None]:
# Keep only the rows whose ocean_proximity category is 'NEAR BAY'.
is_near_bay = data_geodf['ocean_proximity'] == 'NEAR BAY'
nearbay_df = data_geodf.loc[is_near_bay]

In [None]:
# Preview the NEAR BAY subset.
nearbay_df.head()

In [None]:
# Map centre for this cell.
latitude, longitude = 36.7783, -119.4179

# One plain marker per NEAR BAY row.
m2 = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=7)
for lat, lon in zip(nearbay_df['latitude'], nearbay_df['longitude']):
    Marker([lat, lon]).add_to(m2)
m2

A better plot using a marker cluster, since there are a lot of points even though we are only plotting the NEAR BAY ones.

In [None]:
import math
from folium.plugins import MarkerCluster

In [None]:
m3 = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=5)

# Collect the markers in a cluster layer.
mc = MarkerCluster()

for lat, lon in zip(nearbay_df['latitude'], nearbay_df['longitude']):
    # Skip rows with a missing coordinate.
    if math.isnan(lon) or math.isnan(lat):
        continue
    mc.add_child(Marker([lat, lon]))
m3.add_child(mc)














<1H OCEAN     9136<br>
INLAND        6551<br>
NEAR OCEAN    2658<br>
NEAR BAY      2290<br>
ISLAND           5<br>
Name: ocean_proximity, dtype: int64

In [None]:
# Per-category mean of every numeric column. numeric_only=True keeps this
# working even if `df` carries a non-numeric column besides the grouping key
# (recent pandas raises instead of silently dropping such columns).
df.groupby('ocean_proximity').mean(numeric_only=True)

The mean of median_house_value for the ISLAND category is quite high compared to the others; let's have a look at where those rows are located, based on their latitude and longitude.

In [None]:
# Keep only the rows whose ocean_proximity category is 'ISLAND'.
is_island = data_geodf['ocean_proximity'] == 'ISLAND'
island_df = data_geodf.loc[is_island]

# One marker per ISLAND row.
m4 = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=4)
for lat, lon in zip(island_df['latitude'], island_df['longitude']):
    Marker([lat, lon]).add_to(m4)
m4
