In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read and understand the data

In [None]:
# Load the raw housing table from the Kaggle input directory.
housing = pd.read_csv(
    "../input/california-housing-prices/housing.csv"
)

In [None]:
# Preview the first rows of the raw table.
housing.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
housing.describe()

In [None]:
# Column dtypes and non-null counts; useful for spotting missing values.
housing.info()

In [None]:
# How many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

In [None]:
# One histogram per numeric column, drawn on a single 20x20 inch figure.
housing.hist(figsize=(20,20))

In [None]:
# Pairwise correlation of the numeric columns.
# `ocean_proximity` is a string column; since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns silently and raises instead, so restrict the
# frame to numeric dtypes explicitly (this also works on older pandas).
housing.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix.
# Only numeric columns are correlated: `ocean_proximity` holds strings and
# makes `DataFrame.corr()` raise on pandas >= 2.0.
plt.figure(figsize=(15,15))
sns.heatmap(housing.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Discard every row that has at least one missing value.
housing = housing.dropna(axis=0, how="any")

In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise against each other.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))

# Geo plots

In [None]:
import geopandas as gpd

# Build a GeoDataFrame with one point geometry per row from longitude/latitude.
# - A copy of `housing` is passed so the added geometry column can never leak
#   back into `housing`, which later cells feed to scikit-learn.
# - The CRS is given as the authority string "EPSG:4326"; the old
#   {'init': 'epsg:4326'} dict syntax is deprecated and emits a FutureWarning.
data_geodf = gpd.GeoDataFrame(
    housing.copy(),
    geometry=gpd.points_from_xy(housing.longitude, housing.latitude),
    crs="EPSG:4326",
)

data_geodf.head()

In [None]:
# Split the geo frame by ocean_proximity category for the map cells below.
nearbay_df = data_geodf.loc[data_geodf['ocean_proximity'] == 'NEAR BAY']
inland_df = data_geodf.loc[data_geodf['ocean_proximity'] == 'INLAND']
nearocean_df = data_geodf.loc[data_geodf['ocean_proximity'] == 'NEAR OCEAN']
# The label previously carried a trailing space ('ISLAND '), unlike the other
# labels, so the comparison never matched and island_df was always empty.
island_df = data_geodf.loc[data_geodf['ocean_proximity'] == 'ISLAND']

In [None]:
import folium
from folium import Choropleth, Circle, Marker

# Initial map centre as (latitude, longitude); reused by the later map cells.
latitude = 36.7783
longitude = -119.4179

map2 = folium.Map(
    location=[latitude, longitude],
    tiles='cartodbpositron',
    zoom_start=7,
)
# Drop one marker per NEAR BAY row.
for lat, lon in zip(nearbay_df['latitude'], nearbay_df['longitude']):
    Marker([lat, lon]).add_to(map2)
map2

In [None]:

map3 = folium.Map(
    location=[latitude, longitude],
    tiles='cartodbpositron',
    zoom_start=5,
)
# Drop one marker per NEAR OCEAN row.
for lat, lon in zip(nearocean_df['latitude'], nearocean_df['longitude']):
    Marker([lat, lon]).add_to(map3)
map3

In [None]:
import math
from folium.plugins import MarkerCluster

map4 = folium.Map(
    location=[latitude, longitude],
    tiles='cartodbpositron',
    zoom_start=5,
)

# Collect the INLAND markers in a cluster layer instead of adding them one by
# one to the map.
mc = MarkerCluster()

for lat, lon in zip(inland_df['latitude'], inland_df['longitude']):
    # Skip rows with a missing coordinate.
    if math.isnan(lon) or math.isnan(lat):
        continue
    mc.add_child(Marker([lat, lon]))
map4.add_child(mc)

In [None]:
# Per-category mean of every remaining column, grouped by ocean_proximity.
housing.groupby('ocean_proximity').mean()


# Visualize Population

In [None]:
# Geographic scatter plot: marker size scales with population (divided by 100)
# and marker colour encodes median_house_value on the "jet" colormap.
ax = housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    figsize=(10, 7),
    alpha=0.4,
    s=housing["population"] / 100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    label="population",
    sharex=False,
)
plt.legend()

# Using one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the single categorical column.
# - The encoder is left at its default sparse output and densified with
#   .toarray(): the `sparse=` keyword was renamed to `sparse_output` in
#   scikit-learn 1.2 and later removed, so passing either name breaks on some
#   versions.
# - Columns are labelled with the category names rather than 0..4 so that the
#   combined feature frame has only string column names; scikit-learn rejects
#   frames that mix string and integer column names.
# - The index is aligned with `housing` so the frame can be concatenated
#   column-wise with the numeric columns.
ohe = OneHotEncoder(handle_unknown = 'ignore')
ocean_proximity_encoded = ohe.fit_transform(housing[['ocean_proximity']]).toarray()
df_ohe = pd.DataFrame(
    ocean_proximity_encoded,
    columns=ohe.categories_[0],
    index=housing.index,
)

In [None]:
# Align the encoded frame with the original rows, then swap the categorical
# column for its one-hot columns.
df_ohe.index = housing.index
df_num = housing.drop(columns=['ocean_proximity'])
df2 = pd.concat(objs=[df_num, df_ohe], axis="columns")
df2.head()

In [None]:
# Target vector and feature matrix.
y = df2["median_house_value"]
X = df2.drop("median_house_value", axis=1)

In [None]:
# Preview the feature matrix (target column removed).
X.head()

# ML algorithms

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# A fixed random_state makes the split (and every metric computed below)
# reproducible across "Restart & Run All"; 42 matches the seed used for the
# decision tree later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression baseline fitted on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Linear-model predictions for the held-out test split.
y_pred=model.predict(x_test)

In [None]:
# Mean of the true test targets, for comparison with the mean prediction.
y_test.mean()

In [None]:
# Mean of the predicted test targets.
y_pred.mean()

# RMSE (on the training set)

In [None]:
from sklearn.metrics import mean_squared_error

# Error of the linear model on the data it was trained on. This is a fit
# error, not an estimate of generalisation performance.
housing_predictions = model.predict(X=x_train)
lin_mse = mean_squared_error(y_true=y_train, y_pred=housing_predictions)
# Take the square root so the error is in the same units as the target.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

# MAE

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the training split.
lin_mae = mean_absolute_error(y_true=y_train, y_pred=housing_predictions)
lin_mae

# Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor; the fixed seed keeps the fitted tree reproducible.
model2 = DecisionTreeRegressor(random_state=42)
model2.fit(X=x_train, y=y_train)

# RMSE for tree (on the training set)

In [None]:
# RMSE of the decision tree on the training split (the data it was fitted on).
housing_predictions2 = model2.predict(X=x_train)
tree_mse = mean_squared_error(y_true=y_train, y_pred=housing_predictions2)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# SVR

In [None]:
from sklearn.svm import SVR

# Support-vector regressor with a linear kernel, fitted and scored on the
# training split.
model3 = SVR(kernel="linear")
model3.fit(X=x_train, y=y_train)

housing_predictions3 = model3.predict(X=x_train)
svm_mse = mean_squared_error(y_true=y_train, y_pred=housing_predictions3)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

# Cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the training split.
# The scorer returns negated MSE, so flip the sign before taking the root.
lin_scores = cross_val_score(
    estimator=model,
    X=x_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))

In [None]:
def display_scores(lin_scores):
    """Print a set of cross-validation scores with their mean and spread.

    Parameters
    ----------
    lin_scores : numpy.ndarray
        Scores to summarise. Despite the name (kept so existing calls keep
        working), any object exposing ``mean()`` and ``std()`` is accepted,
        not only the linear model's scores.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # The body used to read an undefined global `scores`, which raised a
    # NameError on a fresh kernel; it now uses the argument it was given.
    print("Scores:", lin_scores)
    print("Mean:", lin_scores.mean())
    print("Standard deviation:", lin_scores.std())

# Summarise the per-fold RMSE values of the linear model.
display_scores(lin_rmse_scores)

In [None]:
# 10-fold cross-validation of the decision tree on the training split.
# The estimator must be `model2` (the DecisionTreeRegressor); this cell used to
# pass `model`, which re-scored the linear regression and reported its numbers
# as the tree's.
tree_scores = cross_val_score(model2, x_train, y_train,
                              scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

# I got some help from this Notebook:
https://www.kaggle.com/aditya26sg/california-house-price-prediction