In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

%matplotlib inline

In [None]:
# Load the housing table from the Kaggle input mount.
housing_csv_path = "/kaggle/input/california-housing-prices/housing.csv"
df = pd.read_csv(housing_csv_path)

# Basic Data Exploration

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Distribution of the target column. Keep the returned Axes so the figure can
# be titled and labelled; the trailing ';' suppresses the repr in the output.
target_ax = df['median_house_value'].hist(bins=40)
target_ax.set(
    title='Distribution of median_house_value',
    xlabel='median_house_value',
    ylabel='count',
);

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Grid of 50-bin histograms drawn by DataFrame.hist on a 20x18 inch canvas.
ax = df.hist(bins=50, figsize=(20, 18))

In [None]:
# Pairwise scatter plots across the frame; low alpha keeps dense regions readable.
ax = pd.plotting.scatter_matrix(frame=df, figsize=(20, 20), alpha=0.2)

# Creating New Features 

In [None]:
# Ratio of `households` to `total_rooms`.
# NOTE(review): the column name says "people" but the numerator is `households` - confirm intent.
df['peopleperrooms'] = df['households'].div(df['total_rooms'])

In [None]:
# Element-wise product of the two coordinate columns.
df['latlon'] = df['longitude'].mul(df['latitude'])

In [None]:
# Ratio of `total_rooms` to `total_bedrooms`.
df['roomsperbedrooms'] = df['total_rooms'].div(df['total_bedrooms'])

In [None]:
# Ratio of `households` to `total_bedrooms`.
# NOTE(review): the column name says "people" but the numerator is `households` - confirm intent.
df['peopleperbedrooms'] = df['households'].div(df['total_bedrooms'])

In [None]:
# Ratio of `housing_median_age` to `households`.
# NOTE(review): the "pop" suffix suggests a population divisor, but the code divides by `households` - confirm intent.
df['ageperpop'] = df['housing_median_age'].div(df['households'])

In [None]:
# Ratio of `housing_median_age` to `total_bedrooms`.
df['ageperbeds'] = df['housing_median_age'].div(df['total_bedrooms'])

In [None]:
# Column labels after feature engineering, as a plain Python list.
df.columns.tolist()

In [None]:
# First five rows, now including the engineered ratio columns.
df.head(n=5)

In [None]:
# Target plus the engineered columns, plotted pairwise against each other.
engineered_view = [
    'median_house_value',
    'peopleperrooms',
    'latlon',
    'roomsperbedrooms',
    'peopleperbedrooms',
    'ageperpop',
    'ageperbeds',
]
ax = pd.plotting.scatter_matrix(df[engineered_view], figsize=(32, 28))

## Remove null data

In [None]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis='columns')]

In [None]:
# Keep only the rows where every column is populated.
df = df.dropna(axis=0, how='any')

# Create Clusters from Latitude and Longitude

In [None]:
# Raw geographic spread of the rows. Uses the explicit Axes interface so the
# plot can be labelled; the trailing ';' suppresses the repr in the output.
fig, ax = plt.subplots()
ax.scatter(df['longitude'], df['latitude'], marker=".")
ax.set(xlabel='longitude', ylabel='latitude', title='Row locations');

In [None]:
# Cluster on the two coordinate columns only (longitude first, latitude second).
X = df[['longitude', 'latitude']]
# n_init is pinned explicitly: the library default changed between scikit-learn
# releases, so leaving it implicit makes the clusters depend on the installed version.
# random_state fixes the centroid initialisation so cluster labels are repeatable.
means = KMeans(n_clusters=60, n_init=10, random_state=0).fit(X)

In [None]:
# First five fitted centroids (columns follow the order KMeans was fitted on).
means.cluster_centers_[:5]

In [None]:
# Overlay the fitted KMeans centroids on the raw coordinates.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df['longitude'], df['latitude'], marker=".", label='rows of df')
centroids = means.cluster_centers_

# Column 0 is longitude and column 1 is latitude, matching the column order
# of the frame KMeans was fitted on.
ax.scatter(centroids[:, 0], centroids[:, 1], color="black", marker=",", label='KMeans centroids')
ax.set(xlabel='longitude', ylabel='latitude', title='KMeans centroids over row locations')
ax.legend();

In [None]:
# Label every row with the id of its nearest fitted centroid.
cluster_labels = means.predict(X)
df['geo_cluster'] = cluster_labels

In [None]:
# Mean target value per geographic cluster (first few clusters by label).
cluster_mean_value = df['median_house_value'].groupby(df['geo_cluster']).mean()
cluster_mean_value.head(n=5)

In [None]:
# Same per-cluster means, sorted ascending so the lowest-valued clusters come first.
mean_by_cluster = df.groupby("geo_cluster")['median_house_value'].mean()
mean_by_cluster.sort_values(ascending=True).head(n=5)

- Let's order the clusters by their mean house value and use each cluster's rank as a feature.

In [None]:
# Map each raw cluster label to its rank by mean house value (rank 0 = lowest mean).
# NOTE(review): these means are computed over every row of df, before the
# train/test split further down, so the derived feature carries target
# information from rows that later end up in the test set.
cluster_means = df.groupby("geo_cluster")['median_house_value'].mean()
index = cluster_means.sort_values().index
# Size the rank range from the clusters actually present instead of repeating
# the literal cluster count; a mismatch would make zip() silently drop clusters.
new_index = range(len(index))
dict_index = dict(zip(index, new_index))
dict_index

In [None]:
# Replace each raw cluster label with its value-based rank from dict_index.
df['geo_cluster_ordered'] = df['geo_cluster'].map(lambda cluster_id: dict_index[cluster_id])

In [None]:
# First five rows, now including the cluster label and its rank.
df.head(n=5)

# Transform ocean_proximity into a numerical feature

In [None]:
# Mean target value for each ocean_proximity category.
df['median_house_value'].groupby(df['ocean_proximity']).mean()

In [None]:
# Lookup table: ocean_proximity category -> mean median_house_value of that category.
values = df['median_house_value'].groupby(df['ocean_proximity']).mean()
new_values = {
    category: mean_value
    for category, mean_value in zip(values.index, values.to_numpy())
}
new_values

In [None]:
# Numeric stand-in for ocean_proximity: each row gets its category's mean target value.
df['n_ocean_proximity'] = df['ocean_proximity'].map(lambda category: new_values[category])

In [None]:
# Target against the two encoded categorical features.
encoded_view = ['median_house_value', 'n_ocean_proximity', 'geo_cluster_ordered']
ax = pd.plotting.scatter_matrix(df[encoded_view], figsize=(40, 36), alpha=0.8)

# Split Dataset Into Training and Test

In [None]:
# Target vector, then the feature matrix: everything except the target, the raw
# category string, the latlon product and the unordered cluster label.
y = df['median_house_value']
excluded_columns = ['median_house_value', 'ocean_proximity', 'latlon', 'geo_cluster']
X = df.drop(excluded_columns, axis='columns')

In [None]:
# Hold out 20% of the rows for evaluation. random_state is fixed (same seed as
# the KMeans cell) so the split, and therefore the reported metrics, are
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Prediction

In [None]:
# Fit a gradient-boosted regressor with library-default hyperparameters.
# random_state is fixed so repeated runs build the same ensemble.
model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

In [None]:
# Predict on the held-out rows; y_pred is compared against y_test in the cells below.
y_pred = model.predict(X_test)

# Results

In [None]:
# Root-mean-squared error on the test split.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(rmse)

In [None]:
# Coefficient of determination on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)