In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

In [None]:
# Read the housing CSV (path is relative to the notebook's working directory)
# into the DataFrame `df` that every later cell works from.
df = pd.read_csv(filepath_or_buffer='../input/california-housing-prices/housing.csv')

In [None]:
# Print the (row count, column count) pair, then render the first ten rows
# as the cell's output.
print((len(df.index), len(df.columns)))
df.head(n=10)

In [None]:
# Summary statistics of `df`; the quartiles requested here are the ones
# pandas reports by default, spelled out for the reader.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Grid of 50-bin histograms for `df`, drawn on a 20x15 figure.
df.hist(figsize=(20, 15), bins=50)

In [None]:
# Tally how many rows carry each `ocean_proximity` value and show the tallies
# as a bar chart (bars appear in first-seen order, as Counter preserves it).
labels_count = Counter(df['ocean_proximity'])
names = [*labels_count.keys()]
values = [*labels_count.values()]
plt.bar(x=names, height=values)

In [None]:
# Pairwise scatter-plot matrix over all of `df`'s columns on a 20x20 figure.
# Selecting every column is the same as passing the frame itself.
scatter_matrix(frame=df, figsize=(20, 20))

In [None]:
# Longitude vs. latitude of every row; the low alpha lets overlapping points
# build up into darker regions.
plt.scatter(df["longitude"], df["latitude"], alpha=0.1)

In [None]:
# Same longitude/latitude scatter, with each point coloured by its
# `median_house_value` on the "jet" colormap; the colorbar gives the scale.
cal_map = plt.scatter(
    df["longitude"],
    df["latitude"],
    c=df["median_house_value"],
    cmap="jet",
    alpha=0.4,
)
plt.colorbar(mappable=cal_map)

In [None]:
# Longitude/latitude scatter where colour encodes `median_house_value` and
# marker size encodes `population` (divided by 300 to keep markers readable).
cal_map = plt.scatter(
    df["longitude"],
    df["latitude"],
    s=df["population"] / 300,
    c=df["median_house_value"],
    cmap="jet",
    alpha=0.4,
)
plt.colorbar(mappable=cal_map)

# Data Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Preprocessing chain for numeric columns: fill missing values with the
# column mean (SimpleImputer's default strategy, written out here), then
# standardise each column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler()),
    ]
)

In [None]:
# Column groups for preprocessing: `ocean_proximity` is the only categorical
# feature; every other column except the target `median_house_value` is
# treated as numeric. Index.drop keeps the original column order and, like
# DataFrame.drop, raises KeyError if a named column is missing — without
# copying the whole frame just to read its column names.
cat_attributes = ['ocean_proximity']
num_attributes = df.columns.drop(cat_attributes + ['median_house_value']).tolist()

# Route each column group through its own transformer and concatenate the
# results (numeric block first, then the one-hot block).
pipe = ColumnTransformer([
    ('numerical', num_pipeline, num_attributes),
    # handle_unknown='ignore': a category that was not present when the
    # encoder was fitted is encoded as all zeros instead of raising, so the
    # fitted transformer can be applied to held-out or new rows safely.
    ('categorical', OneHotEncoder(handle_unknown='ignore'), cat_attributes)
])

# Prepare data

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the
# `median_house_value` column on its own.
X = df.drop(columns=['median_house_value'])
y = df.loc[:, 'median_house_value']

In [None]:
# Split the raw rows BEFORE fitting any preprocessing. Fitting `pipe` on all
# of X and splitting afterwards lets the imputer/scaler/encoder learn
# statistics from rows that later end up in the test set (data leakage),
# which makes the test score optimistic. The same test_size/random_state
# select the same rows as before; only the order of fit and split changes.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=.30,
    random_state=1234
)

# Learn the preprocessing parameters from the training rows only, then apply
# that fitted transformer unchanged to the held-out rows.
# NOTE: if the one-hot encoder inside `pipe` uses handle_unknown='error'
# (scikit-learn's default), transform() raises when the test rows contain a
# category that is absent from the training rows.
X_train = pipe.fit_transform(X_train_raw)
X_test = pipe.transform(X_test_raw)

# Kept so that any later cell referring to `X_prepared` still works: the full
# feature matrix, transformed with the training-fitted pipeline.
X_prepared = pipe.transform(X)

# Model training and evaluation

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Baseline model: a 10-tree random forest fitted on the training split and
# scored on the held-out split. (`cls` is a regressor despite its name; the
# name is kept so other cells that reference it keep working.)
cls = RandomForestRegressor(n_estimators=10, random_state=13)
cls.fit(X_train, y_train)
predictions = cls.predict(X_test)

# Mean squared error and its square root (same units as the target).
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Read the hyperparameters back from the estimator rather than hard-coding
# them in the message, so the report cannot drift out of sync when the
# constructor arguments above are changed. Output is identical for the
# current settings.
print("Estimators: {}\tmax depth: {}\tMSE: {}\tRMSE: {}".format(
    cls.n_estimators, cls.max_depth, mse, rmse))