In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Loading the dataset

In [None]:
# Read the housing CSV from the Kaggle input directory into a DataFrame.
housing_csv_path = "/kaggle/input/california-housing-prices/housing.csv"
data_df = pd.read_csv(housing_csv_path)

### Viewing the Dataset

In [None]:
# Preview the first five rows (the default for head) as the cell output.
data_df.head(n=5)

### Seeing the Shape

In [None]:
# Print the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = data_df.shape
print((row_count, column_count))

### Information about the dataset

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
data_df.info()

### Visualize the Data

In [None]:
# Draw the DataFrame's histograms on a 15x15 inch figure, 50 bins per histogram,
# then render the figure.
data_df.hist(figsize=(15, 15), bins=50)
plt.show()

### Checking Null Values

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
data_df.isna().sum()

### Handling the Missing Values

In [None]:
import math 
# Median of the "total_bedrooms" column, rounded down to a whole number.
# The value is used as the fill for the column's missing entries.
bedrooms_median_raw = data_df["total_bedrooms"].median()
total_bedrooms_median = math.floor(bedrooms_median_raw)
print(total_bedrooms_median)

In [None]:
# Replace the missing "total_bedrooms" entries with the floored median computed above.
bedrooms_filled = data_df["total_bedrooms"].fillna(total_bedrooms_median)
data_df["total_bedrooms"] = bedrooms_filled

In [None]:
# Re-count the missing entries per column after the fill.
data_df.isna().sum()

### Convert the categorical data of "ocean_proximity" column

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
# Replace the values of "ocean_proximity" with the integer codes LabelEncoder assigns.
# NOTE(review): integer codes imply an ordering between the categories --
# confirm that is acceptable for the models trained later.
l_er = LabelEncoder()
ocean_proximity_codes = l_er.fit_transform(data_df['ocean_proximity'])
data_df['ocean_proximity'] = ocean_proximity_codes

In [None]:
# Print the frame summary again; presumably to check the dtypes after the
# fill and encoding steps above.
data_df.info()

### Viewing the Feature data using a Scatter Plot

In [None]:
# Columns shown in the geographic scatter plot.
X_Features_1 = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income', 'ocean_proximity',
]
viewing = data_df[X_Features_1]

# Longitude/latitude scatter: marker size is population / 100 and marker colour
# is median_house_value (mapped through the "jet" colormap, with a colour bar).
marker_sizes = viewing["population"] / 100
marker_colours = data_df["median_house_value"]
viewing.plot.scatter(
    x="longitude", y="latitude", alpha=0.4,
    s=marker_sizes, label="population",
    c=marker_colours, cmap=plt.get_cmap("jet"), colorbar=True,
    figsize=(20, 10),
)
plt.legend()  # the colour map shows the median house value

### Relationship between Median House Value and Median Income

In [None]:
"""
House value increases with the increasing of the Median Income
"""

# Scatter of median income against median house value; the low alpha makes
# overlapping points show up as darker regions.
data_df.plot.scatter(x='median_income', y='median_house_value',
                     figsize=(8, 5), alpha=0.1)

### Scaling the Column Values

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Get column names first
col_names = data_df.columns

# StandardScaler is useful for the features that follow a Normal distribution.

# Learn the scaling statistics (per-column mean and standard deviation) from the
# training rows only. Fitting on every row would let information from the test
# rows leak into the preprocessing.
# The rows are selected with the same test_size / random_state that the later
# train/test split of X and Y uses. The selection depends only on the number of
# rows and the seed, so it identifies the same training rows.
# Keep the two calls in sync if either value changes.
scaler_fit_rows, _scaler_holdout_rows = train_test_split(
    data_df, test_size=0.25, random_state=1
)

# create the scaler object and fit it on the training rows
scaler = preprocessing.StandardScaler()
scaler.fit(scaler_fit_rows)

# Apply the learned transform to every row; keep the original column names and
# row index so the scaled frame lines up with data_df.
data_df_scaled = pd.DataFrame(
    scaler.transform(data_df), columns=col_names, index=data_df.index
)

In [None]:
# Preview the first five rows of the scaled frame.
data_df_scaled.head(n=5)

### Extract Features and Label Data

In [None]:
# Predictor columns fed to the model.
X_Features = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income', 'ocean_proximity',
]

# Feature matrix: the selected columns of the scaled frame.
X = data_df_scaled.loc[:, X_Features]
X.head(n=5)

In [None]:
# Print the feature matrix dimensions as a (rows, columns) tuple.
feature_rows, feature_cols = X.shape
print((feature_rows, feature_cols))

In [None]:
# Target vector: the 'median_house_value' column of the scaled frame.
Y = data_df_scaled.loc[:, 'median_house_value']
Y.head(n=5)

In [None]:
# Print the target's shape, a one-element tuple holding the row count.
print((len(Y),))

### Split the dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, Y, random_state=1, test_size=0.25)
x_train, x_test, y_train, y_test = split_parts

# Report the shape of each part.
print("x_train ", x_train.shape, ", y_train ", y_train.shape)
print("x_test ", x_test.shape, ", y_test ", y_test.shape)

### Train the Model

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with trees limited to depth 5.
# fit() returns the estimator itself, so construction and fitting can be chained.
rfr_model = RandomForestRegressor(random_state=0, max_depth=5).fit(x_train, y_train)

# For a regressor, score() is the R^2 of the predictions on the given data.
acc_rfr_model = rfr_model.score(x_test, y_test)
print(acc_rfr_model)

#### Finding the best value for max_depth

In [None]:
# Search max_depth over 5..24 and keep the depth with the best validation score.
#
# The candidates are compared on a validation split carved out of the training
# data, not on the test set. Picking a hyper-parameter by its test score makes
# the test score optimistically biased. The test set stays untouched here, so
# it can give an honest estimate for the final model.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=1
)

# Maps each candidate max_depth to its score (R^2) on the validation split.
dict_rfr_model_acc = {}
for depth in range(5, 25):
    candidate_model = RandomForestRegressor(max_depth=depth, random_state=0)
    candidate_model.fit(x_fit, y_fit)
    dict_rfr_model_acc[depth] = candidate_model.score(x_val, y_val)

import operator
# Pick the (depth, score) pair with the highest score and keep its depth.
max_accuracy_key = max(dict_rfr_model_acc.items(), key=operator.itemgetter(1))[0]
print(max_accuracy_key)

#### Train the model with the max_depth value that achieved the highest score

In [None]:
# Refit the forest on the training data using the depth found by the search
# (max_accuracy_key). A hard-coded number would silently go stale if the data
# or the search range changed.
best_max_depth = max_accuracy_key
rfr_model = RandomForestRegressor(max_depth=best_max_depth, random_state=0)
rfr_model.fit(x_train, y_train)

# score() on a regressor is the R^2 of the predictions on the held-out test rows.
acc_rfr_model = rfr_model.score(x_test, y_test)
print(acc_rfr_model)