In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('ca_housing.csv')

# Data Preprocessing

In [3]:
df = df.dropna(how='any',axis=0)

In [4]:
def remove_outliers(df, columns, std_dev_away=3):
    for column in columns:
        upper_limit = df[column].mean() + std_dev_away * df[column].std()
        lower_limit = df[column].mean() - std_dev_away * df[column].std()
        df = df[(df[column] < upper_limit) & (df[column] > lower_limit)]
    return df

In [5]:
df = remove_outliers(df, ['longitude', 'latitude', 'housing_median_age', 
                          'total_rooms', 'total_bedrooms', 'population', 
                          'households', 'median_income', 'median_house_value'])

In [6]:
df['total_rooms'] = np.log(df['total_rooms'] + 1)
df['total_bedrooms'] = np.log(df['total_bedrooms'] + 1)
df['population'] = np.log(df['population'] + 1)
df['households'] = np.log(df['households'] + 1)

In [7]:
df['bedroom_ratio'] = df['total_bedrooms'] / df['total_rooms']
df['household_ratio'] = df['households'] / df['population']
df['room_ratio'] = df['total_rooms'] / df['households']

In [8]:
df = df.join(pd.get_dummies(df.ocean_proximity)).drop(['ocean_proximity'], axis=1)

# Splitting the Data

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(['median_house_value'], axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Training/Testing the Model

In [13]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

In [17]:
regressor.score(X_test, y_test)

0.8007300372593953

# Hyperparamater Tuning

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [2, 4, 6, 8]
}

grid_search = GridSearchCV(regressor, param_grid, cv=5, 
             scoring='neg_mean_squared_error', 
             return_train_score=True)

grid_search.fit(X_train, y_train)

In [21]:
grid_search.best_estimator_.score(X_test, y_test)

0.8146467618611383