In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the housing CSV from the Kaggle input directory into the working frame `df`.
df = pd.read_csv('../input/california-housing-prices/housing.csv')

In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
from pandas_profiling import ProfileReport as pp
# Build a profiling report for the freshly loaded frame.
profile = pp(
    df,
    title="California House ",
    html={"style": {"full_width": True}},
    sort="None",
)
# Render the report through the notebook widgets interface ...
profile.to_widgets()
# ... and also as the embedded HTML report (the cell's final expression).
profile

In [None]:
# Column dtypes and non-null counts, to spot missing values and non-numeric columns.
df.info()

In [None]:
import matplotlib.pyplot as plt
# Histogram grid (50 bins per column) to eyeball each column's distribution.
df.hist(bins=50, figsize=(15, 12))
plt.show()

We see that most of the features are right-skewed (long tails toward large values). We will try to reduce that skew.

In [None]:
# Bucket median_income into five ordered bands (labels 1-5) and plot the band counts.
df['income_cat'] = pd.cut(
    df['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)
df['income_cat'].hist()

In [None]:
# Distribution of median_income before any outlier trimming.
sns.distplot(df['median_income'])

In [None]:
# Distribution of the target, median_house_value, before trimming its upper tail.
sns.distplot(df['median_house_value'])

In [None]:
# Keep only the rows whose median_house_value lies strictly below its 96th percentile.
q = df["median_house_value"].quantile(0.96)
df = df.loc[df["median_house_value"].lt(q)]

In [None]:
# Distribution of housing_median_age on the trimmed frame.
sns.distplot(df['housing_median_age'])

In [None]:
# Left: distribution of sqrt(total_rooms). Right: box plot of the untransformed column.
f, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.distplot(np.sqrt(df['total_rooms']), ax=axes[0])
# Pass the column as `x=`: seaborn 0.12+ reads the first positional argument as
# `data`, which changes the plot's orientation instead of drawing a horizontal box.
sns.boxplot(x=df['total_rooms'], ax=axes[1])

In [None]:
# Square-root transform total_rooms, then drop the rows at or above its 98th percentile.
# `assign` returns a new frame, so this does not write into the filtered slice that
# `df` currently is (which raises pandas' SettingWithCopyWarning).
# Run once only: re-running this cell applies the square root a second time.
df = df.assign(total_rooms=np.sqrt(df['total_rooms']))
q = df["total_rooms"].quantile(0.98)
df = df[df["total_rooms"] < q]

In [None]:
# Left: distribution of sqrt(total_bedrooms). Right: box plot of the untransformed column.
f, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.distplot(np.sqrt(df['total_bedrooms']), ax=axes[0])
# Pass the column as `x=`: seaborn 0.12+ reads the first positional argument as
# `data`, which changes the plot's orientation instead of drawing a horizontal box.
sns.boxplot(x=df['total_bedrooms'], ax=axes[1])

In [None]:
# Square-root transform total_bedrooms, then drop the rows at or above its 95th percentile.
# `assign` returns a new frame, so this does not write into the filtered slice that
# `df` currently is (which raises pandas' SettingWithCopyWarning).
# Run once only: re-running this cell applies the square root a second time.
df = df.assign(total_bedrooms=np.sqrt(df['total_bedrooms']))
q = df["total_bedrooms"].quantile(0.95)
# NOTE(review): rows with a missing total_bedrooms fail the `<` comparison and are
# dropped here as well — confirm that implicit removal is intended.
df = df[df["total_bedrooms"] < q]

In [None]:
# Re-check the median_income distribution on the rows that survived the earlier filters.
sns.distplot(df['median_income'])

In [None]:
# Keep only the rows whose median_income lies strictly below its 98th percentile.
q = df["median_income"].quantile(0.98)
df = df.loc[df["median_income"].lt(q)]

In [None]:
# Left: distribution of sqrt(population). Right: box plot of the untransformed column.
f, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.distplot(np.sqrt(df['population']), ax=axes[0])
# Pass the column as `x=`: seaborn 0.12+ reads the first positional argument as
# `data`, which changes the plot's orientation instead of drawing a horizontal box.
sns.boxplot(x=df['population'], ax=axes[1])

In [None]:
# Square-root transform population, then drop the rows at or above its 98th percentile.
# `assign` returns a new frame, so this does not write into the filtered slice that
# `df` currently is (which raises pandas' SettingWithCopyWarning).
# Run once only: re-running this cell applies the square root a second time.
df = df.assign(population=np.sqrt(df['population']))
q = df["population"].quantile(0.98)
df = df[df["population"] < q]

In [None]:
# Left: distribution of sqrt(households). Right: box plot of the untransformed column.
f, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.distplot(np.sqrt(df['households']), ax=axes[0])
# Pass the column as `x=`: seaborn 0.12+ reads the first positional argument as
# `data`, which changes the plot's orientation instead of drawing a horizontal box.
sns.boxplot(x=df['households'], ax=axes[1])

In [None]:
# Square-root transform households, then drop the rows at or above its 98th percentile.
# `assign` returns a new frame, so this does not write into the filtered slice that
# `df` currently is (which raises pandas' SettingWithCopyWarning).
# Run once only: re-running this cell applies the square root a second time.
df = df.assign(households=np.sqrt(df['households']))
q = df["households"].quantile(0.98)
df = df[df["households"] < q]

In [None]:
# Geographic scatter: marker size tracks district population, colour tracks
# median_house_value.
# `population` was square-root transformed earlier in the notebook, so square it
# back before scaling. Dividing the transformed values by 100 shrinks every marker
# to a tiny fraction of its intended size and hides the size encoding.
df.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
        s=df['population'] ** 2 / 100, label='population',
        figsize=(12, 8), c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.title("Population Density")
plt.legend()
plt.show()

In [None]:
# Category balance of ocean_proximity: counts as text, then as a bar chart.
print(df['ocean_proximity'].value_counts())
# Pass the column as `x=`: seaborn 0.12+ reads the first positional argument as
# `data`, so the positional form no longer produces the per-category count plot.
sns.countplot(x=df['ocean_proximity'])

FEATURE ENGINEERING

In [None]:
# One-hot encode ocean_proximity. drop_first=True omits the first category's column,
# so that category is represented by all remaining indicator columns being 0.
ocean_ohe = pd.get_dummies(df['ocean_proximity'],drop_first=True)
ocean_ohe

In [None]:
# Attach the indicator columns to the cleaned frame, side by side, as the modelling table.
data = pd.concat([df, ocean_ohe], axis="columns")
data.head()

In [None]:
# Drop the coordinates and the raw categorical column (now covered by the one-hot
# indicators). Name the axis through `columns=`: the positional form
# `drop(labels, 1)` is deprecated and raises TypeError from pandas 2.0 onwards.
data = data.drop(columns=['longitude', 'latitude', 'ocean_proximity'])
data.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`
# (colour bar hidden because every cell already shows its value).
plt.figure(figsize=(12,12))
sns.heatmap(data.corr(),annot=True,cbar=False)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the continuous columns (zero mean, unit variance).
# Fit one scaler on all of them at once: StandardScaler works column by column, so
# the resulting values match a per-column fit, but `std` now keeps every column's
# mean/scale (e.g. for inverse_transform) instead of only the last column's.
# NOTE(review): the scaler is fit on all rows before the train/test split, and the
# target (median_house_value) is scaled too, so the RMSE reported later is in
# standardized units — confirm both are intended.
std = StandardScaler()
col = ['housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
data[col] = std.fit_transform(data[col])

In [None]:
# Ratio features must come from the unscaled counts. The columns in `data` have
# already been standardized to z-scores centred on 0, so dividing them gives
# arbitrary signs and near-division-by-zero blow-ups, not "rooms per household".
# `df` has the same rows and index as `data` and still holds the square-root
# transformed counts, so squaring the quotient recovers the ratio in original units.
data["rooms_per_household"] = (df["total_rooms"] / df["households"]) ** 2
data["bedrooms_per_room"] = (df["total_bedrooms"] / df["total_rooms"]) ** 2
data["population_per_household"] = (df["population"] / df["households"]) ** 2

In [None]:
# Inspect the modelling table after the ratio features were added.
data.head()

In [None]:
# income_cat comes out of pd.cut as a categorical; store it as plain integers for the models.
data['income_cat'] = data['income_cat'].astype(int)

In [None]:
# Separate the feature matrix from the target. Name the axis through `columns=`:
# the positional form `drop(labels, 1)` is deprecated and raises TypeError from
# pandas 2.0 onwards.
X = data.drop(columns=['median_house_value'])
y = data['median_house_value']

In [None]:
# Sanity check: feature matrix and target must have the same number of rows.
print(X.shape, y.shape, sep="\n")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split

def training_model(model):
    """Fit ``model`` on a fixed 80/20 split of the notebook-level ``X``/``y`` and print its RMSE.

    The split always uses ``random_state=42``, so every model passed in is trained
    and scored on the same rows. RMSE is reported in the units ``y`` carries.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training portion.

    Returns
    -------
    None
        The train and test RMSE are printed, each under its own label, so the gap
        between them (over- or under-fitting) can be read directly.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    # Label the two figures: printed under the same "RMSE" tag they were indistinguishable.
    print("Train RMSE ==> ", train_rmse)
    print("Test RMSE ==> ", test_rmse)
        
        
    
    

In [None]:
# LINEAR REGRESSION
# Baseline: a linear model with default settings, fit and scored by training_model().
from sklearn.linear_model import LinearRegression
ln = LinearRegression()
training_model(ln)

In [None]:
# RANDOM FOREST REGRESSOR
# A small forest (5 trees) with a fixed seed, fit and scored by training_model().
from sklearn.ensemble import RandomForestRegressor
algo_reg = RandomForestRegressor(n_estimators=5, random_state=42)
training_model(algo_reg)

The Random Forest is overfitting (its training error is much lower than its test error), so its hyperparameters need tuning.

In [None]:
## ADABOOST REGRESSOR
# AdaBoost with 5 estimators and a fixed seed, fit and scored by training_model().
from sklearn.ensemble import AdaBoostRegressor
algo_ada = AdaBoostRegressor(n_estimators=5, random_state=42)
training_model(algo_ada)

In [None]:
# GRADIENT BOOSTING REGRESSOR
# Gradient boosting with 5 boosting stages and a fixed seed, fit and scored by training_model().
from sklearn.ensemble import GradientBoostingRegressor
algo_gb = GradientBoostingRegressor(n_estimators=5, random_state=42)
training_model(algo_gb)

In [None]:
## DECISION TREE REGRESSOR
# A single decision tree with default settings and a fixed seed, fit and scored by training_model().
from sklearn.tree import DecisionTreeRegressor

algo_dec = DecisionTreeRegressor(random_state=42)
training_model(algo_dec)

In [None]:
# Check the feature dtypes and non-null counts before handing X to the boosting libraries.
X.info()

In [None]:
## XGBOOST
# NOTE(review): XGBRFRegressor is xgboost's random-forest-style estimator; if plain
# gradient boosting was intended here, xgboost.XGBRegressor is the usual class — confirm.
import xgboost
xgb = xgboost.XGBRFRegressor()
training_model(xgb)

In [None]:
## LIGHTGBM
# LightGBM regressor with default settings, fit and scored by training_model().
import lightgbm
lgb = lightgbm.LGBMRegressor()
training_model(lgb)

In [None]:
## CATBOOST
# CatBoost regressor with default hyperparameters, fit and scored by training_model().
# verbose=0 silences CatBoost's per-iteration training log, which would otherwise
# bury the two RMSE lines under hundreds of progress rows.
import catboost
cb = catboost.CatBoostRegressor(verbose=0)
training_model(cb)

In [None]:
## USING NEURAL NETWORK
import tensorflow as tf
from tensorflow import keras
from math import sqrt
from sklearn import preprocessing

# Fully connected regressor: an unregularized 512-unit input layer, five
# L1-regularized hidden layers that narrow from 512 to 32 units, and one linear output.
model = keras.Sequential(
    [
        keras.layers.Dense(512, input_dim=X.shape[1], kernel_initializer="normal", activation="relu"),
        *(
            keras.layers.Dense(
                units,
                kernel_initializer="normal",
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l1(0.1),
            )
            for units in (512, 256, 128, 64, 32)
        ),
        keras.layers.Dense(1, kernel_initializer="normal", activation="linear"),
    ]
)

# Optimize mean absolute error with Adam; the same quantity is also tracked as a metric.
model.compile(
    loss="mean_absolute_error",
    optimizer="adam",
    metrics=["mean_absolute_error"],
)

In [None]:
# Cast the features to float32 before handing them to Keras. The frame mixes float,
# integer and one-hot indicator columns, and when the indicators are boolean (the
# pandas >= 2.0 default for get_dummies) it converts to an object array that
# TensorFlow cannot turn into a tensor.
# NOTE: validation_split holds out the *last* 25% of the rows, taken before shuffling.
history = model.fit(X.astype('float32'), y, epochs=30, validation_split=0.25, shuffle=True)

In [None]:
# Training vs. validation loss per epoch, to see where the two curves diverge.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
## WITH SLIGHTLY MORE HYPERPARAMETER TUNING OF THE CATBOOST MODEL, WE CAN SCORE HIGHER.