In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Apply seaborn's default theme so the matplotlib figures below share one style.
sns.set()
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short pattern "max_columns" matches more than one option in
# recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", None)

In [None]:
# Location of the CSV file that backs the whole notebook.
DATA_PATH = '../input/california-housing-prices-data-extra-features/California_Houses.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
dataset.head(n = 5)

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

In [None]:
# Summary statistics for every column (include = 'all' also covers non-numeric columns, if any).
dataset.describe(include = 'all')

## Start with some EDA 

In [None]:
# Scatter plot of the Median_Income column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Median_Income', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Median Income", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Median Income vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Median_Age column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Median_Age', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Median Age", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Median Age vs. Median House Value", size = 20)
# A single show() is enough; the original cell called it twice.
plt.show()

In [None]:
# Scatter plot of the Tot_Rooms column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Tot_Rooms', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Total Rooms", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Total Rooms vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Tot_Bedrooms column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Tot_Bedrooms', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Total Bedrooms", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Total Bedrooms vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Population column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Population', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Population", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Population vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Households column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Households', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Households", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Households vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Distance_to_coast column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Distance_to_coast', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Distance to Coast", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Distance to Coast vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Distance_to_LA column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Distance_to_LA', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Distance to LA", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Distance to LA vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Distance_to_SanDiego column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Distance_to_SanDiego', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Distance to San Diego", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Distance to San Diego vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Distance_to_SanJose column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Distance_to_SanJose', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Distance to San Jose", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Distance to San Jose vs. Median House Value", size = 20)
plt.show()

In [None]:
# Scatter plot of the Distance_to_SanFrancisco column against Median_House_Value.
ax = dataset.plot.scatter(x = 'Distance_to_SanFrancisco', y = 'Median_House_Value', figsize = (10, 8))
ax.set_xlabel("Distance to San Francisco", size = 15)
ax.set_ylabel("Median House Value", size = 15)
ax.set_title("Distance to San Francisco vs. Median House Value", size = 20)
plt.show()

## Build the Models

### Doing the data preprocessing

In [None]:
# Feature matrix: every column of the dataset except the first one.
feature_columns = dataset.columns[1:]
X = dataset.loc[:, feature_columns]
X.head()

In [None]:
# Target vector: the first column of the dataset.
target_column = dataset.columns[0]
y = dataset[target_column]
y.head()

In [None]:
# Split the data into train and test sets
#
# 80% of the rows go to training and 20% to testing. random_state pins the
# shuffle so a Restart & Run All reproduces the same split and therefore the
# same scores in the cells below.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Perform feature scaling
#
# The scaler is fitted on the training split only and then applied unchanged to
# the test split, so no test-set statistics leak into training.
# The frames are rebuilt rather than assigned through .iloc[:, :]: writing the
# float result into the existing columns in place clashes with integer-typed
# columns and can trigger copy/dtype warnings. Column names and row labels are
# carried over unchanged.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)

In [None]:
# Inspect the first five rows of the scaled training features.
X_train.head(n = 5)

In [None]:
# Convert the targets to plain NumPy arrays.
# np.asarray accepts both a Series and an ndarray, so this cell can be re-run
# safely; calling .to_numpy() a second time would fail because the names would
# already refer to ndarrays.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

### Test some Models

### Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the scaled training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Predict on the test split and print predictions next to the true values
# (left column: predicted, right column: actual).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the linear model on the test split.
r2_score(y_true = y_test, y_pred = y_pred)

### Support Vector Regression

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Only the features are standardised by the preprocessing cells; the target is
# left in its original units. SVR's default C and epsilon are scale-dependent,
# so an unscaled target makes the fit degenerate. TransformedTargetRegressor
# standardises y for fitting and inverts the transform inside predict(), so
# the cells below still receive predictions in the original units.
regressor = TransformedTargetRegressor(
    regressor = SVR(kernel = 'rbf'),
    transformer = StandardScaler(),
)
regressor.fit(X_train, y_train)

In [None]:
# Predict on the test split and print predictions next to the true values
# (left column: predicted, right column: actual).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

In [None]:
# Coefficient of determination of the most recent `y_pred` on the test split.
r2_score(y_true = y_test, y_pred = y_pred)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A forest of 10 trees. random_state pins the randomness used while growing the
# trees, so the reported score is reproducible from run to run.
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_train, y_train)

In [None]:
# Predict on the test split and print predictions next to the true values
# (left column: predicted, right column: actual).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

In [None]:
# Coefficient of determination of the most recent `y_pred` on the test split.
r2_score(y_true = y_test, y_pred = y_pred)

### Artificial Neural Networks

In [None]:
import tensorflow as tf

# Feed-forward network: two hidden ReLU layers of 12 units each, then a single
# output unit with no activation.
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=12, activation='relu'),
    tf.keras.layers.Dense(units=12, activation='relu'),
    tf.keras.layers.Dense(units=1),
])

# Train with Adam, minimising mean squared error.
ann.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
# Keras is given a float32 array rather than the DataFrame itself.
train_features = np.asarray(X_train).astype(np.float32)
ann.fit(train_features, y_train, batch_size = 32, epochs = 100)

In [None]:
# Predict with the network, converting the test features to a float32 array
# exactly as was done for the training input.
y_pred_ann = ann.predict(np.asarray(X_test).astype(np.float32))
# Reshape using this cell's own prediction length; the original relied on
# `y_pred`, a leftover variable from the random-forest cell.
print(np.concatenate((y_pred_ann.reshape(len(y_pred_ann),1), y_test.reshape(len(y_test),1)),1))

In [None]:
# Coefficient of determination of the neural network on the test split.
r2_score(y_true = y_test, y_pred = y_pred_ann)

### XGBoost

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = XGBRegressor().fit(X_train, y_train)
regressor

In [None]:
# Predictions of the XGBoost model on the test features.
y_pred_xgboost = regressor.predict(X_test)

In [None]:
# Coefficient of determination of the XGBoost model on the test split.
r2_score(y_true = y_test, y_pred = y_pred_xgboost)

#### It looks like the best model is XGBoost with an $R^2$ of 0.83, followed closely by the random forest, which has an $R^2$ of 0.80 and a MAPE of 0.177.