## California Housing Price
- predict median price per district
- model: regression/labeled supervised learning
- dataset: https://github.com/ageron/handson-ml2/tree/master/datasets/housing

### 1. Read Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

In [None]:
# Load the housing data and rescale the target column by 1/1000.
housing_csv = "../input/california-housing-prices/housing.csv"
df = pd.read_csv(housing_csv)
df['median_house_value'] = df['median_house_value'] / 1000
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()
# total_bedrooms: 207 of 20640 values are missing

In [None]:
# Drop every row that contains a missing value, then re-count the
# missing values per column to check the result.
df=df.dropna()
df.isna().sum()

### 2. Data Exploration

In [None]:
# Show per-column summary statistics to look for outliers.
df.describe()

#### Takeaways:
- Compared with the 75th percentile, the max values of `total_rooms`, `population`, and `households` look unusually large and need a further check.
- The target column `median_house_value` also contains abnormal values.

In [None]:
# Keep only the districts that fall under every outlier cutoff.
# median_house_value was divided by 1000 when the data was loaded, so its
# cutoff of 500 is expressed in thousands.
within_cutoffs = (
    (df.total_rooms <= 5000)
    & (df.total_bedrooms <= 1000)
    & (df.population <= 2500)
    & (df.households <= 1000)
    & (df.median_income <= 8)
    & (df.median_house_value < 500)
)
df = df[within_cutoffs]

# Uncomment any line below to inspect that column's distribution after filtering.
#df.median_house_value.hist(bins=100)
#df.total_bedrooms.hist(bins=100)
#df.median_income.hist(bins=100)
#df.total_rooms.hist(bins=100)
#df.population.hist(bins=100)
#df.households.hist(bins=100)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column except the categorical `ocean_proximity`.
# NOTE(review): the label `median_house_value` is standardized as well, and
# the scaler is fitted on the full frame before the train/test split, so the
# reported RMSE is in standardized units and the test rows influence the
# scaling statistics -- confirm this is intended.
numeric_df = df.loc[:, df.columns != 'ocean_proximity']

scaler = StandardScaler()
# Take the column labels from the frame that was actually scaled. Slicing
# df.columns positionally only works while `ocean_proximity` is the last
# column and silently mislabels the columns otherwise.
scaled_df = pd.DataFrame(scaler.fit_transform(numeric_df),
                         columns=numeric_df.columns)
scaled_df.head()

In [None]:
# Draw one 100-bin histogram per standardized column to inspect the distributions.
scaled_df.hist(bins=100,figsize=(15,10))
plt.show()

### 3. Feature Engineering

In [None]:
# Collects the tf.feature_column definitions that are later handed to the
# DenseFeatures layer.
feature_columns = []

In [None]:
#location
# Bucket width shared by both coordinates.
# NOTE(review): the boundaries are derived from scaled_df, so despite its name
# this width is applied to standardized values rather than degrees -- confirm.
resolution_in_degrees = 0.4

def _bucket_boundaries(values, step):
  """Return boundaries from int(min(values)) up to, not including, int(max(values))."""
  lower = int(min(values))
  upper = int(max(values))
  return list(np.arange(lower, upper, step))

# Bucketize latitude.
latitude_num = tf.feature_column.numeric_column("latitude")
latitude_bins = _bucket_boundaries(scaled_df['latitude'], resolution_in_degrees)
latitude = tf.feature_column.bucketized_column(latitude_num, latitude_bins)

# Bucketize longitude the same way.
longitude_num = tf.feature_column.numeric_column("longitude")
longitude_bins = _bucket_boundaries(scaled_df['longitude'], resolution_in_degrees)
longitude = tf.feature_column.bucketized_column(longitude_num, longitude_bins)

# Cross the two bucketized coordinates into 100 hash buckets and expose the
# cross to the model as an indicator column.
lat_x_lon = tf.feature_column.crossed_column(
    [latitude, longitude], hash_bucket_size=100)
crossed_feature = tf.feature_column.indicator_column(lat_x_lon)
feature_columns.append(crossed_feature)

In [None]:
#demographic
# Plain numeric columns, registered in the same order they are defined.
med_income = tf.feature_column.numeric_column("median_income")
population = tf.feature_column.numeric_column("population")
households = tf.feature_column.numeric_column("households")

feature_columns.extend([med_income, population, households])

In [None]:
#house
# Plain numeric columns, registered in the same order they are defined.
house_age = tf.feature_column.numeric_column("housing_median_age")
ttl_room = tf.feature_column.numeric_column("total_rooms")
ttl_bedroom = tf.feature_column.numeric_column("total_bedrooms")

feature_columns.extend([house_age, ttl_room, ttl_bedroom])

In [None]:
# Keras layer built from the collected feature columns; it is passed to
# build_model, which adds it as the model's first layer.
feature_layer = layers.DenseFeatures(feature_columns)

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 22% of the standardized rows for testing, with a fixed seed so the
# split is repeatable.
train_df, test_df = train_test_split(scaled_df, random_state=123, test_size=0.22)

split_sizes = (len(df), len(train_df), len(test_df))
print("Total df size: %i\n train_df size: %i \n test_df size: %i" % split_sizes)

### 4. Modeling

### 4.1 Define functions that build and train a model
- build_model(my_learning_rate, feature_layer), which builds a randomly-initialized model.
- train_model(model, df, epochs, batch_size, label_name), which trains the model on the examples in `df`, using the `label_name` column as the label.

In [None]:
#Define the functions that build and train a model
def build_model(my_learning_rate, feature_layer):
  """Create and compile a single-unit linear regression model.

  Args:
    my_learning_rate: Learning rate handed to the RMSprop optimizer.
    feature_layer: Keras layer that turns the input feature dict into a dense
      tensor; it becomes the first layer of the model.

  Returns:
    A compiled tf.keras Sequential model that minimizes mean squared error
    and reports root mean squared error as a metric.
  """
  model = tf.keras.models.Sequential()

  # Topography: the feature layer followed by one linear output node.
  model.add(feature_layer)
  # No input_shape here: it only applies to the first layer of a Sequential
  # model, and (1,) would not match the width of the feature layer's output.
  model.add(tf.keras.layers.Dense(units=1))

  # `learning_rate` is the supported argument name; the old `lr` alias is
  # deprecated and rejected by newer Keras releases.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model

#Train the model by feeding feature and label.
def train_model(model, df, epochs, batch_size, label_name):
  """Fit `model` on the columns of `df` and return its per-epoch RMSE.

  Args:
    model: Object exposing a Keras-style `fit` method.
    df: DataFrame holding the feature columns and the label column.
    epochs: Number of epochs to train for.
    batch_size: Number of examples per batch.
    label_name: Name of the column in `df` to use as the label.

  Returns:
    A tuple `(epochs, rmse)`: the epoch list stored on the fit history, and
    the "root_mean_squared_error" series recorded for each epoch.
  """
  # One array per column; the label column is taken out of the feature dict.
  columns = {col: np.array(series) for col, series in df.items()}
  target = np.array(columns.pop(label_name))

  fit_result = model.fit(x=columns, y=target,
                         epochs=epochs, batch_size=batch_size,
                         shuffle=True)

  # The epoch list is stored separately from the per-epoch metric values.
  rmse_per_epoch = pd.DataFrame(fit_result.history)["root_mean_squared_error"]

  return fit_result.epoch, rmse_per_epoch

### 4.2 Define plotting functions
- a scatter plot of the trained model against sampled training examples
- a loss curve

In [None]:
#Plot the trained model against up to 200 random training examples.
def plot_the_model(trained_weight, trained_bias, feature, label, examples=None):
  """Scatter sampled examples and overlay the fitted line.

  Args:
    trained_weight: Slope of the fitted line.
    trained_bias: Intercept of the fitted line.
    feature: Column name plotted on the x axis.
    label: Column name plotted on the y axis.
    examples: DataFrame to sample the points from. Defaults to the
      notebook-level training frame `train_df`.
  """
  if examples is None:
    # The training frame in this notebook is named `train_df`.
    examples = train_df

  plt.xlabel(feature)
  plt.ylabel(label)

  # Sampling more rows than the frame holds raises, so cap the sample size.
  random_examples = examples.sample(n=min(200, len(examples)))
  plt.scatter(random_examples[feature], random_examples[label])

  # Create a red line that starts at (x0, y0) and ends at (x1, y1).
  x0 = 0
  y0 = trained_bias
  x1 = 10000
  y1 = trained_bias + (trained_weight * x1)
  plt.plot([x0, x1], [y0, y1], c='r')

  plt.show()

#Plot a curve of loss vs. epoch.
def plot_the_loss_curve(epochs, rmse):
  """Plot root mean squared error against epoch in a new figure.

  Args:
    epochs: X values, one per training epoch.
    rmse: Y values; must provide `min()` and `max()`.
  """
  # Leave a little room below the lowest and above the highest value.
  y_low = rmse.min()*0.94
  y_high = rmse.max()* 1.05

  plt.figure()
  plt.plot(epochs, rmse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")
  plt.legend()
  plt.ylim([y_low, y_high])
  plt.show()

### 4.3 Call the model functions

In [None]:
# Hyperparameters:
label_name = "median_house_value"
learning_rate = 0.05
epochs = 200
batch_size = 100

# Build the model, train it, and plot the loss curve.
# `epochs` is rebound here to the epoch list returned by train_model.
my_model = build_model(learning_rate, feature_layer)
epochs, rmse = train_model(
    model=my_model,
    df=train_df,
    epochs=epochs,
    batch_size=batch_size,
    label_name=label_name,
)

plot_the_loss_curve(epochs, rmse)

In [None]:
print("\n: Evaluate the new model on the test set:")
# Separate the label column from the feature columns of the held-out rows.
test_label = np.array(test_df[label_name])
test_features = {
    column: np.array(test_df[column])
    for column in test_df.columns
    if column != label_name
}
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)