## California Housing Price
- goal: predict whether a district's median house value is high (a binary label derived from `median_house_value`)
- model: binary classification/labeled supervised learning
- dataset: https://github.com/ageron/handson-ml2/tree/master/datasets/housing

### 1. Read Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

In [None]:
# Load the raw table and divide the target by 1000, so later bounds on
# `median_house_value` (e.g. `< 500`) are expressed in these rescaled units.
df = pd.read_csv("../input/california-housing-prices/housing.csv")
df['median_house_value'] = df['median_house_value'] / 1000
df.head()

In [None]:
# (rows, columns) of the table as loaded, before any cleaning
df.shape

In [None]:
# Count missing values per column.
df.isna().sum()
# total_bedrooms: 207 of 20640 rows are missing

In [None]:
# Discard every row that has at least one missing value, then confirm
# that no column has any missing entries left.
df = df.dropna(axis=0, how='any')
df.isna().sum()

### 2. Data Exploration

In [None]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max)
df.describe()

#### Take away:
- Compared with the 75th percentile, the maximum values of `total_rooms`, `population`, and `households` look extreme and need a further check.
- abnormal data for target col `median_house_value`.

In [None]:
# Keep only districts that fall under a hand-picked upper bound on each column.
# Every bound is inclusive except the one on the target, which is strict.
upper_bounds_ok = (
    (df.total_rooms <= 5000)
    & (df.total_bedrooms <= 1000)
    & (df.population <= 2500)
    & (df.households <= 1000)
    & (df.median_income <= 8)
    & (df.median_house_value < 500)
)
df = df[upper_bounds_ok]

# Optional per-column histograms (left commented out):
#df.median_house_value.hist(bins=100)
#df.total_bedrooms.hist(bins=100)
#df.median_income.hist(bins=100)
#df.total_rooms.hist(bins=100)
#df.population.hist(bins=100)
#df.households.hist(bins=100)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column except the categorical `ocean_proximity`.
# The column labels are taken from the very selection that is scaled, so they
# stay correct wherever `ocean_proximity` sits in the frame (the previous
# `df.columns.values[0:-1]` silently assumed it was the last column).
numeric_cols = df.columns[df.columns != 'ocean_proximity']

# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test split further down -- confirm this is acceptable for the analysis.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df.loc[:, numeric_cols]),
                         columns=numeric_cols)
scaled_df.head()

In [None]:
# One histogram per column of the scaled frame (100 bins each, 15x10 figure).
scaled_df.hist(bins=100,figsize=(15,10))
plt.show()

### 3. Feature Engineering

In [None]:
# Binary target: 1.0 when the (standardized) median house value is strictly
# above the threshold, 0.0 otherwise. Because the column was standardized
# before this step, the threshold is expressed in standard deviations.
threshold = 1.0
is_high = scaled_df["median_house_value"].gt(threshold)
scaled_df["median_house_value_is_high"] = is_high.astype(float)

In [None]:
# Accumulator for the feature columns; the cells below append to it and the
# finished list is handed to `layers.DenseFeatures`.
feature_columns = []

In [None]:
#location
# Bucketize latitude and longitude, then cross the two bucketized columns so
# the model sees a coarse location grid.
#
# The bucket ranges are read from `scaled_df`: `train_df` is only created in
# the later train/test-split cell, so referencing it here raised a NameError
# on a fresh kernel (Restart & Run All).
#
# NOTE: `scaled_df` holds standardized values, so despite its name this step
# size is in standard-deviation units rather than literal degrees.
resolution_in_degrees = 0.4

latitude_num = tf.feature_column.numeric_column("latitude")
latitude_bins = list(np.arange(int(scaled_df['latitude'].min()),
                               int(scaled_df['latitude'].max()),
                               resolution_in_degrees))
latitude = tf.feature_column.bucketized_column(latitude_num, latitude_bins)

longitude_num = tf.feature_column.numeric_column("longitude")
longitude_bins = list(np.arange(int(scaled_df['longitude'].min()),
                                int(scaled_df['longitude'].max()),
                                resolution_in_degrees))
longitude = tf.feature_column.bucketized_column(longitude_num, longitude_bins)

# The crossed column is categorical; wrap it in an indicator column so it can
# be consumed by a dense feature layer.
lat_x_lon = tf.feature_column.crossed_column([latitude, longitude], hash_bucket_size=100)
crossed_location = tf.feature_column.indicator_column(lat_x_lon)
feature_columns.append(crossed_location)

In [None]:
# Number of rows per `ocean_proximity` category
df.ocean_proximity.value_counts()

In [None]:
#ocean proximity
# `scaled_df` was built from the numeric columns only, so the raw category is
# attached here; otherwise the frames split from `scaled_df` would not contain
# the 'ocean_proximity' key this feature column looks up.
# `scaled_df` holds the same rows as `df` in the same order but with a fresh
# index, hence `.to_numpy()` to copy positionally instead of aligning on index.
scaled_df['ocean_proximity'] = df['ocean_proximity'].to_numpy()

ocean_prox = tf.feature_column.categorical_column_with_vocabulary_list(
      'ocean_proximity', ['<1H OCEAN', 'INLAND','NEAR OCEAN','NEAR BAY','ISLAND'])

# A dense feature layer rejects raw categorical columns, so one-hot encode the
# category with an indicator column before registering it.
ocean_prox_ohe = tf.feature_column.indicator_column(ocean_prox)
feature_columns.append(ocean_prox_ohe)

In [None]:
#demographic
# One plain numeric feature column per demographic variable, registered in
# the order: median_income, population, households.
med_income, population, households = (
    tf.feature_column.numeric_column(key)
    for key in ("median_income", "population", "households")
)
feature_columns.extend([med_income, population, households])

In [None]:
#house
# One plain numeric feature column per housing variable, registered in
# the order: housing_median_age, total_rooms, total_bedrooms.
house_age, ttl_room, ttl_bedroom = (
    tf.feature_column.numeric_column(key)
    for key in ("housing_median_age", "total_rooms", "total_bedrooms")
)
feature_columns.extend([house_age, ttl_room, ttl_bedroom])

In [None]:
# Input layer built from every feature column registered above; it is passed
# to `create_model` as the first layer of each model.
feature_layer = layers.DenseFeatures(feature_columns)

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 22% of the scaled rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(scaled_df, test_size=0.22, random_state=123)

split_sizes = (df.shape[0], train_df.shape[0], test_df.shape[0])
print("Total df size: %i\n train_df size: %i \n test_df size: %i" % split_sizes)

### 4. Modeling

### 4.1 Define functions that build and train a model
- create_model(my_learning_rate, feature_layer, my_metrics), which builds and compiles a randomly-initialized model.
- train_model(model, dataset, epochs, label_name, batch_size, shuffle), which trains the model on the examples in `dataset`, using the `label_name` column as the label.

In [None]:
#Define the functions that create and train a model.
def create_model(my_learning_rate, feature_layer, my_metrics):
  """Build and compile a single-unit sigmoid classifier on top of `feature_layer`.

  Args:
    my_learning_rate: learning rate handed to the RMSprop optimizer.
    feature_layer: layer added first to the model (this notebook passes a
      `layers.DenseFeatures` instance).
    my_metrics: list of metrics passed to `model.compile`.

  Returns:
    The compiled `tf.keras` Sequential model.
  """
  model = tf.keras.models.Sequential()

  model.add(feature_layer)
  # No `input_shape` on this layer: it is not the first layer of the model, so
  # its input is whatever `feature_layer` produces; the former `(1,)` was
  # misleading.
  model.add(tf.keras.layers.Dense(units=1, activation=tf.sigmoid))

  # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
  # and no longer accepted by recent Keras releases.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=my_metrics)
  return model


def train_model(model, dataset, epochs, label_name,
                batch_size=None, shuffle=True):
  """Fit `model` on `dataset` and report the per-epoch training history.

  Args:
    model: object exposing a Keras-style `fit` method.
    dataset: mapping-like table (e.g. a DataFrame) whose `.items()` yields
      (column name, column values) pairs; it must contain `label_name`.
    epochs: number of epochs forwarded to `model.fit`.
    label_name: name of the column used as the label; every column, minus
      this one, is passed to the model as features.
    batch_size: batch size forwarded to `model.fit`.
    shuffle: shuffle flag forwarded to `model.fit`.

  Returns:
    A tuple `(epoch_list, history_frame)`: the `epoch` attribute of the fit
    result and a DataFrame built from its `history` dict.
  """
  columns = {}
  for col_name, col_values in dataset.items():
    columns[col_name] = np.array(col_values)
  # Pull the label out of the dict so it is not also fed in as a feature.
  targets = np.array(columns.pop(label_name))

  fit_result = model.fit(x=columns, y=targets, batch_size=batch_size,
                         epochs=epochs, shuffle=shuffle)

  return fit_result.epoch, pd.DataFrame(fit_result.history)

### 4.2 Define plotting functions
- a loss curve

In [None]:
#Define the plotting function.
def plot_curve(epochs, hist, list_of_metrics):
  """Draw one curve per metric against the epoch number on a new figure.

  Args:
    epochs: sequence of epoch indices.
    hist: mapping/DataFrame indexed by metric name, holding per-epoch values.
    list_of_metrics: names of the metrics (keys of `hist`) to draw.

  The first entry of every series is skipped, so the curves start at the
  second recorded epoch.
  """
  fig, ax = plt.subplots()
  ax.set_xlabel("Epoch")
  ax.set_ylabel("Value")

  for metric_name in list_of_metrics:
    series = hist[metric_name]
    ax.plot(epochs[1:], series[1:], label=metric_name)

  ax.legend()

### 4.3 Call the model functions

In [None]:
# Hyperparameters for this run.
learning_rate = 0.001
epochs = 20
batch_size = 100
label_name = "median_house_value_is_high"
threshold = 0.35  # classification threshold shared by all metrics below

# Metrics tracked during training, all evaluated at the same threshold.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(threshold=threshold, name='accuracy'),
    tf.keras.metrics.Precision(thresholds=threshold, name='precision'),
    tf.keras.metrics.Recall(thresholds=threshold, name='recall'),
]

my_model = create_model(my_learning_rate=learning_rate,
                        feature_layer=feature_layer,
                        my_metrics=METRICS)

# Note: `epochs` is rebound here from the epoch count to the list of epoch
# indices returned by `train_model`.
epochs, hist = train_model(model=my_model, dataset=train_df, epochs=epochs,
                           label_name=label_name, batch_size=batch_size)

list_of_metrics_to_plot = ['accuracy', 'precision', 'recall']
plot_curve(epochs, hist, list_of_metrics_to_plot)

In [None]:
# Same experiment as before with a different classification threshold.
learning_rate = 0.001
epochs = 20
batch_size = 100
label_name = "median_house_value_is_high"
threshold = 0.52  # classification threshold shared by all metrics below

# Metrics tracked during training, all evaluated at the same threshold.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(threshold=threshold, name='accuracy'),
    tf.keras.metrics.Precision(thresholds=threshold, name='precision'),
    tf.keras.metrics.Recall(thresholds=threshold, name='recall'),
]

my_model = create_model(my_learning_rate=learning_rate,
                        feature_layer=feature_layer,
                        my_metrics=METRICS)

# Note: `epochs` is rebound here from the epoch count to the list of epoch
# indices returned by `train_model`.
epochs, hist = train_model(model=my_model, dataset=train_df, epochs=epochs,
                           label_name=label_name, batch_size=batch_size)

list_of_metrics_to_plot = ['accuracy', 'precision', 'recall']
plot_curve(epochs, hist, list_of_metrics_to_plot)

- A `threshold` of slightly over 0.5 appears to produce the highest accuracy (about 83%). 
- Raising the `threshold` to 0.9 drops accuracy by about 5%.
- Lowering the `threshold` to 0.3 drops accuracy by about 3%. 

In [None]:
# Score the most recently trained model on the held-out rows, splitting the
# test frame into a feature dict and a label array the same way as in training.
print("\n: Evaluate the new model on the test set:")
test_features = dict((column, np.array(values)) for column, values in test_df.items())
test_label = np.array(test_features.pop(label_name))
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)
# recorded result: 87%

In [None]:
# AUC run: same model and hyperparameters, tracked with AUC instead of
# threshold-specific metrics.
learning_rate = 0.001
epochs = 20
batch_size = 100
label_name = "median_house_value_is_high"

METRICS = [tf.keras.metrics.AUC(name='auc', num_thresholds=100)]

# Establish the model's topology.
my_model = create_model(my_learning_rate=learning_rate,
                        feature_layer=feature_layer,
                        my_metrics=METRICS)

# Train the model on the training set; `epochs` is rebound to the list of
# epoch indices returned by `train_model`.
epochs, hist = train_model(model=my_model, dataset=train_df, epochs=epochs,
                           label_name=label_name, batch_size=batch_size)

# Plot metrics vs. epochs
list_of_metrics_to_plot = ['auc']
plot_curve(epochs, hist, list_of_metrics_to_plot)

In [None]:
# Score the AUC-tracked model on the held-out rows, splitting the test frame
# into a feature dict and a label array the same way as in training.
print("\n: Evaluate the new model on the test set:")
test_features = dict((column, np.array(values)) for column, values in test_df.items())
test_label = np.array(test_features.pop(label_name))
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)
# recorded result: AUC 0.9059