#### Feature Crosses
##### In this exercise we learn about feature crosses, write code that uses them, and experiment with different ways to represent features

##### We also learn to use different features of TensorFlow and pandas, for example how to:
 * Use tf.feature_column methods to represent features in different ways.
 * Represent features as bins.
 * Cross bins to create a feature cross.

<b> We make use of the same California housing dataset for our coding purposes </b>

In [None]:
# Notebook line magic (not plain Python) requesting TensorFlow 2.x.
# NOTE(review): presumably only meaningful in Colab - confirm before running elsewhere.
%tensorflow_version 2.x

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

from matplotlib import pyplot as plt

# The following lines adjust the granularity of reporting: show at most 10
# DataFrame rows and format floats with one decimal place.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format

# Use 32-bit floats as the Keras backend float type.
tf.keras.backend.set_floatx('float32')

<b> We do the same loading, scaling and shuffling of the data before using the dataset to train the model,
both to prepare the data and to get better accuracy.</b>

In [None]:
# Download the training and test splits of the housing dataset.
train_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv")
test_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv")

# Divide the label column by `scale_factor` in both splits so that training
# and test labels are expressed in the same units.
scale_factor = 1000.0
for frame in (train_df, test_df):
  frame["median_house_value"] /= scale_factor

# Shuffle the training examples by reindexing on a random permutation.
shuffled_index = np.random.permutation(train_df.index)
train_df = train_df.reindex(shuffled_index)

<b> We will first represent the latitude and longitude values as floating-point values.
    To create feature columns we:</b> 
    * Call a tf.feature_column method to represent a single feature, a single feature cross or a single synthetic feature. To represent a feature as a floating-point number use tf.feature_column.numeric_column; to represent it as buckets (bins) use tf.feature_column.bucketized_column.
    * Add each column to a Python list.
    

In [None]:
# Represent latitude and longitude as plain floating-point features.
latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")

# Collect the feature columns in a Python list, latitude first.
feature_columns = [latitude, longitude]

# Convert the feature columns into a layer that will be part of the model.
fp_feature_layer = layers.DenseFeatures(feature_columns)

#### Let's define the functions used to create and train the model

  * create_model - defines the TensorFlow layers of a simple linear regression model, using the supplied feature layer (for example fp_feature_layer) to represent the model's features
  * train_model - trains the model with the specified features.
  * plot_the_loss_curve - generates a loss curve

In [None]:
#define the functions
def create_model(my_learning_rate, feature_layer):
  """Create and compile a simple linear regression model.

  Args:
    my_learning_rate: Learning rate handed to the RMSprop optimizer.
    feature_layer: Layer added first to the model; the callers in this
      notebook pass a `layers.DenseFeatures` built from feature columns.

  Returns:
    A compiled `tf.keras.models.Sequential` model that uses mean squared
    error as its loss and reports root mean squared error as a metric.
  """
  model = tf.keras.models.Sequential()

  # Add the layer containing the feature columns to the model.
  model.add(feature_layer)

  # Add one linear layer to the model to yield a simple linear regressor.
  # No input_shape is given: this is not the first layer of the model, so its
  # input size comes from whatever the feature layer produces.
  model.add(tf.keras.layers.Dense(units=1))

  # Construct the layers into a model that TensorFlow can execute.
  # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, dataset, epochs, batch_size, label_name):
  """Fit `model` on `dataset` and return the per-epoch RMSE.

  Args:
    model: Object exposing a Keras-style `fit` method.
    dataset: Table whose `items()` yields (column name, column values) pairs,
      including the label column; the callers here pass a pandas DataFrame.
    epochs: Number of epochs passed to `fit`.
    batch_size: Batch size passed to `fit`.
    label_name: Name of the column used as the label.

  Returns:
    A tuple `(epochs, rmse)`: the `epoch` list recorded by `fit`, and the
    "root_mean_squared_error" column of the training history.
  """
  # Turn every column into a NumPy array, then split the label off.
  columns = {}
  for column_name, column_values in dataset.items():
    columns[column_name] = np.array(column_values)
  targets = np.array(columns.pop(label_name))

  training_run = model.fit(x=columns, y=targets, batch_size=batch_size,
                           epochs=epochs, shuffle=True)

  # Isolate the root mean squared error recorded for each epoch.
  metrics_by_epoch = pd.DataFrame(training_run.history)
  return training_run.epoch, metrics_by_epoch["root_mean_squared_error"]


def plot_the_loss_curve(epochs, rmse):
  """Plot the per-epoch RMSE as a loss curve and display it.

  Args:
    epochs: Values for the x axis, one per epoch.
    rmse: RMSE values for the y axis; must provide `min()` and `max()`.
  """
  plt.figure()
  plt.plot(epochs, rmse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")
  plt.legend()

  # Leave a small margin below the smallest and above the largest value.
  lower_limit = rmse.min()*0.94
  upper_limit = rmse.max()* 1.05
  plt.ylim([lower_limit, upper_limit])
  plt.show()


##### We now train the model with the floating-point representations that were created earlier.

In [None]:
# Hyperparameters and the label column used for training.
label_name = 'median_house_value'
batch_size = 100
epochs = 30
learning_rate = 0.05

# Build the linear regressor on top of the floating-point feature layer.
my_model = create_model(learning_rate, fp_feature_layer)

# Train on the training set; `epochs` is rebound to the returned epoch list.
epochs, rmse = train_model(my_model, train_df, epochs, batch_size, label_name)
plot_the_loss_curve(epochs, rmse)

# Split the test set into a dict of feature arrays and a label array,
# then evaluate the trained model against it.
test_features = {
    name: np.array(value)
    for name, value in test_df.items()
    if name != label_name
}
test_label = np.array(test_df[label_name])
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)

##### Let's represent the latitude and longitude values as buckets.

<b> We can represent the latitude and longitude values as buckets (bins). Each bin represents all the neighbourhoods within a single degree. For example, neighbourhoods at latitudes 34.4 and 34.8 fall in the same bucket, but neighbourhoods at latitudes 34.4 and 35.2 fall in different buckets. 
The model will learn a separate weight for each bucket.
    At a resolution of one degree this gives roughly 10 buckets each for latitude and longitude. </b>

In [None]:
resolution_in_degrees = 1.0

# Bucketized latitude: boundaries start at the truncated minimum latitude and
# advance by `resolution_in_degrees`, stopping before the truncated maximum.
latitude_as_a_numeric_column = tf.feature_column.numeric_column("latitude")
latitude_start = int(min(train_df['latitude']))
latitude_stop = int(max(train_df['latitude']))
latitude_boundaries = list(
    np.arange(latitude_start, latitude_stop, resolution_in_degrees))
latitude = tf.feature_column.bucketized_column(
    source_column=latitude_as_a_numeric_column,
    boundaries=latitude_boundaries)

# Bucketized longitude, built the same way as latitude.
longitude_as_a_numeric_column = tf.feature_column.numeric_column("longitude")
longitude_start = int(min(train_df['longitude']))
longitude_stop = int(max(train_df['longitude']))
longitude_boundaries = list(
    np.arange(longitude_start, longitude_stop, resolution_in_degrees))
longitude = tf.feature_column.bucketized_column(
    source_column=longitude_as_a_numeric_column,
    boundaries=longitude_boundaries)

# Python list holding the two bucketized columns, latitude first.
feature_columns = [latitude, longitude]

# Convert the list of feature columns into a layer which is part of the model.
buckets_feature_layer = layers.DenseFeatures(feature_columns)

In [None]:
# Train the model with the bucketized representations.

# Define the hyperparameters. batch_size and label_name are not set in this
# cell, so the values assigned by the earlier training cell are reused.
learning_rate = 0.04
epochs = 35

# Build the model passing in the buckets_feature_layer.
my_model = create_model(learning_rate, buckets_feature_layer)

# Train the model; `epochs` is rebound to the epoch list that train_model
# returns, alongside the per-epoch RMSE.
epochs, rmse = train_model(my_model, train_df, epochs, batch_size, label_name)

plot_the_loss_curve(epochs, rmse)

# Evaluate on test_features / test_label, which this cell does not create.
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)

#### After running the model we notice that the bucket representation does better than the floating-point representation of the features.
##### But we can do better by using a feature cross, i.e. by crossing latitude and longitude to create a single feature. In real life a location exists in two dimensions, so it makes sense to treat latitude and longitude together as one crossed feature.

In [None]:
# The following code demonstrates the feature cross.

resolution_in_degrees = 1.0

# Bucketized latitude: boundaries start at the truncated minimum latitude and
# advance by `resolution_in_degrees`, stopping before the truncated maximum.
latitude_as_a_numeric_column = tf.feature_column.numeric_column("latitude")
latitude_start = int(min(train_df['latitude']))
latitude_stop = int(max(train_df['latitude']))
latitude_boundaries = list(
    np.arange(latitude_start, latitude_stop, resolution_in_degrees))
latitude = tf.feature_column.bucketized_column(
    source_column=latitude_as_a_numeric_column,
    boundaries=latitude_boundaries)

# Bucketized longitude, built the same way as latitude.
longitude_as_a_numeric_column = tf.feature_column.numeric_column("longitude")
longitude_start = int(min(train_df['longitude']))
longitude_stop = int(max(train_df['longitude']))
longitude_boundaries = list(
    np.arange(longitude_start, longitude_stop, resolution_in_degrees))
longitude = tf.feature_column.bucketized_column(
    source_column=longitude_as_a_numeric_column,
    boundaries=longitude_boundaries)

# Cross the two bucketized columns into one hashed feature, then wrap the
# cross in an indicator column before handing it to the feature layer.
latitude_x_longitude = tf.feature_column.crossed_column(
    [latitude, longitude], hash_bucket_size=125)
crossed_feature = tf.feature_column.indicator_column(latitude_x_longitude)

# Python list holding the generated feature column: only the cross is used.
feature_columns = [crossed_feature]

# Convert the list of feature columns into a layer to feed to the model.
feature_cross_feature_layer = layers.DenseFeatures(feature_columns)

In [None]:
# Let's run the model with the feature-crossed column.

# Define the hyperparameters. batch_size and label_name are not set in this
# cell, so the values assigned by the earlier training cell are reused.
learning_rate = 0.04
epochs = 35

# Build the model passing in the feature_cross_feature_layer.
my_model = create_model(learning_rate, feature_cross_feature_layer)

# Train the model on the training set; `epochs` is rebound to the epoch list
# that train_model returns, alongside the per-epoch RMSE.
epochs, rmse = train_model(my_model, train_df, epochs, batch_size, label_name)

plot_the_loss_curve(epochs, rmse)

# Evaluate on test_features / test_label, which this cell does not create.
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)