In [0]:
# import all required libraries
from __future__ import print_function

import math
import random
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Only emit TensorFlow log messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
# import the DataSet
# The rows are shuffled once, with a fixed seed, as part of loading. Later cells
# carve the train/validation splits out of this frame by position
# (head/tail), so the row order decides which examples land in which split.
# NOTE(review): the CSV is believed to be ordered (by longitude); without this
# shuffle the two splits would come from different regions -- confirm against
# the raw file.
california_housing = (
    pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv",sep=",")
    .sample(frac=1, random_state=42)  # fixed seed keeps Restart & Run All reproducible
    .reset_index(drop=True)
)
california_housing.head()

In [0]:
# data preprocessing
def preprocess_features(california_housing):
  """Build the model's input features from a housing DataFrame.

  Args:
    california_housing: DataFrame containing at least the columns
      "housing_median_age", "total_rooms", "total_bedrooms", "population"
      and "median_income".

  Returns:
    A new DataFrame holding those five columns plus a synthetic
    "rooms_per_person" column (total_rooms / population). The input frame
    is not modified.
  """
  raw_feature_names = [
      "housing_median_age",
      "total_rooms",
      "total_bedrooms",
      "population",
      "median_income",
  ]
  # Synthetic feature, computed from the full input frame.
  rooms_per_person = california_housing["total_rooms"] / california_housing["population"]
  # .assign returns a fresh frame, so the caller's data is left untouched.
  return california_housing[raw_feature_names].assign(rooms_per_person=rooms_per_person)

def preprocess_labels(california_housing):
  """Return the regression target: "median_house_value" divided by 1000.

  Args:
    california_housing: DataFrame containing a "median_house_value" column.

  Returns:
    A Series with the same index as the input, holding the scaled values.
  """
  median_house_value = california_housing["median_house_value"]
  return median_house_value / 1000

In [0]:
# Training inputs: features engineered from the first 10,000 rows.
training_features = preprocess_features(california_housing.iloc[:10000])
training_features.describe()

In [0]:
# Training targets: labels for the same first 10,000 rows as the training features.
training_labels = preprocess_labels(california_housing.iloc[:10000])
training_labels.head()

In [0]:
# Validation inputs: features engineered from the last 3,000 rows.
validation_features = preprocess_features(california_housing.iloc[-3000:])
validation_features.head()

In [0]:
# Validation targets: labels for the same last 3,000 rows as the validation features.
validation_labels = preprocess_labels(california_housing.iloc[-3000:])
validation_labels.head()

In [0]:
# generalised input function
def my_input_fn(
  features,
  labels,
  batch_size=1,
  shuffle = False,
  num_epochs=None,
  shuffle_buffer_size=None):
  """Build the (features, labels) tensors consumed by an Estimator.

  Args:
    features: mapping of feature name -> column of values (for example a
      pandas DataFrame). Each column is converted to a NumPy array.
    labels: sequence of target values, one per example.
    batch_size: number of examples per batch.
    shuffle: if True, individual examples are shuffled before batching.
    num_epochs: passed to Dataset.repeat(); None repeats indefinitely.
    shuffle_buffer_size: shuffle buffer size, counted in examples. None
      (the default) uses len(labels), i.e. a full shuffle of the in-memory
      data. Ignored when shuffle is False.

  Returns:
    The (features, labels) pair produced by get_next() on a one-shot
    iterator over the dataset.
  """
  features = {key:np.array(value) for key,value in dict(features).items()}

  ds = Dataset.from_tensor_slices((features,labels))

  if shuffle:
    # Shuffle BEFORE batching/repeating: shuffling afterwards only permutes
    # whole batches (their contents stay fixed) and lets batches from
    # different epochs interleave.
    if shuffle_buffer_size is None:
      shuffle_buffer_size = max(1, len(labels))
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)

  ds = ds.batch(batch_size).repeat(num_epochs)

  features,labels = ds.make_one_shot_iterator().get_next()
  return features,labels

In [0]:
def create_feature_columns(input_features):
  """Return a set containing one numeric feature column per feature name.

  Args:
    input_features: iterable that yields feature names (iterating a pandas
      DataFrame yields its column labels).

  Returns:
    A set of tf.feature_column.numeric_column objects, one per name.
  """
  numeric_columns = set()
  for feature_name in input_features:
    numeric_columns.add(tf.feature_column.numeric_column(feature_name))
  return numeric_columns

In [0]:
def _predict_rmse(regressor, input_fn, labels):
  """Return the RMSE of `regressor`'s predictions from `input_fn` against `labels`."""
  predictions = np.array(
      [item['predictions'][0] for item in regressor.predict(input_fn=input_fn)])
  # scikit-learn's signature is mean_squared_error(y_true, y_pred).
  return math.sqrt(metrics.mean_squared_error(y_true=labels, y_pred=predictions))


def train_model(
  learning_rate,
  steps,
  batch_size,
  training_features,
  training_labels,
  validation_features,
  validation_labels,
  periods=10):
  """Train a linear regressor and track RMSE on training and validation data.

  Training is split into `periods` equal chunks. After each chunk the model
  is evaluated on the full training and validation sets; the training RMSE
  is printed, and both curves are plotted once training finishes.

  Args:
    learning_rate: learning rate for the gradient descent optimizer.
    steps: total number of training steps. Only periods * (steps // periods)
      steps are actually run when `steps` is not a multiple of `periods`.
    batch_size: number of examples per training batch.
    training_features: DataFrame of input features used for training.
    training_labels: targets matching `training_features` row for row.
    validation_features: DataFrame of input features used for validation.
    validation_labels: targets matching `validation_features` row for row.
    periods: number of evaluation checkpoints during training.

  Returns:
    The trained tf.estimator.LinearRegressor.

  Raises:
    ValueError: if `steps` is too small to give every period at least one step.
  """
  # Floor division keeps the step count an integer on Python 3 as well;
  # plain `/` would hand a float to Estimator.train.
  steps_per_period = int(steps // periods)
  if steps_per_period < 1:
    raise ValueError(
        "steps (%r) must be at least periods (%r)" % (steps, periods))

  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
  # Clip gradients by norm (5.0) before they are applied.
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer,5.0)
  linear_regressor = tf.estimator.LinearRegressor(feature_columns = create_feature_columns(training_features),optimizer = my_optimizer)

  # Training batches are shuffled so gradient steps do not follow the row
  # order of the input frame. The prediction inputs make one unshuffled pass
  # so predictions line up row for row with the labels.
  training_input_fn = lambda : my_input_fn(training_features,training_labels,batch_size=batch_size,shuffle=True)
  predict_training_input_fn = lambda : my_input_fn(training_features,training_labels,shuffle=False,num_epochs=1)
  predict_validation_input_fn = lambda : my_input_fn(validation_features,validation_labels,shuffle=False,num_epochs=1)

  print("Training the model..")
  print("RMSE (on train data) : ")
  training_rmse = []
  validation_rmse = []
  for period in range(0,periods):
    linear_regressor.train(input_fn=training_input_fn,steps = steps_per_period)

    training_RMSE = _predict_rmse(linear_regressor,predict_training_input_fn,training_labels)
    validation_RMSE = _predict_rmse(linear_regressor,predict_validation_input_fn,validation_labels)

    print("period %02d : %.2f" % (period,training_RMSE))

    training_rmse.append(training_RMSE)
    validation_rmse.append(validation_RMSE)

  print('model training finished.')

  plt.xlabel('periods')
  plt.ylabel('RMSE')
  plt.title('RMSE vs Periods')
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()
  # Lay out the figure only after every artist has been added.
  plt.tight_layout()

  return linear_regressor

In [0]:
# Train on the training split. train_model prints the training RMSE after each
# period and plots the training and validation RMSE curves.
linear_regressor = train_model(
  learning_rate=0.00003,
  steps=1000,
  batch_size=15,
  training_features=training_features,
  training_labels=training_labels,
  validation_features=validation_features,
  validation_labels=validation_labels)

In [0]:
#test data which is unseen by our model.
california_housing_test_data = pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_test.csv", sep=",")
test_features = preprocess_features(california_housing_test_data)
test_labels = preprocess_labels(california_housing_test_data)

# One unshuffled pass, so predictions line up row for row with test_labels.
test_input_fn = lambda : my_input_fn(test_features,test_labels,shuffle=False,num_epochs=1)

# predict() is consumed straight into an array, so test_predictions is only
# ever bound to the array of predicted values.
test_predictions = np.array(
    [item['predictions'][0] for item in linear_regressor.predict(input_fn = test_input_fn)])

# scikit-learn's signature is mean_squared_error(y_true, y_pred); keywords make
# the roles explicit (MSE itself is symmetric, so the value is unchanged).
root_mean_squared_error = math.sqrt(
    metrics.mean_squared_error(y_true=test_labels, y_pred=test_predictions))

print("Final RMSE (on test data): %0.2f" % root_mean_squared_error)