In [1]:
from __future__ import print_function

import math
from functools import partial

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.python.data import Dataset

# Only surface TensorFlow log records at ERROR level, and keep DataFrame
# displays compact: at most 10 rows, floats rendered with one decimal place.
tf.get_logger().setLevel('ERROR')
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Load the California housing training set (comma-separated CSV).
california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",")

# Shuffle the rows by reindexing with a random permutation of the index.
# NOTE(review): no seed is set in this cell, so the row order differs per run.
shuffled_index = np.random.permutation(california_housing_dataframe.index)
california_housing_dataframe = california_housing_dataframe.reindex(shuffled_index)

In [3]:
def preprocess_features(california_housing_dataframe):
  """Select the model input features and add a rooms-per-person ratio.

  Args:
    california_housing_dataframe: A pandas DataFrame containing the eight
      raw columns named below.
  Returns:
    A new DataFrame holding the eight selected columns followed by a
    synthetic `rooms_per_person` column (`total_rooms` / `population`).
    The input frame is not modified.
  """
  raw_feature_names = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
  ]
  source = california_housing_dataframe
  # Synthetic feature: element-wise ratio of rooms to residents.
  rooms_per_person = source["total_rooms"] / source['population']
  # `assign` works on a copy, so the selected slice is left untouched.
  return source[raw_feature_names].assign(rooms_per_person=rooms_per_person)

In [4]:
def preprocess_targets(california_housing_dataframe, high_value_threshold=265000):
  """Build the binary classification label frame.

  Args:
    california_housing_dataframe: A pandas DataFrame containing a
      `median_house_value` column.
    high_value_threshold: A row is labeled 1.0 when its `median_house_value`
      is strictly greater than this number, else 0.0. Defaults to 265000,
      the cutoff this function previously hard-coded.
  Returns:
    A DataFrame with a single float column `median_house_value_is_high`,
    one row per input row.
  """
  output_targets = pd.DataFrame()
  # The boolean comparison is cast to float so labels are 0.0 / 1.0.
  output_targets["median_house_value_is_high"] = (
    california_housing_dataframe["median_house_value"] > high_value_threshold
  ).astype(float)
  return output_targets

In [5]:
# Take each slice of the housing frame once: the first 12000 rows are used
# for training and the last 5000 rows for validation.
training_rows = california_housing_dataframe.head(12000)
validation_rows = california_housing_dataframe.tail(5000)

training_examples = preprocess_features(training_rows)
training_targets = preprocess_targets(training_rows)

validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)

In [9]:
# Show describe() statistics for every split so the training and validation
# feature/target distributions can be compared side by side.
print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

Training examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.6,-119.5,28.5,2660.4,541.8,1434.4,503.4,3.9,2.0
std,2.1,2.0,12.6,2201.1,423.8,1157.8,386.4,1.9,0.9
min,32.5,-124.3,1.0,8.0,1.0,3.0,1.0,0.5,0.0
25%,33.9,-121.8,18.0,1471.0,300.0,792.0,284.0,2.6,1.5
50%,34.2,-118.5,29.0,2134.0,435.0,1169.0,409.0,3.6,1.9
75%,37.7,-118.0,37.0,3165.0,652.0,1724.2,608.0,4.8,2.3
max,42.0,-114.6,52.0,37937.0,5471.0,35682.0,5189.0,15.0,22.6


Validation examples summary:


Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.7,-119.6,28.8,2603.5,533.8,1417.9,495.9,3.8,2.0
std,2.2,2.0,12.6,2128.0,415.9,1123.6,379.9,1.9,1.7
min,32.5,-124.3,2.0,2.0,2.0,6.0,2.0,0.5,0.2
25%,33.9,-121.8,18.0,1435.8,291.0,782.0,276.0,2.5,1.5
50%,34.3,-118.5,29.0,2112.5,431.0,1163.0,410.0,3.5,1.9
75%,37.7,-118.0,37.0,3117.0,642.0,1715.5,597.0,4.7,2.3
max,41.9,-114.3,52.0,32627.0,6445.0,28566.0,6082.0,15.0,55.2


Training targets summary:


Unnamed: 0,median_house_value_is_high
count,12000.0
mean,0.3
std,0.4
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


Vlidation targets summary:


Unnamed: 0,median_house_value_is_high
count,5000.0
mean,0.2
std,0.4
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [12]:
# Display the label column as a one-column DataFrame (list selector keeps it 2-D).
training_targets.loc[:, ['median_house_value_is_high']]

Unnamed: 0,median_house_value_is_high
4991,0.0
2172,0.0
8748,1.0
4153,0.0
15177,0.0
...,...
13111,0.0
4014,0.0
6052,0.0
7610,1.0


In [None]:
def construct_feature_columns(input_features):
  """Construct one TensorFlow numeric feature column per input feature.

  Args:
    input_features: An iterable of feature names. Iterating a pandas
      DataFrame yields its column labels, so a DataFrame of features can be
      passed directly.
  Returns:
    A set containing the result of `tf.feature_column.numeric_column(name)`
    for each name in `input_features` (empty set for empty input).
  """
  # NOTE(review): `tf.feature_column` is marked deprecated in recent TensorFlow
  # releases — confirm the installed version still provides it.
  return {tf.feature_column.numeric_column(my_feature)
          for my_feature in input_features}