**Synthetic Features and Outliers**

- create a synthetic feature which is a ratio of two other features
- use this new feature as an input to a linear regression model
- improve the effectiveness of the model by identifying and clipping (removing) outliers from the input data

In [1]:
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only show TensorFlow log messages at ERROR level, and keep pandas output
# compact: at most 10 rows, floats printed with one decimal place.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.float_format = '{:.1f}'.format
pd.options.display.max_rows = 10

# Load the training set and put its rows into a random order.
california_housing_dataframe = pd.read_csv(
    '../input/california_housing_train.csv', sep=','
).pipe(lambda frame: frame.reindex(np.random.permutation(frame.index)))

# Rescale the target column by a factor of 1/1000.
california_housing_dataframe['median_house_value'] /= 1000.0

# Bare expression: rendered as the cell output when run in a notebook.
california_housing_dataframe

In [2]:
# setup input function and define function for model training

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Builds a tf.data input pipeline and returns the tensors of its next batch.

    Args:
        features: pandas DataFrame of features
        targets: pandas DataFrame of targets
        batch_size: size of batches to be passed to the model
        shuffle: True or False. Whether to shuffle the data
        num_epochs: Number of epochs for which data should be repeated. None=repeat indefinitely
    Returns:
        Tuple of (features, labels) for next data batch
    """

    # convert pandas data into a dict of np.arrays
    features = {key: np.array(value) for key, value in dict(features).items()}

    # construct a dataset from the in-memory arrays; this is only suitable
    # for data that comfortably fits in memory
    ds = Dataset.from_tensor_slices((features, targets))

    # shuffle individual examples *before* batching and repeating, so that
    # batches are made of randomly drawn examples and every example of an
    # epoch is seen before the next epoch starts
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # configure batching/repeating
    ds = ds.batch(batch_size).repeat(num_epochs)

    # return the next batch of data
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [3]:
def train_model(learning_rate, steps, batch_size, input_feature):
    """Trains a single-feature linear regression model on the housing data.

    Training runs in 10 periods. After each period the RMSE on the training
    data is printed and the currently learned line is drawn over a scatter
    plot of 300 sampled rows. Once training is done, the RMSE per period is
    plotted and summary statistics of predictions and targets are displayed.

    Args:
        learning_rate: A `float`, the learning rate
        steps: A non-zero `int`, the total number of training steps. A training step consists of a forward and backward pass using a single batch.
            The steps are divided over the periods with integer division (at least one step per period)
        batch_size: A non-zero `int`, the batch size
        input_feature: A `string` specifying a column from `california_housing_dataframe` to use as input feature

    Returns:
        A pandas `DataFrame` with the columns `predictions` and `targets`;
        each row holds the prediction and the target of the same example,
        as computed after training the model
    """

    periods = 10
    # the estimator expects an integer step count; `/` would produce a float
    steps_per_period = max(1, steps // periods)

    my_feature = input_feature
    my_feature_data = california_housing_dataframe[[my_feature]].astype('float32')
    my_label = 'median_house_value'
    targets = california_housing_dataframe[my_label].astype('float32')

    # create input functions
    training_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)

    # create feature columns
    feature_columns = [tf.feature_column.numeric_column(my_feature)]

    # create a linear regressor object (gradients are clipped to norm 5.0)
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer
    )

    # set up to plot the state of our model's line each period
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title('Learned line by period')
    plt.ylabel(my_label)
    plt.xlabel(my_feature)
    sample = california_housing_dataframe.sample(n=300)
    plt.scatter(sample[my_feature], sample[my_label])
    # colormaps take floats in [0, 1]; negative inputs would all be clipped
    # to the same colour, making early periods indistinguishable
    colors = [cm.coolwarm(x) for x in np.linspace(0, 1, periods)]

    # train the model, but do it in a loop so that we can periodically assess loss metrics
    print('Training model...')
    print('RMSE (on training data):')
    root_mean_squared_errors = []
    for period in range(periods):
        # train the model, starting from prior state
        linear_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period
        )

        # take a break and compute predictions
        predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])

        # compute loss (arguments are compared positionally)
        root_mean_squared_error = math.sqrt(metrics.mean_squared_error(targets, predictions))

        # occasionally print the current loss
        print(' period %02d: %0.2f' % (period, root_mean_squared_error))

        # add the loss metrics from this period to our list
        root_mean_squared_errors.append(root_mean_squared_error)

        # finally, track the weights and biases over time
        # apply some math to ensure that the data and line are plotted neatly
        y_extents = np.array([0, sample[my_label].max()])

        weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature)[0]
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

        # invert the line to find x for the y range, then limit x to the
        # range covered by the sample and recompute y from the limited x
        x_extents = (y_extents - bias) / weight
        x_extents = np.maximum(np.minimum(x_extents, sample[my_feature].max()), sample[my_feature].min())
        y_extents = weight * x_extents + bias
        plt.plot(x_extents, y_extents, color=colors[period])

    print('Model training finished')

    # output a graph of loss metrics over periods
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title('Root mean squared erros vs periods')
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # create a table with calibration data
    calibration_data = pd.DataFrame()
    calibration_data['predictions'] = pd.Series(predictions)
    # `targets` carries the (shuffled) index of the source DataFrame, while
    # the predictions are indexed 0..n-1 in row order. Assigning the Series
    # directly would align on index labels and pair each prediction with the
    # wrong target, so use the raw values to keep the rows positional.
    calibration_data['targets'] = pd.Series(targets.values)
    display.display(calibration_data.describe())

    print('Final RMSE (on training data): %0.2f' % root_mean_squared_error)

    return calibration_data

**Try a synthetic feature**

The `total_rooms` and `population` features count totals for a given city block. What if one city block was more densely populated than another? Let's create a synthetic feature `rooms_per_person`, which is the ratio of `total_rooms` to `population`, and use that as the `input_feature` to `train_model()`. What's the best performance you can get with this single feature by tweaking the learning rate?

In [None]:
# Synthetic feature: element-wise ratio of total rooms to population.
california_housing_dataframe['rooms_per_person'] = (
    california_housing_dataframe['total_rooms'].div(california_housing_dataframe['population'])
)
# Summary statistics of the new column (shown as the cell output).
california_housing_dataframe['rooms_per_person'].describe()

In [None]:
# First attempt on the synthetic feature: learning_rate=0.00005.
calibration_data = train_model(
    input_feature='rooms_per_person',
    learning_rate=0.00005,
    batch_size=5,
    steps=500,
)

In [None]:
# Same setup with a 10x larger learning rate: learning_rate=0.0005.
calibration_data = train_model(
    input_feature='rooms_per_person',
    learning_rate=0.0005,
    batch_size=5,
    steps=500,
)

In [None]:
# Same setup again, another 10x up: learning_rate=0.005.
calibration_data = train_model(
    input_feature='rooms_per_person',
    learning_rate=0.005,
    batch_size=5,
    steps=500,
)

In [None]:
# Try a considerably larger learning rate: learning_rate=0.08.
calibration_data = train_model(
    input_feature='rooms_per_person',
    learning_rate=0.08,
    batch_size=5,
    steps=500,
)

In [None]:
# Run with learning_rate=0.05; the resulting calibration_data is the one
# inspected by the outlier plots that follow.
calibration_data = train_model(
    input_feature='rooms_per_person',
    learning_rate=0.05,
    batch_size=5,
    steps=500,
)

**Identify outliers**

Use a scatter plot of predictions vs. targets to find any oddities.

In [None]:
# Wide figure; the left of its two panels shows predictions (x) against
# targets (y).
plt.figure(figsize=(15, 6))
plt.subplot(121)
plt.scatter(x=calibration_data['predictions'], y=calibration_data['targets'])

In [None]:
# The predictions cluster on the left side of the scatter plot; check the
# distribution of the input feature itself with a histogram in the right panel.
plt.subplot(122)
_ = california_housing_dataframe.loc[:, 'rooms_per_person'].hist()

**Clip Outliers**

Can we improve the model's accuracy by setting the outlier values of `rooms_per_person` to some reasonable min and max?

In [None]:
# since most of the input values are less than 5, cap the feature at 5 and
# confirm the result with a histogram plot; Series.clip does this in one
# vectorized call instead of a Python-level lambda per row
california_housing_dataframe['clipped_rooms_per_person'] = california_housing_dataframe['rooms_per_person'].clip(upper=5)
_ = california_housing_dataframe['clipped_rooms_per_person'].hist()

In [None]:
# Retrain on the clipped feature, keeping the other settings of the
# learning_rate=0.05 run, to see whether clipping had any effect.
calibration_data = train_model(
    input_feature='clipped_rooms_per_person',
    learning_rate=0.05,
    batch_size=5,
    steps=500,
)

In [None]:
# Predictions (x) against targets (y) for the model trained on the clipped feature.
_ = plt.scatter(
    x=calibration_data['predictions'],
    y=calibration_data['targets'],
)