In [None]:
#RESOURECES
#https://www.kaggle.com/dgawlik/house-prices-eda

In [None]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

In [None]:
train = pd.read_csv('train.csv', sep=",")
train

In [None]:
train_data = train.reindex(
    np.random.permutation(train.index))

train.describe()

In [None]:
correlation_dataframe = train.copy()
correlation_dataframe["target"] = training_targets["SalePrice"]

correlation_dataframe.corr()

In [None]:
def linear_scale(series):
    min_val = series.min()
    max_val = series.max()
    scale = (max_val - min_val) / 2.0
    return series.apply(lambda x:((x - min_val) / scale) - 1.0)

def normalize(df):
    fd = pd.DataFrame()
    for i in range(len(df)):
        fd[feat_index[i]] = linear_scale(df[feat_index[i]])
    return fd

def preprocess_features(train):
    selected_features = train[
    ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath']]
    processed_features = selected_features.copy()
    processed_features['GrLivArea'] = np.log(processed_features['GrLivArea'])
    normalize(processed_features)
    return processed_features

def preprocess_targets(train):
    output_targets = pd.DataFrame()
      # Scale the target to be in units of thousands of dollars.
    output_targets['SalePrice'] = np.log(train['SalePrice'])
    output_targets["SalePrice"] = (train["SalePrice"] / 1000.0)
    return output_targets

In [190]:
training_examples = preprocess_features(train.head(1460))
training_targets = preprocess_targets(train.head(1460))

In [191]:
def construct_feature_columns(input_features):
  """Construct the TensorFlow Feature Columns.

  Args:
    input_features: The names of the numerical input features to use.
  Returns:
    A set of feature columns
  """ 
  return set([tf.feature_column.numeric_column(my_feature)
              for my_feature in input_features])

In [192]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    # Convert pandas data into a dict of np arrays.
    features = {key:np.array(value) for key,value in dict(features).items()}                                           
    
    # Construct a dataset, and configure batching/repeating.
    ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Shuffle the data, if specified.
    if shuffle:
        ds = ds.shuffle(10000)
    
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [193]:
def train_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    ):

  

  periods = 10
  steps_per_period = steps / periods

  # Create a linear regressor object.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer
  )
    
  # Create input functions.
  training_input_fn = lambda: my_input_fn(training_examples, 
                                          training_targets["SalePrice"], 
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples, 
                                                  training_targets["SalePrice"], 
                                                  num_epochs=1, 
                                                  shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range (0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute predictions.
    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    training_predictions = np.array([item['predictions'][0] for item in training_predictions])

    # Compute training and validation loss.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_predictions, training_targets))

    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
  print("Model training finished.")

  
  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.legend()

  return linear_regressor

In [None]:
#
# Your code here: add your features of choice as a list of quoted strings.
#
minimal_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath'
]

assert minimal_features, "You must select at least one feature!"

minimal_training_examples = training_examples[minimal_features]

linear_regressor = train_model(
    learning_rate=.0001,
    steps=50000,
    batch_size=1000,
    training_examples=minimal_training_examples,
    training_targets=training_targets,
    )

Training model...
RMSE (on training data):
  period 00 : 174.25
  period 01 : 151.37
  period 02 : 129.18
  period 03 : 108.09
  period 04 : 88.89
  period 05 : 73.07
  period 06 : 63.13


In [None]:
def my_test_input_fn(features, batch_size=1, shuffle=True, num_epochs=None):
    # Convert pandas data into a dict of np arrays.
    features = {key:np.array(value) for key,value in dict(features).items()}                                           
    
    # Construct a dataset, and configure batching/repeating.
    ds = Dataset.from_tensor_slices((features)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Shuffle the data, if specified.
    if shuffle:
        ds = ds.shuffle(10000)
    
    # Return the next batch of data.
    features = ds.make_one_shot_iterator().get_next()
    return features

test_data = pd.read_csv("test.csv", sep=",")

test_examples = preprocess_features(test_data)

predict_test_input_fn = lambda: my_test_input_fn(
        test_examples,
        num_epochs=1, 
        shuffle=False)

test_predictions = linear_regressor.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])
test_predictions = [x * 1000 for x in test_predictions]

test_predictions


In [None]:
#my_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_predictions})
# you could use any filename. We choose submission here
#my_submission.to_csv('submission1.csv', index=False)