## Trying out a TensorFlow model to get familiar with it

This notebook is inspired by [Julien Heiduk's notebook](https://www.kaggle.com/zoupet/neural-network-model-for-house-prices-tensorflow) and by Google's machine learning MOOC; thanks to both.


In [5]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import warnings
warnings.filterwarnings('ignore')

import itertools
import tensorflow as tf
import pandas as pd
from tensorflow.python.data import Dataset

import numpy as np
import math
from sklearn.ensemble import IsolationForest
from pylab import rcParams
import matplotlib

from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats.stats import pearsonr
import seaborn as sns
mycols = ["#66c2ff", "#5cd6d6", "#00cc99", "#85e085", "#ffd966", "#ffb366", "#ffb3b3", "#dab3ff", "#c2c2d6"]
sns.set_palette(palette = mycols, n_colors = 4)

from IPython import display

from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split

%matplotlib inline

import os
print(os.listdir("../input"))

In [6]:
# Load the train and test CSV files.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Keep the identifiers aside (test_ID is needed for the submission file) and
# drop them from the frames so they are not treated as a feature.
train_ID = train['Id']
test_ID = test['Id']

train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

# Stack train on top of test so both go through the same preprocessing.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of `train` and `test` overlap, which makes label-based
# selection and alignment on `full_df` ambiguous.
# Columns present in only one of the two frames are filled with NaN for the
# rows coming from the other frame.
full_df = pd.concat((train, test), ignore_index=True)

In [7]:
def fill_cat(df) :
    """Fill the missing values of every object-typed column with "NONE".

    Args:
      df: pandas DataFrame; it is modified in place.

    Returns:
      None.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].fillna("NONE")
            
#------------------------- fill num features----------------------------


def fill_numerique_value(df) :
    """Fill the missing values of every non-object column with that column's mean.

    Args:
      df: pandas DataFrame; it is modified in place.

    Returns:
      None.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype == "object":
            # Object columns are left untouched by this helper.
            continue
        df[name] = column.fillna(column.mean())
            

            
#------------------------------- remove skew_feat ------------------------------

def skewed_feature(df, treshold = None) :
    """Box-Cox transform the skewed numeric columns of a frame.

    A non-object column counts as skewed when the absolute value of its sample
    skewness (computed on its non-missing values) is greater than 0.5. Each
    skewed column is replaced by ``boxcox1p(column, treshold)``.

    The transformation used to be applied to an internal copy that was then
    thrown away, so the call had no effect besides the printed message. The
    transformed frame is now returned. ``df`` itself is still left unmodified,
    so callers must use the return value to get the transformed data.

    Args:
      df: pandas DataFrame. It is not modified.
      treshold: the Box-Cox lambda handed to ``boxcox1p`` (the parameter name
        is kept as-is for backward compatibility).

    Returns:
      A new DataFrame with the same columns as ``df`` in which the skewed
      numeric columns are Box-Cox transformed.
    """
    transformed = df.copy()
    numeric = transformed.select_dtypes(exclude=['object'])

    if numeric.shape[1] == 0:
        # Nothing to measure; avoids sorting an empty apply() result.
        print(0,  "skewed numerical features have been Box-Cox transformed")
        return transformed

    # Check how skewed they are
    skewed_feats = numeric.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = skewed_feats[abs(skewed_feats) > 0.5]

    for col in skewness.index:
        transformed[col] = boxcox1p(numeric[col], treshold)

    print(skewness.shape[0],  "skewed numerical features have been Box-Cox transformed")
    return transformed
    
#----------------------------------- transformation of numeric feature ------------------------------------    

def quadratic_function(df, column) :
    """Square the values of ``df[column]`` in place.

    The previous version used ``df.column``, which looks up a column literally
    named "column" instead of the column named by the ``column`` argument.

    Args:
      df: pandas DataFrame; it is modified in place.
      column: label of the column to transform.
    """
    df[column] = df[column] ** 2

def cubic_function(df, column) :
    """Cube the values of ``df[column]`` in place.

    The previous version used ``df.column``, which looks up a column literally
    named "column" instead of the column named by the ``column`` argument.

    Args:
      df: pandas DataFrame; it is modified in place.
      column: label of the column to transform.
    """
    df[column] = df[column] ** 3
    
def squared_function(df, column):
    """Replace ``df[column]`` by its square root, in place.

    The previous version read ``df.column`` (a column literally named
    "column") and assigned the result to ``df.columns``, i.e. it tried to
    overwrite the column labels instead of the column's values.

    Args:
      df: pandas DataFrame; it is modified in place.
      column: label of the column to transform.
    """
    df[column] = np.sqrt(df[column])
    
def linear_scale(series):
    """Rescale a series linearly so its minimum maps to -1 and its maximum to 1.

    Args:
      series: pandas Series of numbers.

    Returns:
      A new Series with the rescaled values.
    """
    lowest = series.min()
    half_range = (series.max() - lowest) / 2.0

    def rescale(value):
        return ((value - lowest) / half_range) - 1.0

    return series.apply(rescale)

def log_normalize(series):
    """Return a new Series holding log(x + 1) for every element x.

    Uses ``math.log``, so an element <= -1 raises ValueError.
    """
    def shifted_log(value):
        return math.log(value + 1.0)

    return series.apply(shifted_log)

def clip(series, clip_to_min, clip_to_max):
    """Return a new Series with every element limited to [clip_to_min, clip_to_max].

    Args:
      series: pandas Series of numbers.
      clip_to_min: lower bound applied to each element.
      clip_to_max: upper bound applied to each element.
    """
    def bound(value):
        raised_to_min = max(value, clip_to_min)
        return min(raised_to_min, clip_to_max)

    return series.apply(bound)

def z_score_normalize(series):
    """Return a new Series of z-scores: (x - mean) / standard deviation.

    Mean and standard deviation are those reported by the Series itself
    (``series.mean()`` and ``series.std()``).
    """
    center = series.mean()
    spread = series.std()

    def standardize(value):
        return (value - center) / spread

    return series.apply(standardize)

def binary_threshold(series, threshold):
    """Return a new Series with 1 where the element is > threshold, else 0."""
    def flag(value):
        if value > threshold:
            return 1
        return 0

    return series.apply(flag)
    
def apply_function(df, transformation_function) :
    """Apply a Series -> Series transformation to every non-object column.

    The transformation used to be applied to an internal copy that was then
    thrown away, so the call had no effect besides the printed message. The
    transformed frame is now returned. ``df`` itself is still left unmodified,
    so callers must use the return value to get the transformed data.

    Args:
      df: pandas DataFrame. It is not modified.
      transformation_function: callable taking a pandas Series and returning
        the transformed Series (e.g. ``linear_scale``).

    Returns:
      A new DataFrame with the same columns as ``df``; non-object columns are
      replaced by their transformed version, object columns are kept as they are.
    """
    transformed = df.copy()
    for col in transformed.select_dtypes(exclude=['object']).columns:
        transformed[col] = transformation_function(df[col])
    print("transformation done !")
    return transformed
    

In [8]:
# ------------------------------------- Remove the outliers ----------------------------

def remove_outliers(full_df) :
    """Drop the rows that an IsolationForest flags as outliers.

    The forest is fitted on the non-object columns of ``full_df`` and then
    asked to classify the same rows; rows predicted as 1 are kept, rows
    predicted as -1 are counted as outliers and dropped.

    The filtered frame used to be bound to a local name only, so the call had
    no effect besides the printed counts. It is now returned. ``full_df``
    itself is still left unmodified, so callers must use the return value.

    Args:
      full_df: pandas DataFrame. The non-object columns must be acceptable to
        ``IsolationForest.fit``. It is not modified.

    Returns:
      A new DataFrame holding the kept rows (all columns), re-indexed 0..n-1.
    """
    df_numeric =  full_df.select_dtypes(exclude=["object"])

    clf = IsolationForest(max_samples = 100, random_state = 42)
    clf.fit(df_numeric)
    predictions = np.asarray(clf.predict(df_numeric))

    # Boolean mask selection is positional, so it does not depend on the row
    # labels of full_df being unique.
    keep = predictions == 1
    filtered = full_df[keep].reset_index(drop = True)

    print("Number of Outliers:", int((predictions == -1).sum()))
    print("Number of rows without outliers:", filtered.shape[0])

    return filtered


In [15]:
def preprocess_features(full_df):
    """Run the feature-preparation helpers on ``full_df`` and return a copy of it.

    The two fill helpers modify ``full_df`` in place. The skew, scaling and
    outlier helpers are called on ``full_df`` as well, but nothing they return
    is used here.

    NOTE(review): as a consequence only the fill steps change the data that is
    returned - confirm whether the other three steps were meant to be picked
    up as well.

    Args:
      full_df: pandas DataFrame with numeric and object columns; its missing
        values are filled in place.

    Returns:
      A copy of ``full_df`` taken after all helpers have run.
    """
    # Impute missing values: non-object columns first, then object columns.
    for fill_missing in (fill_numerique_value, fill_cat):
        fill_missing(full_df)

    skewed_feature(full_df, 0.15)   # 0.15 is the Box-Cox lambda
    apply_function(full_df, linear_scale)
    remove_outliers(full_df)

    return full_df.copy()

def preprocess_target(full_df):
    """Build the target frame: a single "SalePrice" column holding log(SalePrice + 1).

    Args:
      full_df: pandas DataFrame with a "SalePrice" column.

    Returns:
      A one-column DataFrame named "SalePrice", indexed like ``full_df``.
    """
    log_price = log_normalize(full_df["SalePrice"])
    return log_price.to_frame(name="SalePrice")

# Split the preprocessed features and the log-targets; an integer test_size is
# an absolute row count, so 1459 rows go to the validation side.
training_examples, validation_examples, training_targets, validation_targets = train_test_split(
    preprocess_features(full_df),
    preprocess_target(full_df),
    test_size = 1459,
    random_state = 42)

# Double-check that we've done the right thing.
for summary_title, summary_frame in (
        ("Training examples summary:", training_examples),
        ("Validation examples summary:", validation_examples),
        ("Training targets summary:", training_targets),
        ("Validation targets summary:", validation_targets)):
    print(summary_title)
    display.display(summary_frame.describe())

In [16]:
    # Column-name lists needed later on; FEATURES and FEATURES_CAT are read by
    # construct_feature_columns().

    full_num = full_df.select_dtypes(exclude=['object'])   # non-object columns
    full_cat = full_df.select_dtypes(include=['object'])   # object columns

    col_train_num = list(full_num.columns)
    col_train_cat = list(full_cat.columns)

    # Same as col_train_num, minus the target column.
    col_train_num_bis = list(full_num.columns)
    col_train_num_bis.remove('SalePrice')

    LABEL = "SalePrice"
    COLUMNS, FEATURES, FEATURES_CAT = col_train_num, col_train_num_bis, col_train_cat

In [17]:
def construct_feature_columns():
    """Build the set of feature columns handed to the DNN regressor.

    Every name in ``FEATURES`` becomes a real-valued column. Every name in
    ``FEATURES_CAT`` becomes a hashed sparse column (1000 buckets) wrapped in a
    16-dimensional embedding column combined with "sum".

    The embedding used to be appended after the loop over ``FEATURES_CAT``, so
    only the last categorical feature was embedded (and an empty
    ``FEATURES_CAT`` raised NameError). It is now appended once per
    categorical feature.

    NOTE(review): these are ``tf.contrib.layers`` columns while the regressors
    in this notebook are ``tf.estimator.DNNRegressor`` - confirm the two are
    compatible in the TensorFlow version used. ``tf.contrib`` is not available
    in TensorFlow 2.x.

    Returns:
      A set of feature columns.
    """
    engineered_features = []

    for continuous_feature in FEATURES:
        engineered_features.append(
            tf.contrib.layers.real_valued_column(continuous_feature))

    for categorical_feature in FEATURES_CAT:
        sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(
            categorical_feature, hash_bucket_size=1000)
        # Must stay inside the loop: one embedding per categorical feature.
        engineered_features.append(
            tf.contrib.layers.embedding_column(
                sparse_id_column=sparse_column, dimension=16, combiner="sum"))

    feature_columns = set(engineered_features)

    return feature_columns

In [18]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Input function feeding pandas data to the estimator through a Dataset.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
    # One numpy array per feature column, keyed by the column name.
    feature_arrays = {}
    for name, column in dict(features).items():
        feature_arrays[name] = np.array(column)

    # Slice row by row, then batch and repeat (in that order).
    dataset = Dataset.from_tensor_slices((feature_arrays, targets))
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)

    # Shuffling, when requested, is applied after batching/repeating.
    if shuffle:
        dataset = dataset.shuffle(10000)

    # Hand back the tensors for the next batch.
    iterator = dataset.make_one_shot_iterator()
    next_features, next_labels = iterator.get_next()
    return next_features, next_labels


In [19]:
def train_model(
    my_optimizer,
    value_of_clip,
    steps,
    batch_size,
    hidden_units,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets ) :
    """Train a DNN regressor and report the RMSE on both samples after each period.

    Training is cut into 10 periods. After each period the model predicts the
    whole training and validation samples, the two RMSE values are printed,
    and at the end both curves are plotted.

    Args:
      my_optimizer: optimizer instance; it is wrapped with gradient clipping.
      value_of_clip: clip norm handed to ``clip_gradients_by_norm``.
      steps: total number of training steps. Each period runs
        ``steps // 10`` steps (at least 1), so a value that is not a multiple
        of 10 is rounded down per period.
      batch_size: batch size used for training.
      hidden_units: list with the size of each hidden layer.
      training_examples: DataFrame of training features.
      training_targets: DataFrame with a "SalePrice" column (training targets).
      validation_examples: DataFrame of validation features.
      validation_targets: DataFrame with a "SalePrice" column (validation targets).

    Returns:
      The trained ``DNNRegressor``.
    """
    periods = 10
    # `/` is true division in this notebook (from __future__ import division)
    # and produced a float step count; use integer division so train() gets a
    # whole, strictly positive number of steps.
    steps_per_period = max(1, int(steps) // periods)

    # Create the DNN regressor object, with gradient clipping on the optimizer.
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, value_of_clip)

    dnn_regressor = tf.estimator.DNNRegressor(feature_columns=construct_feature_columns(),
                                              activation_fn = tf.nn.relu,
                                              hidden_units= hidden_units,
                                              optimizer = my_optimizer)

    # Create input functions
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets["SalePrice"],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets["SalePrice"],
                                            num_epochs=1,
                                            shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                            validation_targets["SalePrice"],
                                            num_epochs=1,
                                            shuffle=False)

    def _rmse(input_fn, targets):
        # Predict the sample served by input_fn and score it against targets.
        predictions = dnn_regressor.predict(input_fn=input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])
        return math.sqrt(metrics.mean_squared_error(targets, predictions))

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print( "Training model...")
    print ("RMSE (on training data):")

    training_rmse = []
    validation_rmse = []
    for period in range (0, periods):
        # Train the model, starting from the prior state.
        dnn_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period
        )

        # Take a break and compute training and validation loss.
        training_root_mean_squared_error = _rmse(predict_training_input_fn, training_targets)
        validation_root_mean_squared_error = _rmse(predict_validation_input_fn, validation_targets)

        # Print the current loss.
        print ("RMSE on training sample at period %02d : is %0.2f " % (period, training_root_mean_squared_error))
        print ("RMSE on validation sample at period %02d : is  %0.2f " % (period, validation_root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)

    print( "Model training finished.")
    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training", color = "red", alpha = 0.5)
    plt.plot(validation_rmse, label="validation", color = "green", alpha = 0.5)
    plt.legend()
    plt.show()

    return dnn_regressor

In [20]:
def model_predict(steps=6, batch_size=1):
    """Fit a fresh DNN regressor on the labelled rows and predict the test rows.

    ``full_df`` is ``train`` stacked on top of ``test``, so its first
    ``len(train_ID)`` rows are the labelled ones and the remaining rows are the
    ones listed in ``test_ID``.

    The previous version predicted the global ``validation_examples`` (a
    random sample of train and test rows mixed together) and trained on those
    very same rows, so the predictions did not correspond to ``test_ID``.

    Args:
      steps: number of training steps. The default keeps the value that was
        hard-coded here.
      batch_size: training batch size. The default keeps the batch size that
        was effectively used here.

    NOTE(review): the defaults are far smaller than the 2000 steps / batch
    size 10 used in main(); a model trained this briefly is unlikely to be
    useful - confirm the intended values.

    Returns:
      1-D numpy array with one prediction per test row, in the row order of
      ``test``. Predictions are on the scale of the targets built by
      ``preprocess_target`` (log(SalePrice + 1)).
    """
    n_labelled = len(train_ID)

    # Features must be built first: it fills the missing values of full_df in
    # place, which preprocess_target then reads.
    features = preprocess_features(full_df)
    targets = preprocess_target(full_df)

    labelled_examples = features.iloc[:n_labelled]
    labelled_targets = targets.iloc[:n_labelled]
    test_examples = features.iloc[n_labelled:]
    # my_input_fn needs a targets argument; for the test rows these values are
    # only placeholders, not real labels.
    test_targets = targets.iloc[n_labelled:]

    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(tf.train.FtrlOptimizer(learning_rate = 0.0025), 0.3)

    dnn_regressor = tf.estimator.DNNRegressor(feature_columns=construct_feature_columns(),
                                                  activation_fn = tf.nn.relu,
                                                  hidden_units= [2000,2000],
                                                  optimizer = my_optimizer)

    training_input_fn = lambda: my_input_fn(labelled_examples,
                                            labelled_targets["SalePrice"],
                                            batch_size=batch_size)

    predict_test_input_fn = lambda: my_input_fn(test_examples,
                                                test_targets["SalePrice"],
                                                num_epochs=1,
                                                shuffle=False)

    dnn_regressor.train(
        input_fn=training_input_fn,
        steps=steps
            )

    test_predictions = dnn_regressor.predict(input_fn=predict_test_input_fn)
    test_predictions = np.array([item['predictions'][0] for item in test_predictions])

    return test_predictions

In [21]:
def submit() :
    """Write ``submission.csv`` with one predicted SalePrice per test Id.

    The regressor is fitted on log(SalePrice + 1) targets (see
    ``preprocess_target`` / ``log_normalize``), so its predictions are mapped
    back to prices with ``expm1`` before being written. The previous version
    wrote the log-scale values as if they were prices.

    Raises:
      ValueError: if the number of predictions differs from the number of ids.
    """
    estimator_predictions = model_predict()

    if len(estimator_predictions) != len(test_ID):
        raise ValueError(
            "got %d predictions for %d test ids"
            % (len(estimator_predictions), len(test_ID)))

    # Inverse of log(x + 1).
    final_predictions = np.expm1(estimator_predictions)

    submission = pd.DataFrame()
    submission['Id'] = test_ID
    submission['SalePrice'] = final_predictions
    submission.to_csv('submission.csv',index=False)

#### The parameter settings below are the result of my own trial and error

In [22]:
def main() :
    """Train and evaluate the model on the labelled rows, then write the submission.

    ``full_df`` is ``train`` stacked on top of ``test``, so its first
    ``len(train_ID)`` rows are the labelled ones. Only those rows are split
    into a training and a validation half. The previous version split all of
    ``full_df``, so the model was also trained and scored on the rows coming
    from ``test``.

    NOTE(review): this assumes the rows from ``test`` carry no real SalePrice
    (their value would then only be the column mean filled in by
    ``fill_numerique_value``) - confirm against the input files.
    """
    # Features must be built first: it fills the missing values of full_df in
    # place, which preprocess_target then reads.
    features = preprocess_features(full_df)
    targets = preprocess_target(full_df)
    n_labelled = len(train_ID)

    training_examples, validation_examples, training_targets, validation_targets =  train_test_split(
        features.iloc[:n_labelled], targets.iloc[:n_labelled], test_size = 0.5, random_state = 1234)

    train_model(
    my_optimizer=tf.train.FtrlOptimizer(learning_rate=0.0025),
    value_of_clip = 0.3,
    steps=2000,
    batch_size=10,
    hidden_units=[2000,2000],
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

    submit()

if __name__ == '__main__' :
    main()