In [None]:
%tensorflow_version 2.x
import tensorflow as tf
# Stop immediately unless TensorFlow reports a GPU as the first device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
from tensorflow import keras
import tensorflow_addons as tfa

# first neural network with keras tutorial
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime
import numpy as np
from numpy.testing import assert_allclose
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.callbacks import ModelCheckpoint

import math

Found GPU at: /device:GPU:0


# Data load and pre-processing

In [None]:
# Root project folder; the input CSV and every log/model path below are built from it.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab — adjust when running elsewhere.
rutaCarp = '/content/drive/My Drive/2020 Proyecto DS4A'

In [None]:
# Load the offers table; the file is semicolon-separated.
ruta_datos = rutaCarp + '/data_no_outliers.csv'
ofertas = pd.read_csv(ruta_datos, sep=";")

All categorical columns are cast to the pandas `category` dtype in the data frame.

In [None]:
# Columns that hold categorical (non-numeric-scale) information.
cat_data = [
    'oft_tipo_inmueble', 'oft_tipo_norma_juridica', 'loccodigo',
    'suelo', 'actividad', 'tratamiento_urb', 'topografia',
    'serpub', 'serpub_tipo', 'serpub_especif',
    'via', 'clase_via', 'estado_via', 'influencia_via',
    'actividad_economica', 'actividad_economica_tipo',
    'tipo_segun_actividad', 'cp_terr_ar', 'estrato',
]

# Cast all of them to the categorical dtype with a single column -> dtype mapping.
ofertas = ofertas.astype({columna: 'category' for columna in cat_data})

We filter the variables that will be used in the model

In [None]:
# Model inputs, grouped by theme and concatenated in a fixed order.
coordenadas = ['x', 'y']

# Columns prefixed with `d_`.
distancias = [
    'd_park', 'd_highway', 'd_bikeway', 'd_ssf', 'd_mus', 'd_lib', 'd_sitp',
    'd_tm', 'd_p_tm', 'd_gy', 'd_ies', 'd_bom', 'd_col', 'd_ips',
]

# Property and location attributes.
atributos = [
    'oft_tipo_inmueble', 'oic_area_terreno',
    'oia_cant_garajes',
    'loccodigo', 'actividad_economica', 'estrato',
]

cols_analisis = coordenadas + distancias + atributos

The data is split into input (X) and output (y) variables

In [None]:
# Inputs: the selected columns with categorical ones expanded into dummy columns.
predictoras = ofertas[cols_analisis]
X = pd.get_dummies(predictoras)

# Number of input features after dummy expansion (used to size the network).
numInp = X.shape[1]

# Output: kept as a one-column data frame rather than a Series.
Y = ofertas[["log_vfventa2020"]]

The input and output data frames are split into training (70%) and test (30%) sets.

In [None]:
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
particion = train_test_split(X, Y, test_size=0.3, random_state=123)
x_train, x_test, y_train, y_test = particion

# Data Balancing and Size reduction
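About half of the data corresponds to middle-class socioeconomic strata, so the training set is balanced by drawing the same number of random properties from each stratum. That number is the size of the smallest stratum rounded down to the nearest thousand (1,000 here), which ensures that the model works properly in all strata.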

In [None]:
# Balance the strata: count the training rows in each stratum dummy column,
# take the smallest count and round it down to the nearest thousand.
var_estratos = [
    'estrato_1.0', 'estrato_2.0', 'estrato_3.0',
    'estrato_4.0', 'estrato_5.0', 'estrato_6.0',
]
conteo_estratos = x_train[var_estratos].sum()
min_datos = min(conteo_estratos)
redond = int(min_datos // 1000) * 1000
min_datos

1574

In [None]:
np.random.seed(1234)  # a fixed seed makes the random draw reproducible


def _sample_stratum(x_df, y_df, stratum_col, n_sample):
    """Split the rows of one stratum into a random sample and the remainder.

    Parameters
    ----------
    x_df, y_df : pandas.DataFrame
        Feature and target frames sharing the same row index.
    stratum_col : str
        Dummy column of ``x_df`` whose value is 1 for rows of the stratum.
    n_sample : int
        Number of rows to draw (without replacement) for the sample.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_sample, x_rest, y_sample, y_rest)``.
    """
    in_stratum = x_df[stratum_col] == 1
    x_stratum = x_df[in_stratum]
    y_stratum = y_df[in_stratum]

    n_rows = len(x_stratum)
    sample_pos = np.random.choice(range(n_rows), n_sample, replace=False)
    # dtype=int keeps the positions integral even when nothing is left over
    # (np.asarray([]) would otherwise be float64).
    rest_pos = np.asarray(list(set(range(n_rows)) - set(sample_pos)), dtype=int)

    return (x_stratum.iloc[sample_pos], x_stratum.iloc[rest_pos],
            y_stratum.iloc[sample_pos], y_stratum.iloc[rest_pos])


# Draw `redond` rows from every stratum, in stratum order so the sequence of
# random draws is stable, then assemble each frame with a single concat.
partes = [_sample_stratum(x_train, y_train, estrato, redond)
          for estrato in var_estratos]
x_muestras, x_restos, y_muestras, y_restos = zip(*partes)

train_x_entr = pd.concat(list(x_muestras), ignore_index=True)  # balanced sample used for training
train_x_otro = pd.concat(list(x_restos), ignore_index=True)    # training rows left out of the sample
train_y_entr = pd.concat(list(y_muestras), ignore_index=True)
train_y_otro = pd.concat(list(y_restos), ignore_index=True)

# Data Transformation
The dependent variable is already log-transformed. Both the explanatory variables and the dependent variable are then standardized using the mean and standard deviation of the balanced training sample.

In [None]:
def normalize(train, test, eps=0.000001):
    """Standardize ``train`` and ``test`` with the statistics of ``train``.

    Parameters
    ----------
    train : array-like or pandas.DataFrame
        Data used to compute the per-column mean and standard deviation.
    test : array-like or pandas.DataFrame
        Data scaled with the statistics computed from ``train``.
    eps : float, optional
        Small constant added to the standard deviation so that constant
        columns do not cause a division by zero.

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the standardized versions of both inputs.
    """
    mean = np.mean(train, axis=0)
    std = np.std(train, axis=0) + eps

    X_train = (train - mean) / std
    X_test = (test - mean) / std
    return X_train, X_test

# Scale features and target alike; the statistics always come from the balanced sample.
x_train_2, x_train_2_otro = normalize(train=train_x_entr, test=train_x_otro)
y_train_2, y_train_2_otro = normalize(train=train_y_entr, test=train_y_otro)

# Neural Network Configurations 
Next, we parameterize the characteristics of the neural network.

In [None]:
# Every hidden layer gets the same width: `constante` units per input feature.
constante = 50
capa1 = capa2 = capa3 = int(numInp * constante)

In [None]:
# Run label: the current timestamp followed by the literal " v7" suffix.
# It is embedded in the model and log file names written later.
now = datetime.now()
date_time = datetime.strftime(now, "%Y%m%d-%H:%M v7")

To compare the results of the neural network with the other models, we define the R-squared (coefficient of determination) metric.

In [None]:
def r_square(y_true, y_pred):
    """Coefficient of determination (R squared) built from Keras backend ops.

    Returns ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the sum
    of squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``y_true`` from its mean. The backend epsilon guards the division.
    """
    from keras import backend as K

    residuos = y_true - y_pred
    desviaciones = y_true - K.mean(y_true)

    SS_res = K.sum(K.square(residuos))
    SS_tot = K.sum(K.square(desviaciones))
    return 1 - SS_res / (SS_tot + K.epsilon())

# Neural Network Initialization
Next, we compile and train the neural network with the previous configurations.

In [None]:
# Model initialization: four hidden layers of equal width and one output unit.
# NOTE(review): every layer uses a linear activation, so the whole stack
# composes into a single affine map of the inputs. Confirm this is intended
# before comparing the network against non-linear models.
model = Sequential()
model.add(Dense(capa1, input_dim=numInp, activation='linear'))
model.add(Dense(capa2, activation='linear'))
model.add(Dense(capa3, activation='linear'))
model.add(Dense(capa3, activation='linear'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[r_square])

# Training hyper-parameters, named once so the checkpoint frequency below
# stays in sync with the values passed to fit().
EPOCHS = 10
BATCH_SIZE = 50
VALIDATION_SPLIT = 0.2
CHECKPOINT_EVERY_N_EPOCHS = 2

# An integer `save_freq` is counted in batches, so "every N epochs" has to be
# converted using the number of batches per epoch. Only the rows left after
# the validation split are trained on. With the current data this evaluates
# to the previously hard-coded 96*2.
n_fit_rows = math.floor(len(x_train_2) * (1 - VALIDATION_SPLIT))
steps_per_epoch = max(1, math.ceil(n_fit_rows / BATCH_SIZE))
save_freq = CHECKPOINT_EVERY_N_EPOCHS * steps_per_epoch

# define the checkpoint: keep only the snapshot with the best training r_square
filepath = rutaCarp+'/logs/'+date_time+' NN Model.h5'
checkpoint = ModelCheckpoint(filepath, monitor='r_square', save_freq=save_freq,
                             verbose=1, save_best_only=True, mode='max')
# Stop training once 85 minutes have elapsed.
time_stopping_callback = tfa.callbacks.TimeStopping(seconds=85*60, verbose=1)

callbacks_list = [checkpoint, time_stopping_callback]

# fit the model
model.fit(x_train_2, y_train_2, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
          validation_split=VALIDATION_SPLIT, callbacks=callbacks_list)



Epoch 1/10
Epoch 2/10
Epoch 00002: r_square improved from -inf to -9.40807, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model.h5
Epoch 3/10
Epoch 4/10
Epoch 00004: r_square improved from -9.40807 to 0.05878, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model.h5
Epoch 5/10
Epoch 6/10
Epoch 00006: r_square improved from 0.05878 to 0.22188, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model.h5
Epoch 7/10
Epoch 8/10
Epoch 00008: r_square improved from 0.22188 to 0.28521, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model.h5
Epoch 9/10
Epoch 10/10
Epoch 00010: r_square improved from 0.28521 to 0.32804, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model.h5


<tensorflow.python.keras.callbacks.History at 0x7fa4ce0625c0>

# Neural Network Saving
Finally, we save the model along with some metrics to be able to compare it with another neural network compiled with different configurations

In [None]:

# Reload the checkpoint written during the previous training run; the custom
# metric must be supplied so the saved model can be deserialized.
new_model = load_model(filepath, custom_objects={'r_square': r_square})

# Hyper-parameters of this continuation run.
RESUME_EPOCHS = 2
BATCH_SIZE = 50
VALIDATION_SPLIT = 0.2
CHECKPOINT_EVERY_N_EPOCHS = 2

# An integer `save_freq` is counted in batches, so derive it from the number
# of batches per epoch instead of hard-coding 96*2 (same value on this data).
n_fit_rows = math.floor(len(x_train_2) * (1 - VALIDATION_SPLIT))
steps_per_epoch = max(1, math.ceil(n_fit_rows / BATCH_SIZE))
save_freq = CHECKPOINT_EVERY_N_EPOCHS * steps_per_epoch

checkpoint = ModelCheckpoint(filepath, monitor='r_square', save_freq=save_freq,
                             verbose=1, save_best_only=True, mode='max')
time_stopping_callback = tfa.callbacks.TimeStopping(seconds=85*60, verbose=1)

callbacks_list = [checkpoint, time_stopping_callback]
history_callback = new_model.fit(x_train_2, y_train_2, epochs=RESUME_EPOCHS, verbose=1,
                                 batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT,
                                 callbacks=callbacks_list)

# Write one text file per tracked metric with its per-epoch values.
archivos_metricas = (
    ("r_square", ' log NN est r2.txt'),
    ("loss", ' log NN est loss.txt'),
)
for nombre_metrica, sufijo_archivo in archivos_metricas:
    valores_metrica = np.array(history_callback.history[nombre_metrica])
    np.savetxt(rutaCarp+'/logs/'+date_time+sufijo_archivo, valores_metrica, delimiter=",")

# Record which input columns this model was trained on, one per line.
ruta = rutaCarp+'/logs/'+date_time+' log NN desc.txt'
with open(ruta, "w") as outfile:
    outfile.write("\n".join(cols_analisis))

Epoch 1/2
Epoch 2/2
Epoch 00002: r_square improved from -inf to 0.59523, saving model to /content/drive/My Drive/2020 Proyecto DS4A/logs/20201114-21:12 v7 NN Model temp.h5
