In [1]:
import pandas as pd
import numpy as np

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [4]:
train = pd.read_csv('../data/train.csv', usecols=['id','precio'])
test = pd.read_csv('../data/test.csv', usecols=['id'])

In [5]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

# Agregando columnas de predicciones anteriores

# target 1 - rf

In [6]:
current_target_name = 'target_1'

In [7]:
df = pd.read_csv('../predictions/2019.10.28 - 09:12:00 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [8]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_1')

train = train.merge(df.rename(columns={'approach_1':current_target_name}), on='id')

# target 2 - rf + one hot + svd

In [9]:
current_target_name = 'target_2'

In [10]:
df = pd.read_csv('../predictions/2019.10.28 - 12:03:46 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [11]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_2')

train = train.merge(df.rename(columns={'rozanecm_approach_2':current_target_name}), on='id')

# target 3 - rf + one hashing vectorizer + svd

In [12]:
current_target_name = 'target_3'

In [13]:
df = pd.read_csv('../predictions/2019.10.28 - 14:40:22 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [14]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_3')

train = train.merge(df.rename(columns={'rozanecm_approach_3':current_target_name}), on='id')

# target 4 - rf + stopwords

In [15]:
current_target_name = 'target_4'

In [16]:
df = pd.read_csv('../predictions/2019.10.28 - 16:36:13 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [17]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_4')

train = train.merge(df.rename(columns={'rozanecm_approach_4':current_target_name}), on='id')

# target 5 -lightgbm

In [18]:
current_target_name = 'target_5'

In [19]:
df = pd.read_csv('../predictions/2019.10.29 - 11:59:35 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [20]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_7')

train = train.merge(df.rename(columns={'rozanecm_approach_7':current_target_name}), on='id')

# target 6 - lightgbm with grid search

In [21]:
current_target_name = 'target_6'

In [22]:
df = pd.read_csv('../predictions/2019.10.29 - 13:19:02 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [23]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_8')

train = train.merge(df.rename(columns={'rozanecm_approach_8':current_target_name}), on='id')

# target 7 - lightgbm with grid search

In [24]:
current_target_name = 'target_7'

In [25]:
df = pd.read_csv('../predictions/2019.11.11 - 13:07:13 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [26]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_10')

train = train.merge(df.rename(columns={'rozanecm_approach_10':current_target_name}), on='id')

# target 8 - lightgbm grid search + feat eng

In [27]:
current_target_name = 'target_8'

In [28]:
df = pd.read_csv('../predictions/2019.10.31 - 20:26:47 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [29]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_1_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_1_with_feat_eng':current_target_name}), on='id')

# target 9 - light gbm grid search all train set + feat eng

In [30]:
current_target_name = 'target_9'

In [31]:
df = pd.read_csv('../predictions/2019.11.02 - 14:42:20 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [32]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_3_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_3_with_feat_eng':current_target_name}), on='id')

# target 10 - feat selection rf y lightgbm grid search

In [33]:
current_target_name = 'target_10'

In [34]:
df = pd.read_csv('../predictions/2019.11.02 - 17:01:06 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [35]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_4_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_4_with_feat_eng':current_target_name}), on='id')

# target 11 - ?

In [36]:
current_target_name = 'target_11'

In [37]:
df = pd.read_csv('../predictions/2019.11.20 - 02_33_23 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [38]:
df = pd.read_csv('../predictions/on_train_data/on_train_data_rozanecm_approach_16')

train = train.merge(df.rename(columns={'rozanecm_approach_16':current_target_name}), on='id')

# target 12 - lightgbm log precio

In [39]:
current_target_name = 'target_12'

In [40]:
df = pd.read_csv('../predictions/2019.11.20 - 19:09:38 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [41]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_19')

train = train.merge(df.rename(columns={'rozanecm_approach_19':current_target_name}), on='id')

# target 13 - lightgbm log precio y skewed features

In [42]:
current_target_name = 'target_13'

In [43]:
df = pd.read_csv('../predictions/2019.11.20 - 21:19:47 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [44]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_20')

train = train.merge(df.rename(columns={'rozanecm_approach_20':current_target_name}), on='id')

# target 14 - lightgbm new features

In [45]:
current_target_name = 'target_14'

In [46]:
df = pd.read_csv('../predictions/2019.11.21 - 15:06:51 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [47]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_24')

train = train.merge(df.rename(columns={'rozanecm_approach_24':current_target_name}), on='id')

# target 15 - xgboost tunned 

In [48]:
current_target_name = 'target_15'

In [49]:
df = pd.read_csv('../predictions/2019.11.28 - 00:25:43 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [50]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_1')

train = train.merge(df.rename(columns={'fcozza_approach_1':current_target_name}), on='id')

# target 16 - lightgbm with features desc

In [51]:
current_target_name = 'target_16'

In [52]:
df = pd.read_csv('../predictions/2019.11.28 - 00:40:14 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [53]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_2')

train = train.merge(df.rename(columns={'fcozza_approach_2':current_target_name}), on='id')

# target 17 - lightgbm with new text feat

In [54]:
current_target_name = 'target_17'

In [55]:
df = pd.read_csv('../predictions/2019.11.30 - 12:19:02 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [56]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_5')

train = train.merge(df.rename(columns={'fcozza_approach_5':current_target_name}), on='id')

# target 18 - lightgbm + todos TODOS los features

In [57]:
current_target_name = 'target_18'

In [58]:
df = pd.read_csv('../predictions/2019.12.01 - 17:45:42 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [59]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_6')

train = train.merge(df.rename(columns={'fcozza_approach_5':current_target_name}), on='id')

In [60]:
train.sample(5)

Unnamed: 0,id,precio,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17,fcozza_approach_6
41089,86733,5950000.0,4198749.99,4558400.0,4720800.0,4981650.0,4988194.0,4595951.0,5017077.0,5132841.0,5026633.0,4961040.0,4964830.0,4478364.0,4735123.0,4998944.0,4472521.5,4391044.0,4728689.0,4857041.0
191475,2475,2900000.0,4476505.23,3229904.0,3064495.0,3186385.96,3372270.0,3256786.0,3230691.0,3309567.0,3917343.0,3854608.0,3339358.94,3176912.0,2726390.0,3016668.0,3026064.5,3013762.0,3236928.0,3295842.0
223121,141296,5500000.0,5019121.54,4776813.58,3342790.63,2901379.94,5141259.0,4551912.0,4860663.0,4641426.0,5357778.0,5274907.0,4681978.6,4784484.0,3366299.0,4214840.0,3801787.2,3249286.0,3039527.0,3064421.0
57153,10488,1900000.0,1654277.82,1817536.99,1633038.78,1555047.48,2652862.0,2556127.0,1441695.0,2690895.0,2426071.0,2267889.0,1774758.4,2434559.0,2216112.0,2523965.0,2013168.5,2179686.0,2285111.0,2286152.0
215183,260706,458725.0,858966.39,678194.77,657731.58,595429.3,554519.0,495906.9,414034.1,531064.4,540539.5,482131.9,798358.5,523113.9,496931.8,623037.7,597587.3,454631.6,508799.4,481684.5


In [61]:
test.sample(5)

Unnamed: 0,id,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17,target_18
4065,14273,745507.7,953456.06,887782.12,1026363.18,836605.9,861515.2,1081708.0,811719.0,911910.6,955186.3,1001516.62,855402.8,888125.5,825139.3,774425.7,816073.0,786534.5,767577.0
49007,246738,609380.2,581983.9,599513.89,567822.5,558260.9,536499.3,489505.7,554073.6,628507.4,606287.2,580728.04,549672.0,543977.6,544778.4,478906.9,555498.6,586825.2,561452.3
2243,16901,5277515.0,5752988.0,6777340.0,6655550.0,6460865.0,5775400.0,6012000.0,5076937.0,6223401.0,6233189.0,5055066.66,5731942.0,6020158.0,5761179.0,5501461.0,4148638.0,5168440.0,5717847.0
26252,154241,372928.1,358762.97,367710.24,397108.4,414335.9,364298.3,364404.6,374578.5,248734.3,329162.5,424049.88,374832.5,374117.6,381574.9,484092.78,384289.4,378329.7,381672.4
21803,199656,2090844.0,2248721.31,1681085.8,1694288.74,1738846.0,1910320.0,1529624.0,1628959.0,1994234.0,1756924.0,2106477.6,1710985.0,1623787.0,1908028.0,1700600.4,1598325.0,1658652.0,1586846.0


# Light gbm with grid search for stacking

## Entrenamiento local

In [62]:
train['precio_log'] = np.log(train['precio'])

In [63]:
X = train.drop(['id','precio','precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [64]:
seed = 42

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [66]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
gbm = lgb.LGBMRegressor(silent=False)

param_dist = {"boosting_type":['gbdt','dart'],
              "max_depth": [25,50,75],
              "learning_rate" : [0.001,0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [50,100,200],
             }

grid_search = GridSearchCV(gbm, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)

In [67]:
import time

In [68]:
%%time
grid_search.fit(X_train,y_train)
grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 33.0min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 34.6min finished


CPU times: user 14.2 s, sys: 917 ms, total: 15.1 s
Wall time: 34min 35s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=50,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [69]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

gbm_optimized = grid_search.best_estimator_

In [70]:
%%time
gbm_optimized.fit(X_train, y_train, sample_weight=None, init_score=None, eval_set=[(X_test,y_test)], eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=10,
            verbose=False, feature_name='auto', callbacks=None)

CPU times: user 12.3 s, sys: 47.2 ms, total: 12.3 s
Wall time: 1.64 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=50,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [71]:
y_scores = gbm_optimized.predict(X_test, num_iteration=gbm_optimized.best_iteration_)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

0.20654698852388653


In [72]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

505826.2302675514


In [145]:
y_scores = gbm_optimized.predict(X_test, num_iteration=gbm_optimized.best_iteration_)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

0.20681735942366397


In [146]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

506362.0249810891


## Entrenamiento con todos los datos para obtener predicciones a subir

In [73]:
%%time
gbm_optimized.fit(X,y, sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', callbacks=None)

CPU times: user 15.1 s, sys: 105 ms, total: 15.2 s
Wall time: 2.04 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=50,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [74]:
# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id'],axis=1), num_iteration=gbm_optimized.best_iteration_)

In [75]:
exp_predictions = np.exp(predictions)

In [76]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [77]:
description = "Testing final approachs. Stacking apporach removing clustering approach and new light gbm with all features"
save_submission(df, description=description)

# Stacking solo para los mejores scores y sin relacion entre ellos

# target 2 - rf + one hot + svd

In [69]:
current_target_name = 'target_2'

In [70]:
df = pd.read_csv('../predictions/2019.10.28 - 12:03:46 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [71]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_2')

train = train.merge(df.rename(columns={'rozanecm_approach_2':current_target_name}), on='id')

# target 15 - xgboost tunned 

In [72]:
current_target_name = 'target_15'

In [73]:
df = pd.read_csv('../predictions/2019.11.28 - 00:25:43 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [74]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_1')

train = train.merge(df.rename(columns={'fcozza_approach_1':current_target_name}), on='id')

# target 16 - lightgbm with features desc

In [75]:
current_target_name = 'target_16'

In [76]:
df = pd.read_csv('../predictions/2019.11.28 - 00:40:14 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [77]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_2')

train = train.merge(df.rename(columns={'fcozza_approach_2':current_target_name}), on='id')

# target 17 - lightgbm with new text features

In [75]:
current_target_name = 'target_17'

In [76]:
df = pd.read_csv('../predictions/2019.11.30 - 12:19:02 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [77]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_5')

train = train.merge(df.rename(columns={'fcozza_approach_5':current_target_name}), on='id')

In [78]:
train.sample(5)

Unnamed: 0,id,precio,target_2,target_15,target_16
205439,152170,1200000.0,1690990.0,1444150.9,1451783.0
237661,30682,2550000.0,3931180.0,3782170.8,3765694.0
115305,268547,1230000.0,1175270.0,1262148.2,1228714.0
190238,16015,2550000.0,4657736.0,4068437.2,3233715.0
238016,44936,2290000.0,2580847.0,2821843.2,2789336.0


In [79]:
test.sample(5)

Unnamed: 0,id,target_2,target_15,target_16
43000,6705,1619015.69,1506820.1,1105231.0
35520,241317,1402112.16,1640528.8,1295102.0
19845,16742,2096175.0,2539204.5,2132457.0
21255,148426,1122573.69,352916.53,667890.1
29479,74444,1780578.11,1850491.5,1750497.0


In [80]:
import lightgbm as lgb

model = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.05, max_depth=75,
                                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                                  n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
                                  random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
                                  subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [81]:
X = train.drop(['precio'], axis=1) #set de datos
y = train['precio'] #target

In [82]:
model = model.fit(X, y)

In [83]:
y_pred = model.predict(test)

In [84]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [85]:
df = pd.DataFrame(data={'id':test['id'], 'target':y_pred})

In [86]:
description = "Stacking approach with selected targets"
save_submission(df, description=description)

# No mejora, es mejor usar todos los targets

## Red neuronal

In [149]:
train['precio_log'] = np.log(train['precio'])

In [150]:
X = train.drop(['id','precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [151]:
seed=42

In [152]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [61]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


In [62]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

In [63]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [68]:
# define base model
def baseline_model():
    NN_model = Sequential()
    
    # The Input Layer :
    NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

    # The Hidden Layers :
    NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
    NN_model.add(Dense(256, kernel_initializer='normal',activation='softmax'))
    NN_model.add(Dense(256, kernel_initializer='normal',activation='sigmoid'))

    # The Output Layer :
    NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

    # Compile the network :
    NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    return NN_model

In [69]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [153]:
# evaluate model
estimator = KerasRegressor(build_fn=baseline_model, epochs=500, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

In [154]:
#Select numerical columns which needs to be normalized
train_norm = X_train[X_train.columns[0:16]]
test_norm = X_test[X_test.columns[0:16]]

In [155]:
from sklearn import preprocessing
# Normalize Training Data 
std_scale = preprocessing.StandardScaler().fit(train_norm)
X_train_norm = std_scale.transform(train_norm)

In [156]:
#Converting numpy array to dataframe
training_norm_col = pd.DataFrame(X_train_norm, index=train_norm.index, columns=train_norm.columns) 
X_train.update(training_norm_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [157]:
# Normalize Test Data 
X_test_norm = std_scale.transform(test_norm)

In [158]:
# Normalize Testing Data by using mean and SD of training set
testing_norm_col = pd.DataFrame(X_test_norm, index=test_norm.index, columns=test_norm.columns) 
X_test.update(testing_norm_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
estimator.fit(X_train,y_train,validation_data = (X_test, y_test))

In [165]:
y_scores = estimator.predict(X_test)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

0.2266230758359902


In [166]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

549288.5811060395


# No sirve