In [73]:
import pandas as pd
import numpy as np

In [74]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [75]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [76]:
train = pd.read_csv('../data/train.csv', usecols=['id','precio'])
test = pd.read_csv('../data/test.csv', usecols=['id'])

In [77]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

# Agregando columnas de predicciones anteriores

# target 1 - rf

In [78]:
current_target_name = 'target_1'

In [79]:
df = pd.read_csv('../predictions/2019.10.28 - 09:12:00 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [80]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_1')

train = train.merge(df.rename(columns={'approach_1':current_target_name}), on='id')

# target 2 - rf + one hot + svd

In [81]:
current_target_name = 'target_2'

In [82]:
df = pd.read_csv('../predictions/2019.10.28 - 12:03:46 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [83]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_2')

train = train.merge(df.rename(columns={'rozanecm_approach_2':current_target_name}), on='id')

# target 3 - rf + one hashing vectorizer + svd

In [84]:
current_target_name = 'target_3'

In [85]:
df = pd.read_csv('../predictions/2019.10.28 - 14:40:22 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [86]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_3')

train = train.merge(df.rename(columns={'rozanecm_approach_3':current_target_name}), on='id')

# target 4 - rf + stopwords

In [87]:
current_target_name = 'target_4'

In [88]:
df = pd.read_csv('../predictions/2019.10.28 - 16:36:13 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [89]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_4')

train = train.merge(df.rename(columns={'rozanecm_approach_4':current_target_name}), on='id')

# target 5 -lightgbm

In [90]:
current_target_name = 'target_5'

In [91]:
df = pd.read_csv('../predictions/2019.10.29 - 11:59:35 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [92]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_7')

train = train.merge(df.rename(columns={'rozanecm_approach_7':current_target_name}), on='id')

# target 6 - lightgbm with grid search

In [93]:
current_target_name = 'target_6'

In [94]:
df = pd.read_csv('../predictions/2019.10.29 - 13:19:02 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [95]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_8')

train = train.merge(df.rename(columns={'rozanecm_approach_8':current_target_name}), on='id')

# target 7 - lightgbm with grid search

In [96]:
current_target_name = 'target_7'

In [97]:
df = pd.read_csv('../predictions/2019.11.11 - 13:07:13 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [98]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_10')

train = train.merge(df.rename(columns={'rozanecm_approach_10':current_target_name}), on='id')

# target 8 - lightgbm grid search + feat eng

In [99]:
current_target_name = 'target_8'

In [100]:
df = pd.read_csv('../predictions/2019.10.31 - 20:26:47 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [101]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_1_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_1_with_feat_eng':current_target_name}), on='id')

# target 9 - light gbm grid search all train set + feat eng

In [102]:
current_target_name = 'target_9'

In [103]:
df = pd.read_csv('../predictions/2019.11.02 - 14:42:20 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [104]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_3_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_3_with_feat_eng':current_target_name}), on='id')

# target 10 - feat selection rf y lightgbm grid search

In [105]:
current_target_name = 'target_10'

In [106]:
df = pd.read_csv('../predictions/2019.11.02 - 17:01:06 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [107]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_4_with_feat_eng')

train = train.merge(df.rename(columns={'rozanecm_approach_4_with_feat_eng':current_target_name}), on='id')

# target 11 - ?

In [108]:
current_target_name = 'target_11'

In [109]:
df = pd.read_csv('../predictions/2019.11.20 - 02_33_23 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [110]:
df = pd.read_csv('../predictions/on_train_data/on_train_data_rozanecm_approach_16')

train = train.merge(df.rename(columns={'rozanecm_approach_16':current_target_name}), on='id')

# target 12 - lightgbm log precio

In [111]:
current_target_name = 'target_12'

In [112]:
df = pd.read_csv('../predictions/2019.11.20 - 19:09:38 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [113]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_19')

train = train.merge(df.rename(columns={'rozanecm_approach_19':current_target_name}), on='id')

# target 13 - lightgbm log precio y skewed features

In [114]:
current_target_name = 'target_13'

In [115]:
df = pd.read_csv('../predictions/2019.11.20 - 21:19:47 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [116]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_20')

train = train.merge(df.rename(columns={'rozanecm_approach_20':current_target_name}), on='id')

# target 14 - lightgbm new features

In [117]:
current_target_name = 'target_14'

In [118]:
df = pd.read_csv('../predictions/2019.11.21 - 15:06:51 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [119]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_24')

train = train.merge(df.rename(columns={'rozanecm_approach_24':current_target_name}), on='id')

# target 15 - xgboost tunned 

In [120]:
current_target_name = 'target_15'

In [121]:
df = pd.read_csv('../predictions/2019.11.28 - 00:25:43 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [122]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_1')

train = train.merge(df.rename(columns={'fcozza_approach_1':current_target_name}), on='id')

# target 16 - lightgbm with features desc

In [123]:
current_target_name = 'target_16'

In [124]:
df = pd.read_csv('../predictions/2019.11.28 - 00:40:14 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [125]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_2')

train = train.merge(df.rename(columns={'fcozza_approach_2':current_target_name}), on='id')

# target 17 - lightgbm with new text feat

In [126]:
current_target_name = 'target_17'

In [127]:
df = pd.read_csv('../predictions/2019.11.30 - 12:19:02 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [128]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_5')

train = train.merge(df.rename(columns={'fcozza_approach_5':current_target_name}), on='id')

In [129]:
train.sample(5)

Unnamed: 0,id,precio,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17
137726,93279,7800000.0,4923720.84,5078193.53,5304103.55,5797356.0,5115557.0,4861107.0,5135047.0,5166661.0,5673698.0,5110931.0,5458496.34,4226406.0,6216833.0,4160414.0,5331652.5,4927300.0,4769391.0
196027,206804,4192000.0,3748183.06,3711133.6,3590513.24,3408895.25,3347048.0,3824682.0,3291962.0,3526078.0,3198481.0,3238020.0,3425200.0,3677566.0,3648601.0,3363358.0,4115612.8,3788114.0,3823167.0
149294,265140,475000.0,723048.7,552810.87,555611.55,573113.08,550949.9,470806.8,345327.4,563645.6,445874.1,486424.0,735418.8,599299.8,565139.2,540046.0,451901.28,610665.4,636318.0
55583,241164,750000.0,789974.04,768587.42,776632.13,812468.2,786749.8,707129.0,736882.8,769699.0,733254.9,759133.2,812307.4,747670.3,756262.8,840878.7,953797.3,762194.0,751362.8
81033,161213,5600000.0,5156700.0,5188300.0,4359250.0,4857300.0,5182950.0,5312466.0,5062919.0,4964568.0,5580776.0,5353965.0,5410500.0,5062263.0,4548303.0,4886591.0,5122211.0,4667910.0,4727962.0


In [130]:
test.sample(5)

Unnamed: 0,id,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17
52396,292981,1546760.0,1464624.0,2212915.2,1482183.0,1199277.0,1164041.0,1185496.0,1060365.0,1016883.0,1088908.0,1476700.0,1009869.0,1058461.0,1302872.0,1139158.8,1041245.0,989746.3
40645,146844,1967363.0,1467699.0,1495714.0,1533521.44,1526588.0,1493444.0,1862530.0,1549866.0,1352820.0,1339078.0,1581280.0,1370218.0,1323756.0,1356221.0,1165937.6,1303675.0,1173082.0
57256,176597,6529570.0,6666877.99,6942028.57,6308494.85,4793649.0,6150862.0,7014235.0,5007758.0,4597253.0,4676457.0,5018890.0,4745220.0,4592444.0,4792832.0,5435439.0,5203032.0,5067380.0
18654,235012,3710150.0,3666567.25,4283464.73,3921530.7,4107257.0,3462292.0,2958279.0,3547894.0,4407042.0,4645773.0,3367901.08,3491945.0,3632566.0,3200653.0,4762737.0,3986284.0,3281389.0
29687,60778,4928904.0,2142000.0,2135912.06,2373019.98,2289416.0,2280152.0,2661108.0,2448630.0,2067041.0,2064862.0,2089721.44,2186249.0,2240048.0,2209600.0,2357679.8,2263127.0,2286000.0


# Light gbm with grid search for stacking

## Entrenamiento local

In [131]:
train['precio_log'] = np.log(train['precio'])

In [132]:
X = train.drop(['id','precio','precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [133]:
seed = 42

In [134]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [135]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
gbm = lgb.LGBMRegressor(silent=False)

param_dist = {"boosting_type":['gbdt','dart'],
              "max_depth": [25,50,75],
              "learning_rate" : [0.001,0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [50,100,200],
             }

grid_search = GridSearchCV(gbm, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)

In [137]:
import time

In [138]:
%%time
grid_search.fit(X_train,y_train)
grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 33.4min finished


CPU times: user 14.1 s, sys: 815 ms, total: 14.9 s
Wall time: 33min 25s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [143]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

gbm_optimized = grid_search.best_estimator_

In [144]:
%%time
gbm_optimized.fit(X_train, y_train, sample_weight=None, init_score=None, eval_set=[(X_test,y_test)], eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=10,
            verbose=False, feature_name='auto', callbacks=None)

CPU times: user 12.2 s, sys: 98 ms, total: 12.3 s
Wall time: 1.63 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [145]:
y_scores = gbm_optimized.predict(X_test, num_iteration=gbm_optimized.best_iteration_)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

0.20681735942366397


In [146]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

506362.0249810891


## Entrenamiento con todos los datos para obtener predicciones a subir

In [147]:
%%time
gbm_optimized.fit(X,y, sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', callbacks=None)

CPU times: user 13.3 s, sys: 55.2 ms, total: 13.3 s
Wall time: 1.75 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=25,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=300, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [148]:
# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id'],axis=1), num_iteration=gbm_optimized.best_iteration_)

In [149]:
exp_predictions = np.exp(predictions)

In [150]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [151]:
description = "Stacking apporach with all targets with grid search and with new text features"
save_submission(df, description=description)

# Stacking solo para los mejores scores y sin relacion entre ellos

# target 2 - rf + one hot + svd

In [69]:
current_target_name = 'target_2'

In [70]:
df = pd.read_csv('../predictions/2019.10.28 - 12:03:46 by rozanecm.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [71]:
df = pd.read_csv('../predictions/on_train_data/rozanecm_approach_2')

train = train.merge(df.rename(columns={'rozanecm_approach_2':current_target_name}), on='id')

# target 15 - xgboost tunned 

In [72]:
current_target_name = 'target_15'

In [73]:
df = pd.read_csv('../predictions/2019.11.28 - 00:25:43 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [74]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_1')

train = train.merge(df.rename(columns={'fcozza_approach_1':current_target_name}), on='id')

# target 16 - lightgbm with features desc

In [75]:
current_target_name = 'target_16'

In [76]:
df = pd.read_csv('../predictions/2019.11.28 - 00:40:14 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [77]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_2')

train = train.merge(df.rename(columns={'fcozza_approach_2':current_target_name}), on='id')

# target 17 - lightgbm with new text features

In [75]:
current_target_name = 'target_17'

In [76]:
df = pd.read_csv('../predictions/2019.11.30 - 12:19:02 by fcozza.csv')

test = test.merge(df.rename(columns={'target':current_target_name}), on='id')

In [77]:
df = pd.read_csv('../predictions/on_train_data/fcozza_approach_5')

train = train.merge(df.rename(columns={'fcozza_approach_5':current_target_name}), on='id')

In [78]:
train.sample(5)

Unnamed: 0,id,precio,target_2,target_15,target_16
205439,152170,1200000.0,1690990.0,1444150.9,1451783.0
237661,30682,2550000.0,3931180.0,3782170.8,3765694.0
115305,268547,1230000.0,1175270.0,1262148.2,1228714.0
190238,16015,2550000.0,4657736.0,4068437.2,3233715.0
238016,44936,2290000.0,2580847.0,2821843.2,2789336.0


In [79]:
test.sample(5)

Unnamed: 0,id,target_2,target_15,target_16
43000,6705,1619015.69,1506820.1,1105231.0
35520,241317,1402112.16,1640528.8,1295102.0
19845,16742,2096175.0,2539204.5,2132457.0
21255,148426,1122573.69,352916.53,667890.1
29479,74444,1780578.11,1850491.5,1750497.0


In [80]:
import lightgbm as lgb

model = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.05, max_depth=75,
                                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                                  n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
                                  random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
                                  subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [81]:
X = train.drop(['precio'], axis=1) #set de datos
y = train['precio'] #target

In [82]:
model = model.fit(X, y)

In [83]:
y_pred = model.predict(test)

In [84]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [85]:
df = pd.DataFrame(data={'id':test['id'], 'target':y_pred})

In [86]:
description = "Stacking approach with selected targets"
save_submission(df, description=description)

# No mejora, es mejor usar todos los targets

## Red neuronal

In [149]:
train['precio_log'] = np.log(train['precio'])

In [150]:
X = train.drop(['id','precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [151]:
seed=42

In [152]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [61]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


In [62]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

In [63]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [68]:
# define base model
def baseline_model():
    NN_model = Sequential()
    
    # The Input Layer :
    NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

    # The Hidden Layers :
    NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
    NN_model.add(Dense(256, kernel_initializer='normal',activation='softmax'))
    NN_model.add(Dense(256, kernel_initializer='normal',activation='sigmoid'))

    # The Output Layer :
    NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

    # Compile the network :
    NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    return NN_model

In [69]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [153]:
# evaluate model
estimator = KerasRegressor(build_fn=baseline_model, epochs=500, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

In [154]:
#Select numerical columns which needs to be normalized
train_norm = X_train[X_train.columns[0:16]]
test_norm = X_test[X_test.columns[0:16]]

In [155]:
from sklearn import preprocessing
# Normalize Training Data 
std_scale = preprocessing.StandardScaler().fit(train_norm)
X_train_norm = std_scale.transform(train_norm)

In [156]:
#Converting numpy array to dataframe
training_norm_col = pd.DataFrame(X_train_norm, index=train_norm.index, columns=train_norm.columns) 
X_train.update(training_norm_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [157]:
# Normalize Test Data 
X_test_norm = std_scale.transform(test_norm)

In [158]:
# Normalize Testing Data by using mean and SD of training set
testing_norm_col = pd.DataFrame(X_test_norm, index=test_norm.index, columns=test_norm.columns) 
X_test.update(testing_norm_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [None]:
estimator.fit(X_train,y_train,validation_data = (X_test, y_test))

In [165]:
y_scores = estimator.predict(X_test)

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

0.2266230758359902


In [166]:
print(mean_absolute_error(np.exp(y_test), np.exp(y_scores)))

549288.5811060395


# No sirve