In [130]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

In [131]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
train_df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [132]:
# Save id column for submission and drop it from train/test dataset
id_col = test_df['id']
train_df.drop("id", axis=1, inplace=True)
test_df.drop("id", axis=1, inplace=True)

In [133]:
cat_features = ["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"]

for feature in cat_features:
    print(train_df[feature]. unique())

['A' 'B']
['B' 'A']
['A' 'B']
['A' 'C' 'D' 'B']
['B' 'C' 'A' 'D']
['D' 'B' 'A' 'C']
['A' 'B' 'D' 'C' 'E' 'I' 'G' 'H']
['E' 'B' 'D' 'G' 'F' 'A' 'C' 'I']
['C' 'A' 'G' 'E' 'D' 'F' 'B']
['I' 'F' 'N' 'K' 'B' 'L' 'G' 'H' 'O' 'A' 'J' 'M' 'C' 'D' 'E']


In [134]:
from sklearn.preprocessing import LabelEncoder

def label_encode(train_df, column):
    le = LabelEncoder()
    new_feature = "{}_le".format(column)
    le.fit(train_df[column])
    train_df[new_feature] = le.transform(train_df[column])
#     test_df[new_feature] = le.transform(test_df[column])
    return train_df

for feature in cat_features:
    tr_df = label_encode(train_df, feature)
    ts_df = label_encode(test_df, feature)

In [135]:
tr_df.drop(["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"], axis=1 ,inplace=True)
ts_df.drop(["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"], axis=1 ,inplace=True)

In [136]:
tr_df.head()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat0_le,cat1_le,cat2_le,cat3_le,cat4_le,cat5_le,cat6_le,cat7_le,cat8_le,cat9_le
0,0.923191,0.684968,0.124454,0.217886,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,...,0,1,0,0,1,3,0,4,2,8
1,0.437627,0.014213,0.357438,0.846127,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,...,1,0,0,0,1,1,0,4,0,5
2,0.732209,0.760122,0.454644,0.81299,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,...,0,0,0,2,1,3,0,1,2,13
3,0.705142,0.771678,0.153735,0.732893,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,...,0,0,0,2,1,3,0,4,6,10
4,0.486063,0.639349,0.496212,0.354186,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,...,0,1,0,0,1,1,0,4,2,5


In [137]:
# Statistics
from scipy import stats
from scipy.stats import skew, kurtosis
# Test for skewness again
ts_df.skew()

cont0      0.510475
cont1     -0.719150
cont2      0.171273
cont3      0.405217
cont4      0.747058
cont5      0.508645
cont6      0.874643
cont7      0.702705
cont8      0.720466
cont9      0.228096
cont10     0.077330
cont11    -0.024752
cont12     0.378401
cont13     0.126923
cat0_le    3.650465
cat1_le    0.161465
cat2_le    3.129097
cat3_le   -0.505880
cat4_le    8.201204
cat5_le    0.072362
cat6_le    9.658899
cat7_le   -3.213018
cat8_le    0.128413
cat9_le   -0.079691
dtype: float64

In [138]:
tr_df.drop('cat6_le', axis=1, inplace=True)
ts_df.drop('cat6_le', axis=1, inplace=True)

In [139]:
X=tr_df.drop("target", axis=1) ## independent features
y=tr_df.target ## dependent features

In [140]:
def build_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int('num_layers', 6, 20)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=32,
                                            max_value=128,
                                            step=32),
                               activation='relu'))
        
        model.add(layers.Dropout(rate=hp.Float('dropout_1',
                                        min_value=0.0,
                                        max_value=0.5,
                                        default=0.25,
                                        step=0.05)))
        
    model.add(layers.Dense(1, activation='linear'))
    model.compile(
        optimizer=keras.optimizers.RMSprop(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss=['mse'],
        metrics=['mean_squared_error'])
    return model

In [141]:
tuner = RandomSearch(
    build_model,
    objective='val_mean_squared_error',
    max_trials=4,
    executions_per_trial=2,
    directory='project1',
    project_name='Build Week1')

In [142]:
tuner.search_space_summary()


Search space summary
Default search space size: 9
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 6, 'max_value': 20, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
dropout_1 (Float)
{'default': 0.25, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.05, 'sampling': None}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
units_4 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
units_5 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': None}
learning_rate (Choice)


In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [144]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [145]:
tuner.search(X_train, y_train,
             epochs=5,
             validation_data=(X_test, y_test))

Trial 4 Complete [00h 06m 24s]
val_mean_squared_error: 0.7834339737892151

Best val_mean_squared_error So Far: 0.752607524394989
Total elapsed time: 00h 28m 06s
INFO:tensorflow:Oracle triggered exit


In [115]:
# ts_df.drop('target', axis=1, inplace=True)

In [148]:
ts_df.shape

(200000, 23)

In [153]:
best_hp = tuner.get_best_hyperparameters()[0]
model = tuner.hypermodel.build(best_hp)

In [149]:
scaled_test = scaler.transform(ts_df)

In [124]:
scaled_test = pd.DataFrame(scaled_test)

In [150]:
scaled_test

array([[-0.94384156,  0.18889729,  0.21122728, ...,  0.24211857,
         0.5878209 , -0.40233297],
       [-0.59669971,  0.4712825 , -0.73513549, ...,  0.24211857,
        -0.54308465,  1.21978919],
       [ 0.02919965,  0.47375016,  1.58298175, ...,  0.24211857,
        -0.54308465, -0.7267574 ],
       ...,
       [-0.30073498,  0.99373665,  0.01144403, ...,  0.24211857,
        -0.54308465,  0.89536476],
       [ 0.89789261, -0.37508013, -1.25393791, ...,  0.24211857,
        -0.54308465, -0.7267574 ],
       [ 0.59401466, -0.09248403, -1.5122961 , ...,  0.24211857,
         1.71872645, -0.07790854]])

In [154]:
pred=model.predict(scaled_test)

In [155]:
pred

array([[-0.00354258],
       [-0.00415176],
       [-0.00481415],
       ...,
       [-0.00308673],
       [-0.01030635],
       [-0.00579064]], dtype=float32)

In [157]:
basic_submission = {'id': id_col, 'target': pred.reshape(-1)}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv('deepL_submission2.csv', index=False)

In [62]:
import matplotlib.pyplot as plt
loss_df = pd.DataFrame(model.history.history)
loss_df.plot()

TypeError: no numeric data to plot

In [65]:
model.history

<tensorflow.python.keras.callbacks.History at 0x255764d8820>