In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the zipped training CSV into a DataFrame.
df = pd.read_csv("/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip", compression='zip')
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
df.head()

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_set, valid_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each partition, one per line.
print(train_set.shape, valid_set.shape, sep="\n")

In [None]:
# Separate each partition into string-typed (categorical) and int64 (numeric)
# feature frames. 'ID' is an int64 column, so it is dropped from the numeric
# frame to keep it out of the model inputs.

# training partition
X_train_cat = train_set.select_dtypes(include='object').copy()
X_train_num = train_set.select_dtypes(include='int64').drop(columns='ID')

# validation partition
X_valid_cat = valid_set.select_dtypes(include='object').copy()
X_valid_num = valid_set.select_dtypes(include='int64').drop(columns='ID')

# regression targets as NumPy arrays
y_train = train_set['y'].to_numpy()
y_valid = valid_set['y'].to_numpy()

In [None]:
import tensorflow as tf

class OneHotEncodingLayer(tf.keras.layers.Layer):
    """One-hot encode a single string feature using a fixed vocabulary.

    Each input string is looked up in a static vocabulary table; strings that
    are not in ``vocab`` are hashed into one of ``num_oov_buckets`` extra
    slots. The output has ``len(vocab) + num_oov_buckets`` columns.

    Args:
        vocab: list of known category strings; position in the list is the
            one-hot index.
        num_oov_buckets: number of additional out-of-vocabulary slots.
        **kwargs: standard ``tf.keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, vocab, num_oov_buckets=1,  **kwargs):
        # Forward kwargs so that `name`/`dtype` are honoured and a
        # get_config() -> from_config() round trip does not drop them.
        super().__init__(**kwargs)
        self.vocab = vocab
        self.num_oov_buckets = num_oov_buckets
        # The vocabulary is already known here, so build the table right away;
        # the layer is then usable even if adapt() is never called.
        self._build_table()

    def _build_table(self):
        """(Re)create the string -> index lookup table from ``self.vocab``."""
        indices = tf.range(len(self.vocab), dtype=tf.int64)
        table_init = tf.lookup.KeyValueTensorInitializer(self.vocab, indices)
        self.table = tf.lookup.StaticVocabularyTable(table_init, self.num_oov_buckets)

    def adapt(self, data=None):
        """Rebuild the lookup table.

        ``data`` is accepted for API compatibility but is not inspected: the
        vocabulary comes from the constructor argument.
        """
        self._build_table()

    def call(self, inputs):
        """Map string inputs to one-hot rows and drop axis 1 of the result."""
        cat_indices = self.table.lookup(inputs)
        one_hot =  tf.one_hot(cat_indices, depth=len(self.vocab)+self.num_oov_buckets)
        # axis 1 must have size 1 (e.g. inputs declared with shape=(1,)).
        return tf.squeeze(one_hot, axis=1)

    def get_config(self):
        """Return the base layer config extended with this layer's arguments."""
        base_config = super().get_config()
        return {**base_config, "vocab":self.vocab, "num_oov_buckets":self.num_oov_buckets}

# Keras Functional API

In [None]:
# categorical features one hot encoding
# One encoder per categorical column. The vocabulary is the sorted set of
# values present in the training split (np.unique returns sorted uniques).
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
one_hot_encoders = [
    OneHotEncodingLayer(vocab=np.unique(data.values).tolist())
    for col, data in X_train_cat.items()
]

for encoder, (col, data) in zip(one_hot_encoders, X_train_cat.items()):
    encoder.adapt(data.values)

# One scalar string input per categorical column, in column order.
cat_inputs = [tf.keras.Input(shape=(1,), dtype=tf.string) for _ in X_train_cat.columns]
encoder_outs = [encoder(cat_input) for encoder, cat_input in zip(one_hot_encoders, cat_inputs)]

# concatenate categorical features
concat = tf.keras.layers.concatenate(encoder_outs, axis=-1)

# numerical features are binary
# shape must be a tuple: (n,) rather than the bare int (n).
num_inputs = tf.keras.Input(shape=(X_train_num.shape[1],))

# concatenate all features
concat_all = tf.keras.layers.concatenate([concat, num_inputs], axis=-1)

# build DNN
dense1 = tf.keras.layers.Dense(300, activation='relu')(concat_all)
dense2 = tf.keras.layers.Dense(200, activation='relu')(dense1)
dense3 = tf.keras.layers.Dense(100, activation='relu')(dense2)
# Single linear unit: the model outputs one regression value per row.
outputs = tf.keras.layers.Dense(1)(dense3)

# Inputs are nested as [list of categorical inputs, numeric input]; data passed
# to fit/evaluate/predict must follow the same nesting.
model = tf.keras.models.Model(inputs=[cat_inputs, num_inputs], outputs=outputs)

In [None]:
K = tf.keras.backend

class ExponentialLearningRate(tf.keras.callbacks.Callback):
    """Multiply the optimizer's learning rate by ``factor`` after every batch.

    The learning rate in effect for each batch and the loss reported for that
    batch are recorded in ``rates`` and ``losses`` (same length, same order).

    Args:
        factor: multiplicative step applied to the learning rate per batch.
    """

    def __init__(self, factor):
        # Let the base class initialise its own attributes (model, params, ...).
        super().__init__()
        self.factor = factor
        self.rates = []
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        """Record (learning rate, loss) for this batch, then raise the rate."""
        optimizer = self.model.optimizer
        # `learning_rate` is the canonical attribute; `lr` is a deprecated alias.
        current_rate = K.get_value(optimizer.learning_rate)
        loss = (logs or {}).get("loss")
        # Only record complete pairs so that rates and losses stay aligned.
        if loss is not None:
            self.rates.append(current_rate)
            self.losses.append(loss)
        K.set_value(optimizer.learning_rate, current_rate * self.factor)

In [None]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)


# One array per categorical column, iterated in the same column order that was
# used to create the model's categorical inputs.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
train_cat_in = [data.values for col, data in X_train_cat.items()]
valid_cat_in = [data.values for col, data in X_valid_cat.items()]

expon_lr =  ExponentialLearningRate(factor=1.005)

model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), metrics=['mae'])

# The sweep below trains the model with an ever-growing learning rate. Snapshot
# the weights first so they can be put back afterwards; otherwise any later
# training would start from whatever state the sweep left behind.
initial_weights = model.get_weights()

model.fit([train_cat_in, X_train_num.values], y_train, epochs=10, validation_data=((valid_cat_in, X_valid_num.values), y_valid), callbacks=[expon_lr])

# Undo the sweep's weight updates; only expon_lr.rates / expon_lr.losses are kept.
model.set_weights(initial_weights)

In [None]:
import matplotlib.pyplot as plt

# Loss versus learning rate recorded by the per-batch exponential sweep.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(expon_lr.rates, expon_lr.losses)
ax.set_xscale('log')
# Horizontal reference line at the lowest loss seen during the sweep.
ax.hlines(min(expon_lr.losses), min(expon_lr.rates), max(expon_lr.rates))
# Clip the y-axis at the first recorded loss so late blow-ups do not
# flatten the informative part of the curve.
ax.axis([min(expon_lr.rates), max(expon_lr.rates), 0, expon_lr.losses[0]])
ax.grid()
ax.set_title('Exponential Scheduling (per batch)')
ax.set_xlabel("Learning rate")
ax.set_ylabel("Loss")
plt.show()

In [None]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Stop once val_loss (the callback's default monitor) has not improved for 10
# epochs, and roll the model back to the best epoch's weights. Without
# restore_best_weights the model would keep the weights from 10 epochs past
# its best validation loss.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Recompiling gives the model a fresh Adam optimizer at the chosen learning rate.
model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['mae'])

model.fit([train_cat_in, X_train_num.values], y_train, epochs=100, 
          validation_data=((valid_cat_in, X_valid_num.values), y_valid), callbacks=[early_stopping_cb])

In [None]:
# Score the trained model on the validation split (compiled loss and metrics).
model.evaluate(
    x=(valid_cat_in, X_valid_num.values),
    y=y_valid,
)

# Make Predictions

In [None]:
# Load the zipped test CSV and preview its first rows.
test = pd.read_csv(
    "/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip",
    compression='zip',
)
test.head()

In [None]:
# Split the test frame the same way as the training data: string columns are
# categorical, int64 columns are numeric, and 'ID' is not a model input.
X_test_cat = test.select_dtypes(include='object').copy()
X_test_num = test.select_dtypes(include='int64').drop(columns='ID')

# One array per categorical column.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
test_cat_in = [data.values for col, data in X_test_cat.items()]

# The model ends in Dense(1), so predict() returns an (n, 1) array; flatten it
# to 1-D before storing it as a column.
test['y'] = model.predict((test_cat_in, X_test_num.values)).ravel()
test['y'].values

In [None]:
# Keep only the row identifier and the predicted target for the submission.
submission = test.loc[:, ['ID', 'y']]
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)