In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import activations
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, BatchNormalization, Dropout, InputLayer
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
import time

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train/test tables. The "time" column is parsed into
# datetimes here because date_string() later reads it through the `.dt` accessor.
train = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv",parse_dates=["time"])
test = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv",parse_dates=["time"])

In [None]:
# Separate the regression target from the features: `pop` hands back the
# 'congestion' column and removes it from `train` in place.
y = train.pop('congestion')

In [None]:
class MultiColumnLabelEncoder:
    """Apply an independent ``LabelEncoder`` to several DataFrame columns.

    Parameters
    ----------
    columns : list of column names, optional
        Columns to encode. When ``None`` every column of the frame passed to
        ``transform`` is encoded.

    Notes
    -----
    ``fit`` learns nothing: ``transform`` fits a fresh ``LabelEncoder`` on the
    data it is given every time it is called. Encoding two frames separately
    therefore only yields matching codes if both contain the same set of values.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode, or None for "all"

    def fit(self, X, y=None):
        """No-op kept for scikit-learn style chaining; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. The input frame is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent iterator and exists in older releases as well.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return ``df``.

    * ``object`` columns become ``category``.
    * Columns whose dtype name starts with ``int`` are cast to the smallest of
      int8/int16/int32 whose range contains the column's min and max
      (bounds inclusive); otherwise they are left unchanged.
    * Every other numeric column (floats, but also unsigned ints and bools,
      which do not match the ``int`` prefix) is cast to float16 or float32
      when its range fits. float16 keeps only about three significant decimal
      digits, so values may be rounded.
    * Non-numeric, non-object columns (datetime, category, ...) are skipped
      rather than compared against numeric limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # min()/max() and the range checks below only make sense for numbers.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        # Take the first (smallest) dtype that can hold the observed range.
        # An all-NaN column fails every comparison and is left unchanged.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    return df

def date_string(df):
    """Add date/time component columns derived from ``df['time']``, in place.

    New columns, in order: year, month, day, hours, minutes, seconds.
    Returns ``None``.
    """
    # new column name -> attribute of the datetime accessor
    component_of = {
        'year': 'year',
        'month': 'month',
        'day': 'day',
        'hours': 'hour',
        'minutes': 'minute',
        'seconds': 'second',
    }
    for new_column, accessor_attr in component_of.items():
        df[new_column] = getattr(df['time'].dt, accessor_attr)
    
def drop_cols(df):
    """Remove the 'time', 'row_id', 'year' and 'seconds' columns from ``df`` in place.

    Raises ``KeyError`` (from pandas) if any of them is missing. Returns ``None``.
    """
    unwanted = ['time', 'row_id', 'year', 'seconds']
    df.drop(labels=unwanted, axis='columns', inplace=True)
    
def print_unique(df):
    """Print one line per column of ``df`` giving its number of distinct values."""
    for column_name in df.columns:
        distinct_count = df[column_name].nunique()
        print(f'Column Name: {column_name}, Unique Elements: {distinct_count}')
    
def plot_dist(df, cols = 1):
    """Draw a 30-bin histogram of every column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        One histogram is drawn per column.
    cols : int, default 1
        Number of subplot columns; the number of rows is derived from it.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    n_cols = len(df.columns)
    n_rows = int(np.ceil(n_cols / float(cols)))
    fig = plt.figure(figsize=(15, 15))
    for n, col in enumerate(df.columns):
        ax = fig.add_subplot(n_rows, cols, n + 1)
        # sns.distplot is deprecated; histplot is its replacement for plain
        # histograms. Passing `ax` avoids relying on the "current axes".
        sns.histplot(df[col], bins=30, kde=False, ax=ax)
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.show()
    
def plot_columns(df, df_test, columns=None, cols = 1, plotting_type=None):
    """Plot selected columns on a grid of subplots, one subplot per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    df_test : pandas.DataFrame
        Test data; only used for the KDE comparison.
    columns : iterable of column names, optional
        Columns to plot. Defaults to every column of ``df``.
    cols : int, default 1
        Number of subplot columns; the number of rows is derived from it.
    plotting_type : {None, 'boxplot'}
        ``None`` overlays train and test KDE curves; ``'boxplot'`` draws a
        box plot of the training column. Any other value prints a message
        and returns without creating a figure.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Validate before any figure exists so a bad argument does not leave an
    # empty, never-shown figure behind.
    if plotting_type is not None and plotting_type != 'boxplot':
        print("Stop sh*tting.")
        return

    if columns is None:
        columns = list(df.columns)

    n_cols = len(columns)
    n_rows = int(np.ceil(n_cols / float(cols)))
    fig = plt.figure(figsize=(18, 18), facecolor='#EAEAF2')
    for n, title in enumerate(columns):
        ax = fig.add_subplot(n_rows, cols, n + 1)
        if plotting_type is None:
            sns.kdeplot(df[str(title)], color='#58D68D', label='Train data', ax=ax)
            sns.kdeplot(df_test[str(title)], color='#DE3163', label='Test data', ax=ax)
        else:
            sns.boxplot(y=df[str(title)], color='#58D68D', ax=ax)
            #sns.boxplot(y=df_test[str(title)], color='#DE3163')
        ax.set_ylabel('')
        ax.set_xlabel(title, fontsize=8, fontweight='bold')
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.show()

In [None]:
# Expand "time" into component columns, then drop the columns listed in
# drop_cols() ('time', 'row_id', 'year', 'seconds'). Both helpers mutate the
# frame in place, so this cell cannot be re-run on already-processed frames
# ('time' is gone after the first run).
date_string(train)
drop_cols(train)
date_string(test)
drop_cols(test)

In [None]:
# Integer-typed columns are treated as numeric features and everything else as
# categorical. The check accepts any integer width: the `.dt` fields produced
# by date_string() are int32 on pandas >= 2 and int64 on older releases, and a
# literal `== 'int64'` test would silently push them into cat_cols.
num_cols = [col for col in train.columns if pd.api.types.is_integer_dtype(train[col])]
cat_cols = [col for col in train.columns if col not in num_cols]

#plot_dist(train[num_cols], cols=3)

In [None]:
#plot_dist(test[num_cols], cols=3)

In [None]:
#X = MultiColumnLabelEncoder(columns = cat_cols).fit_transform(train)
#test_encoded = MultiColumnLabelEncoder(columns = cat_cols).transform(test)

# One-hot encode the categorical columns; the first level of each is dropped.
X = pd.get_dummies(train, columns=cat_cols, drop_first=True)

# Encode every level present in test, then align to the training layout so
# both matrices have the same columns in the same order. Levels missing from
# test become all-zero columns; levels unseen in training (and the level that
# training dropped) are discarded.
test_encoded = pd.get_dummies(test, columns=cat_cols).reindex(columns=X.columns, fill_value=0)

In [None]:
# Scale the numeric columns with a MinMaxScaler fitted on the training
# features only; the same fitted scaler is then applied to the test features.
MMS = MinMaxScaler()
# NOTE(review): SC is created but not used in any of the visible cells.
SC = StandardScaler()
X[num_cols] = MMS.fit_transform(X[num_cols])
test_encoded[num_cols] = MMS.transform(test_encoded[num_cols])

In [None]:
# Training configuration; epochs, batch_size and folds are passed to
# cross_validate_model() further down.
# NOTE(review): no NumPy/TensorFlow random seed is set in the visible cells,
# so weight initialisation and dropout differ from run to run.

# Per-sample shape of X (every dimension after the first). Not referenced by
# the later visible cells; build_model() reads X.shape[-1] itself.
input_shape = X.shape[1:]
# Upper bound on epochs per fold.
epochs = 100
batch_size = 2048
# Number of K-fold splits.
folds = 20

In [None]:
# Downcast both feature frames. reduce_memory_usage() mutates its argument, so
# the returned frames do not need to be rebound.
reduce_memory_usage(X)
reduce_memory_usage(test_encoded)

In [None]:
# Convert the feature frames to NumPy arrays for Keras. X is rebound from a
# DataFrame to an ndarray here, so this cell only works once per kernel run
# (an ndarray has no .to_numpy()).
X = X.to_numpy()
X_test = test_encoded.to_numpy()

In [None]:
# Drop the DataFrame references; the later visible cells only use the arrays
# X and X_test and the target y.
del train, test, test_encoded

In [None]:
def build_model(X):
    """Build the (uncompiled) fully connected regression network.

    Architecture: input -> [Dense(1024) -> Dropout(0.3)] -> [Dense(512) ->
    Dropout(0.2)] -> [Dense(256) -> Dropout(0.1)] -> Dense(128) -> Dense(1).
    Hidden layers use SELU; the output layer is linear.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Only ``X.shape[-1]`` (the number of features) is read.

    Returns
    -------
    tensorflow.keras.Sequential
        The model; the caller is responsible for compiling it.
    """
    n_features = X.shape[-1]
    model = Sequential()
    # The shape must be a tuple: `(n)` is just the int n, `(n,)` is the 1-tuple.
    model.add(InputLayer(input_shape=(n_features,)))
    for units, drop_perc in zip([1024, 512, 256], [0.3, 0.2, 0.1]):
        model.add(Dense(units, activation='selu', kernel_initializer="lecun_normal"))
        model.add(Dropout(drop_perc))
    model.add(Dense(128, activation='selu'))
    model.add(Dense(1, activation='linear'))
    return model

In [None]:
# Build the network and compile it to minimise mean absolute error, which is
# also tracked as a metric so the callbacks below can monitor it.
NN_Model = build_model(X)
NN_Model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

# Halve the learning rate after 5 epochs without improvement in validation MAE.
RLR = ReduceLROnPlateau(monitor="val_mean_absolute_error", factor=0.5, patience=5)

# Stop after 10 epochs without improvement and roll back to the best weights.
ES = EarlyStopping(monitor="val_mean_absolute_error", patience=10, restore_best_weights=True)

callbacks = [RLR, ES]

In [None]:
def cross_validate_model(model, n_splits=5, batch_size=64, epochs=50, X_test=X_test):
    """Train ``model`` with shuffled K-fold cross-validation and average test predictions.

    The function reads the module-level training features ``X``, targets ``y``
    and the ``callbacks`` list.

    Every fold starts from the weights (and, when it is a plain number, the
    learning rate) the model had when this function was called. Without that
    reset the network would keep training across folds, so each fold's
    validation rows would already have been seen in earlier folds, and a
    learning rate lowered by a callback would carry over. Optimizer slot
    variables (e.g. Adam moments) are not reset.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place; after the call it holds the last fold's weights.
    n_splits : int, default 5
        Number of folds.
    batch_size : int, default 64
    epochs : int, default 50
        Maximum epochs per fold.
    X_test : array-like
        Test features, predicted once per fold.

    Returns
    -------
    maes : list of float
        Validation MAE of each fold.
    y_test : numpy.ndarray of shape (len(X_test), 1)
        Mean of the per-fold test predictions.
    histories : list
        The object returned by ``model.fit`` for each fold.
    """
    KF = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    maes, histories = [], []
    y_test = np.zeros((X_test.shape[0], 1))

    # KFold yields positions; index a plain array so the lookup is positional
    # regardless of the index labels `y` carries.
    y_values = np.asarray(y)

    # Snapshot the starting state so each fold is trained independently.
    initial_weights = model.get_weights()
    try:
        initial_lr = float(np.asarray(model.optimizer.learning_rate))
    except (AttributeError, TypeError, ValueError):
        initial_lr = None  # e.g. a learning-rate schedule object; leave it alone

    for index, (train_index, val_index) in enumerate(KF.split(X, y)):
        print(f"Fold {index+1} out of {n_splits}")
        start = time.time()

        model.set_weights(initial_weights)
        if initial_lr is not None:
            model.optimizer.learning_rate = initial_lr

        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y_values[train_index], y_values[val_index]
        history = model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(X_val, y_val),
            verbose=False
            )
        histories.append(history)
        y_preds = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_preds)
        print(f"MAE: {mae}")
        y_test += model.predict(X_test)
        maes.append(mae)
        end = time.time()
        print(f'This Fold {index+1}, took {end - start} seconds.')

    return maes, y_test/n_splits, histories

In [None]:
# Run the K-fold loop with the notebook-level settings. Returns the per-fold
# validation MAEs, the test predictions averaged over the folds, and the
# per-fold objects returned by model.fit.
maes, y_test, histories = cross_validate_model(NN_Model, 
                                               n_splits=folds, 
                                               batch_size=batch_size, 
                                               epochs=epochs, 
                                               X_test=X_test)

In [None]:
# Fill the submission template's 'congestion' column with the fold-averaged
# predictions and write it out without the index column.
sample_submission = (
    pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")
    .assign(congestion=y_test)
)
sample_submission.to_csv("submission.csv", index=False)