In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, names_in_dir in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, name) for name in names_in_dir)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
%%time
train = pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv")

In [None]:
# Look at the training data; the bare expression on the last line makes the
# notebook render the DataFrame as the cell output.
train

In [None]:
# Print a summary of the training frame, mainly to check the dtypes of the columns.
train.info()

In [None]:
# Total number of missing values: per-column counts first, then summed
# into a single number for the whole frame.
train.isna().sum().sum()

In [None]:
# Labels as a NumPy array; features are every column except the row id and the label.
y_train = np.array(train['target'])
x_train = train.drop(columns=['id', 'target'])

In [None]:
# Names of the int64-typed feature columns (selecting only int64 already
# leaves out the float64 columns).
categorical_col = x_train.select_dtypes(include='int64').columns
categorical_col

In [None]:
# For each integer column, print a separator, a heading and the frequency
# of every distinct value.
for col_name in categorical_col:
    print(
        "----------------------------------------------------------",
        f"For column {col_name}:",
        x_train[col_name].value_counts(),
        sep="\n",
    )

In [None]:
# All of these columns contain only the two values 0 and 1, so we don't need to encode them.

In [None]:
# Standardize every feature column and wrap the result back into a DataFrame
# that keeps the original column names.
scaler = StandardScaler()
x_cols = x_train.columns
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_cols)
x_train

In [None]:
# Shapes of the scaled feature frame and the label array, one per line.
print(x_train.shape, y_train.shape, sep="\n")

In [None]:
# decresing the size of dataset as it is too large to train on kaggle
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size = 0.35, random_state = 0, stratify = y_train)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 0, stratify = Y_train)

In [None]:
# Too much RAM was being used because the dataset is large, so free the
# original `train` frame now that the cleaned, scaled features are in x_train.
del train

In [None]:
# Shapes of the training and validation feature sets, one per line.
print(X_train.shape, X_val.shape, sep="\n")

In [None]:
# Convert to NumPy and append a trailing axis of length 1, giving
# (rows, features, 1) arrays as expected by the Conv1D input layer.
X_train = X_train.to_numpy()[:, :, np.newaxis]
X_val = X_val.to_numpy()[:, :, np.newaxis]

In [None]:
# model creation
# A small 1D-convolutional binary classifier: two Conv1D blocks (each followed
# by batch normalization and dropout), a dense hidden layer, and a single
# sigmoid output unit.
model = Sequential()
# The per-sample input shape is taken from the reshaped training array
# (features, 1) instead of being hard-coded, so the cell keeps working if the
# number of feature columns changes.
model.add(Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=X_train.shape[1:]))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer: one sigmoid unit with an L1 penalty (0.01) on its kernel.
model.add(Dense(1, activation='sigmoid', kernel_regularizer=keras.regularizers.l1(0.01)))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Training callbacks, both driven by validation accuracy (higher is better):
#   * EarlyStopping halts training after 9 epochs without improvement and
#     restores the best weights seen.
#   * ModelCheckpoint writes to disk only when the monitored value improves.
checkpointer = [
    EarlyStopping(
        monitor='val_accuracy',
        mode='max',
        patience=9,
        restore_best_weights=True,
        verbose=1,
    ),
    ModelCheckpoint(
        filepath='model.weights.best.hdf5',
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1,
    ),
]

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()
# Maximum number of training epochs passed to model.fit.
epochs = 80

In [None]:
%%time
#training the model
history = model.fit(X_train, Y_train, epochs=epochs, validation_data=(X_val, Y_val), verbose=1, callbacks=checkpointer)

In [None]:
# Reload the weights from the file written by the ModelCheckpoint callback,
# i.e. the checkpoint with the best validation accuracy.
model.load_weights('model.weights.best.hdf5')

In [None]:
# Persist the trained model under the name "best_model".
model.save(filepath="best_model")

In [None]:
# plotting curve of model accuracy over the training time
# Early stopping can end training before the configured `epochs`, so the
# x-axis length is taken from the recorded history. A separate name is used so
# the `epochs` setting consumed by model.fit is not overwritten (re-running
# the training cell would otherwise silently train for fewer epochs).
epochs_run = len(history.history['accuracy'])
epoch_range = range(1, epochs_run + 1)
plt.plot(epoch_range, history.history['accuracy'])
plt.plot(epoch_range, history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
# Plot the training and validation loss per epoch. Each curve is labelled
# when it is drawn, and the legend picks the labels up from the plotted lines.
plt.plot(epoch_range, history.history['loss'], label='Train')
plt.plot(epoch_range, history.history['val_loss'], label='Val')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model loss')
plt.legend(loc='upper left')
plt.show()

In [None]:
# The model is ready; load the test set for prediction.
test = pd.read_csv("../input/tabular-playground-series-oct-2021/test.csv")
# Keep the ids for the submission file and use the remaining columns as features.
x_test = test.drop(columns=['id'])
test_id = test['id']
del test
# Apply the scaler that was fitted on the training features.
x_test = scaler.transform(x_test)

In [None]:
# Display the scaled test features returned by scaler.transform.
x_test

In [None]:
# Append a trailing axis of length 1 so the test features are (rows, features, 1).
x_test = x_test[:, :, np.newaxis]

In [None]:
# function for easy submission
def submission(model, filename, features=None, ids=None):
    """Predict with ``model`` and write ``Submission_file_{filename}.csv``.

    The CSV has two columns, ``id`` and ``target``, one row per prediction,
    written to the current working directory.

    Parameters
    ----------
    model
        Any object exposing ``predict(features)``; the result is flattened
        to a 1-D array of predictions.
    filename : str
        Suffix inserted into the output file name.
    features : array-like, optional
        Inputs passed to ``model.predict``. Defaults to the notebook-level
        ``x_test`` so existing two-argument calls keep working.
    ids : sequence, optional
        Row identifiers for the ``id`` column. Defaults to the notebook-level
        ``test_id``.

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of ids.
    """
    if features is None:
        features = x_test
    if ids is None:
        ids = test_id

    pred = np.asarray(model.predict(features)).reshape(-1)
    # Pair ids and predictions by position. An index-aligned concat would
    # silently produce NaN rows whenever ``ids`` carries a non-default index.
    id_values = np.asarray(ids)
    if len(pred) != len(id_values):
        raise ValueError(
            f"got {len(pred)} predictions for {len(id_values)} ids"
        )

    sub = pd.DataFrame({'id': id_values, 'target': pred})
    sub.to_csv(f"Submission_file_{filename}.csv", index=False)

In [None]:
# Write the submission CSV for the trained model (file suffix "Tensorflow").
submission(model, filename="Tensorflow")

In [None]:
# If you found anything new, please upvote the notebook, as it motivates me to make more.
# I am thinking of making a notebook using a TPU.