In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping , ModelCheckpoint

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the competition's training and test tables from the Kaggle input folder.
train, test = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
    ),
)

In [None]:
# Display the raw training table for a first visual inspection.
train

In [None]:
# Print the frame summary to check the column dtypes.
train.info()

In [None]:
# Separate the target ('claim') from the features; 'id' is dropped as well so
# that it is not fed to the model as an input column.
y_train = np.array(train["claim"])
x_train = train.drop(columns=["id", "claim"])
# Remember the feature names: later steps return bare arrays and need them
# to rebuild DataFrames.
x_cols = x_train.columns

In [None]:
# All features are numeric, so missing entries are replaced by the column mean.
# NOTE(review): the imputer is fitted on the full training table, i.e. before
# the train/validation split made in a later cell, so validation rows
# contribute to the means.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(x_train)
x_train = imputer.transform(x_train)

In [None]:
# The imputer returned a plain array; wrap it again with the feature names.
x_train = pd.DataFrame(x_train, columns=x_cols)

In [None]:
# Count the rows per target class to see whether the dataset is balanced.
train["claim"].value_counts()

In [None]:
# Standardise every feature; the fitted scaler is kept so the test set can be
# transformed with the same statistics later on.
# NOTE(review): like the imputer, the scaler is fitted before the
# train/validation split, so validation rows influence the statistics.
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_cols)
x_train

In [None]:
# Sanity check: features and target must have the same number of rows.
print(x_train.shape, y_train.shape, sep="\n")

In [None]:
# Hold out 20% of the rows for validation (4:1 split). The split is stratified
# on the target and seeded so that it is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=0,
)

In [None]:
# Show the sizes of the training and validation parts.
print(X_train.shape, X_val.shape, sep="\n")

In [None]:
# Add a trailing channel axis so the arrays are (rows, features, 1), matching
# the (features, 1) input_shape given to the first Conv1D layer.
# Row and feature counts are taken from the data instead of being hard-coded,
# and np.asarray accepts both a DataFrame and an already reshaped array, so the
# cell can be re-run safely.
X_train = np.asarray(X_train).reshape(len(X_train), -1, 1)
X_val = np.asarray(X_val).reshape(len(X_val), -1, 1)

In [None]:
# Model: two Conv1D blocks (each followed by batch normalisation and dropout),
# then a dense head ending in a single sigmoid unit for the binary target.
model = Sequential([
    # The input shape is read from the reshaped training array, i.e.
    # (features, 1), instead of hard-coding the number of features.
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=X_train.shape[1:]),
    BatchNormalization(),
    Dropout(0.2),

    Conv1D(filters=64, kernel_size=2, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    # L1 penalty on the output layer's kernel.
    Dense(1, activation='sigmoid', kernel_regularizer=keras.regularizers.l1(0.01)),
])
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Training callbacks, both driven by validation accuracy (higher is better):
#  - EarlyStopping ends training after 9 epochs without improvement and
#    restores the best weights seen so far;
#  - ModelCheckpoint writes 'model.weights.best.hdf5' each time the metric improves.
checkpointer = [
    EarlyStopping(monitor='val_accuracy', verbose=1, restore_best_weights=True, mode="max", patience=9),
    ModelCheckpoint(
        filepath='model.weights.best.hdf5',
        monitor="val_accuracy",
        verbose=1,
        save_best_only=True,
        mode="max",
    ),
]

In [None]:
# Maximum number of training epochs; the EarlyStopping callback may end
# training before this budget is used up.
epochs = 80
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train the network; validation metrics are computed on the held-out split
# after every epoch and drive the early-stopping / checkpoint callbacks.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=epochs,
    callbacks=checkpointer,
    verbose=1,
)

In [None]:
# Reload the checkpoint written by ModelCheckpoint (best validation accuracy).
model.load_weights('model.weights.best.hdf5')

In [None]:
# Persist the model, now carrying the reloaded checkpoint weights, as "best_model".
model.save("best_model")

In [None]:
# Plot training vs. validation accuracy for every epoch that actually ran.
# The number of completed epochs is stored under its own name so that the
# configured `epochs` budget used by model.fit is not overwritten when early
# stopping ended training sooner.
n_epochs_run = len(history.history['accuracy'])
# 1-based epoch numbers for the x axis; reused by the loss plot in the next cell.
epoch_range = range(1, n_epochs_run + 1)
plt.plot(epoch_range, history.history['accuracy'])
plt.plot(epoch_range, history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
# Plot training vs. validation loss per epoch (both curves in one call).
plt.plot(
    epoch_range, history.history['loss'],
    epoch_range, history.history['val_loss'],
)
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
# Prepare the test set for prediction. The imputer and scaler fitted on the
# training features are reused, so test rows get the same transformation.
test_id = test["id"]
# Fill NaNs with the training-set column means.
x_test = imputer.transform(test.drop(columns=["id"]))
# Restore the feature names before scaling.
x_test = scaler.transform(pd.DataFrame(x_test, columns=x_cols))

In [None]:
# Inspect the scaled test features (the output of scaler.transform).
x_test

In [None]:
# Reshape the test dataset to (rows, features, 1), the layout the model was
# trained on. The sizes are derived from the array itself rather than
# hard-coded, which also makes the cell safe to re-run.
x_test = np.asarray(x_test).reshape(len(x_test), -1, 1)

In [None]:
# function for easy submission
def submission(model, filename, features=None, ids=None):
    """Predict with ``model`` and write a two-column (id, claim) submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)``; its output is flattened to
        one value per row.
    filename : str
        Suffix of the output file ``Submission_file_{filename}.csv``, written
        to the current working directory.
    features : array-like, optional
        Model input. Defaults to the notebook-level ``x_test``.
    ids : array-like, optional
        Row identifiers for the ``id`` column. Defaults to the notebook-level
        ``test_id``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    if features is None:
        features = x_test
    if ids is None:
        ids = test_id

    claim = np.asarray(model.predict(features)).reshape(-1)
    # Ids are paired with predictions by position. Concatenating on the pandas
    # index would misalign rows (and insert NaNs) when the id Series does not
    # carry a 0..n-1 index.
    id_values = np.asarray(ids)
    if claim.shape[0] != id_values.shape[0]:
        raise ValueError(
            f"got {claim.shape[0]} predictions for {id_values.shape[0]} ids"
        )

    sub = pd.DataFrame({'id': id_values, 'claim': claim})
    sub.to_csv(f"Submission_file_{filename}.csv", index=False)

In [None]:
# Score the test set with the trained model and write "Submission_file_Tensorflow.csv".
submission(model,"Tensorflow")