In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import datatable as dt # reads data faster than pandas

import gc #to manage ram 
import subprocess

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf

In [None]:
%%time
# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')

# Show (rows, columns) for each table, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Display pandas' per-column summary statistics for the training table.
train.describe()

In [None]:
# Display pandas' per-column summary statistics for the test table.
test.describe()

In [None]:
# Total number of NaN cells in each table (summed over columns, then over the column totals).
n_missing_train = train.isna().sum().sum()
n_missing_test = test.isna().sum().sum()

print(f'Number of missing values in training data: {n_missing_train}')
print(f'Number of missing values in testing data: {n_missing_test}')

In [None]:
# Model inputs: every training column except the row identifier and the label.
non_feature_cols = ('id', 'target')
Features = [name for name in train.columns if name not in non_feature_cols]

In [None]:
# Split the features into low-cardinality ("categorical") and high-cardinality
# ("continuous") groups, judged on the train and test rows stacked together.
CARDINALITY_THRESHOLD = 25  # fewer distinct values than this => treated as categorical

df = pd.concat([train[Features], test[Features]], axis=0)
# Count distinct values once per column; both lists below reuse these counts
# instead of re-scanning every column twice.
unique_counts = df.nunique()
del df  # the stacked copy is only needed for the counts

cat_features = [col for col in Features if unique_counts[col] < CARDINALITY_THRESHOLD]
cont_features = [col for col in Features if unique_counts[col] >= CARDINALITY_THRESHOLD]

print(f'Total number of features: {len(Features)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuous features: {len(cont_features)}')

# Share of each group among all features.
plt.pie([len(cat_features), len(cont_features)],
        labels=['Categorical', 'Continuous'],
        colors=['#76D7C4', '#F5B7B1'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()

In [None]:
# Remove the row identifier so it is never fed to the model.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run finds no 'id' column and leaves the frames unchanged
# rather than raising KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
# Class balance of the label: print the counts, then draw them as bars.
target_counts = train['target'].value_counts()
print(target_counts)
sns.countplot(x='target', data=train);

In [None]:
# Take a random temporary sample of each table to get an idea of how the data is
# distributed without plotting every row.
# sample() is called without random_state, so the NumPy global seed set here is what
# this cell relies on for repeatable draws.
SAMPLE_ROWS = 10000

np.random.seed(2110)
tmp_train = train.sample(n=SAMPLE_ROWS)
tmp_test = test.sample(n=SAMPLE_ROWS)

In [None]:
print("Feature distribution of features: ")

# Overlay the train and test density of every feature on a grid of small panels.
ncols = 5
# Ceiling division: size the grid from the feature list instead of assuming exactly
# 100 features, so a shorter list does not raise IndexError and a longer one is not cut off.
nrows = max(1, -(-len(Features) // ncols))

# 2.5 inches of height per row (50 for the 20 rows of a 100-feature list).
# squeeze=False keeps `axes` two-dimensional even when there is only one row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 2.5 * nrows), facecolor='#EAEAF2', squeeze=False)

for idx, col in enumerate(Features):
    r, c = divmod(idx, ncols)
    panel = axes[r, c]
    sns.kdeplot(x=tmp_train[col], ax=panel, label='Train data')
    sns.kdeplot(x=tmp_test[col], ax=panel, color="orange", label='Test data')
    panel.set_ylabel('')
    panel.set_xlabel(col, fontsize=8, fontweight='bold')
    panel.tick_params(labelsize=5, width=0.5)
    panel.xaxis.offsetText.set_fontsize(4)
    panel.yaxis.offsetText.set_fontsize(4)

# Hide the panels left over when the feature count is not a multiple of ncols.
for panel in axes.ravel()[len(Features):]:
    panel.set_visible(False)
plt.show()

# The samples were only needed for these plots.
del tmp_train
del tmp_test
gc.collect()

In [None]:
# Pairwise correlation between every feature and the target.
corr = train[Features+['target']].corr()

# Mask the upper triangle (diagonal included); it mirrors the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 20))
# Draw explicitly on the axes created above rather than on whichever axes is current.
sns.heatmap(corr, mask=mask, cmap='tab20c', linewidth=0.1, ax=ax)
# Style the tick labels after plotting: heatmap installs its own ticks and labels,
# so labels (and font sizes) set before the call do not survive it.
ax.tick_params(axis='both', labelsize=12)
ax.set_title('Correlation Map', color='blue', fontsize=12)
plt.show()

In [None]:
# Separate the label from the inputs: keep it as `y`, leave only predictors in `train`.
y = train['target']
train = train.drop(columns=['target'])

gc.collect()

In [None]:
# Names of the per-row summary columns; each is also the DataFrame method that computes it.
ROW_STATS = ['mean', 'std', 'min', 'max']


def add_row_stats(frame, columns):
    """Add one column per entry of ROW_STATS to `frame`, in place.

    Each new column holds that statistic computed across `columns` for every row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to extend; modified in place.
    columns : list of str
        Columns the statistics are computed over.
    """
    base = frame[columns]  # select once so all four statistics see the same inputs
    for stat in ROW_STATS:
        frame[stat] = getattr(base, stat)(axis=1)


# Exclude the derived columns from their own inputs, so re-running this cell does not
# compute statistics of statistics.
base_features = [col for col in Features if col not in ROW_STATS]

add_row_stats(train, base_features)
add_row_stats(test, base_features)

# Register the new columns as model features, without duplicating them on a re-run.
Features.extend(stat for stat in ROW_STATS if stat not in Features)

gc.collect()

In [None]:
# (rows, columns) of each table, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply that same fitted mapping to
# both tables so train and test share one scale.
scaler = MinMaxScaler().fit(train[Features])
train[Features] = scaler.transform(train[Features])
test[Features] = scaler.transform(test[Features])

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed random_state makes the
# partition repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=2021,
)

# The unsplit frame is no longer referenced below.
del train
gc.collect()

In [None]:
# Append a trailing axis of size 1 to each split, matching the (n_columns, 1) input
# shape declared for the model.
X_train_expanded = tf.expand_dims(x_train, axis=-1)
X_val_expanded = tf.expand_dims(x_val, axis=-1)

# The DataFrame splits are not used again.
del x_train, x_val
gc.collect()

In [None]:
# Convolutional binary classifier over the feature columns: every column is one
# position along the sequence axis, with a single channel.
layers = tf.keras.layers
l2 = tf.keras.regularizers.l2

model = tf.keras.models.Sequential()
model.add(layers.Input(shape=(X_train_expanded.shape[1], 1)))

# Two convolutional stages that differ only in the width of their first convolution.
# (A third 256/128 stage is currently disabled and therefore not built.)
for first_filters in (128, 256):
    model.add(layers.Conv1D(first_filters, kernel_size=1, activation='relu', padding='same'))
    model.add(layers.Conv1D(128, kernel_size=1, activation='relu', padding='same'))
    model.add(layers.MaxPooling1D(pool_size=1, strides=1))
    model.add(layers.Dropout(0.5))
    model.add(layers.BatchNormalization())

model.add(layers.Flatten())

# Regularised dense head followed by a single sigmoid output unit.
model.add(
    layers.Dense(
        128,
        activation='relu',
        activity_regularizer=l2(0.00001),
        bias_regularizer=l2(0.0001),
    )
)
model.add(layers.Dropout(0.5))

model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# ROC-AUC metric; it is reported under the key 'aucroc' ('val_aucroc' on validation data).
auc = tf.keras.metrics.AUC(name='aucroc')

# Alternative optimizer kept for reference:
#optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001, rho=0.9, epsilon=1e-08, decay=0.0)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', auc],
)

In [None]:
# Stop when validation loss has not improved for 5 epochs and roll back to the best weights.
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.3 after 3 epochs without a validation-loss
# improvement of at least 1e-4.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=3,
    verbose=1,
    min_delta=1e-4,
)

callbacks = [earlystopping, reduce_lr]

In [None]:
# Train for at most 25 epochs (the callbacks may end training earlier), evaluating on
# the held-out validation split after every epoch.
history = model.fit(
    x=X_train_expanded,
    y=y_train,
    batch_size=128,
    shuffle=True,
    epochs=25,
    validation_data=(X_val_expanded, y_val),
    callbacks=callbacks,
)

In [None]:
# Per-epoch values recorded during fit(), keyed by metric name.
history_dict = history.history
# Show which series are available for the plots below.
history_dict.keys()

In [None]:
# Training is finished; drop the expanded train/validation tensors to free memory.
del X_train_expanded, X_val_expanded
gc.collect()

In [None]:
# Per-epoch loss on the training and validation data.
loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']

plt.plot(loss_values, 'b', label='Training loss')
plt.plot(val_loss_values, color='orange', label='Validation loss')
plt.gca().set(title='Training and validation loss', xlabel='Epochs', ylabel='Loss')
plt.legend()
plt.show()

In [None]:
# Per-epoch accuracy on the training and validation data.
acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

plt.plot(acc_values, 'b', label='Training accuracy')
plt.plot(val_acc_values, color = 'orange', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
# This figure plots accuracy, so the y-axis is labelled accordingly (not 'Loss').
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Per-epoch ROC-AUC on the training and validation data.
# Stored under *_values names so the list does not overwrite `auc`, the Keras metric
# object that the compiled model was given.
auc_values = history_dict['aucroc']
val_auc_values = history_dict['val_aucroc']

plt.plot(auc_values, 'b', label='aucroc')
plt.plot(val_auc_values, color = 'orange', label='val_aucroc')
plt.title('Training and validation aucroc')
plt.xlabel('Epochs')
plt.ylabel('aucroc')
plt.legend()
plt.show()

In [None]:
# Load the sample submission file; its 'id' column is reused when building the submission.
sample = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Give the test table the same trailing size-1 axis as the training tensors.
test_extended = tf.expand_dims(test, axis=-1)

# The DataFrame form of the test set is not used again.
del test
gc.collect()

In [None]:
# Pair each id from the sample submission with the model's sigmoid output for that row.
# NOTE(review): assumes sample_submission rows are in the same order as test.csv — confirm.
# The model has a single output unit; ravel() flattens that one column to 1-D.
sub = pd.DataFrame({
    'id': sample['id'],
    'target': model.predict(test_extended).ravel(),
}).set_index('id')
sub.head()

In [None]:
# Write the predictions to submission.csv; 'id' was set as the index above.
sub.to_csv('submission.csv')