In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import datatable as dt # reads data faster than pandas

import gc #to manage ram 
import subprocess

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
#from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf

As the dataset is very large, we cannot load all of it into RAM (doing so raises a memory error). One option is to train a model on chunks of rows, saving the model after each chunk so that training can continue on the next one. Here, however, I simply take a subset of the rows (the first 750,000, via `nrows`) and use it to train and validate the model. This may not be the best way to do it (most probably it isn't (-_-)).

In [None]:
%%time
df_train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv', nrows = 750000)

print(df_train.shape)

In [None]:
# This method reads data faster and doesn't throw memory errors as frequently as pandas.

# %%time
# train_dt = dt.fread("../input/tabular-playground-series-oct-2021/train.csv").to_pandas()

# train_dt.head()

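We can reduce the memory footprint of the dataset with the function below, which downcasts each numeric column to the smallest dtype that covers its value range. Integer values are preserved exactly; float columns may lose some precision when they are downcast to float16/float32. Thanks to https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro  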

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that covers their range.

    Each int8..int64 / float16..float64 column is inspected for its min and max
    and re-cast to the smallest integer (or float) dtype whose representable
    range contains both. Non-numeric columns are left untouched.

    The frame is modified **in place** and also returned, so the caller's
    original reference and the return value are the same object.

    Note: integer values are preserved exactly, but casting floats to float16 or
    float32 keeps only the range, not the precision, of the original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    # Kept as a list of strings: numpy dtypes compare equal to their string names,
    # but do not hash like them, so a set lookup would not work here.
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column spanning exactly -128..127 fits in int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# reduce_memory_usage downcasts the columns of the frame it is given and returns that same
# frame, so `train` and `df_train` refer to one object after this call.
train = reduce_memory_usage(df_train, verbose=True)

In [None]:
# train.head()

The dataset doesn't contain any missing values and the train and test data feature distributions are similar.

In [None]:
# Remove the `df_train` name. The data itself stays alive through `train`, which was
# returned by reduce_memory_usage(df_train) and is the same frame.
del df_train
gc.collect()

We delete dataframes that are no longer needed to conserve RAM. `gc` is Python's garbage-collector module, and `gc.collect()` forces a collection pass.

In [None]:
# train.shape

In [None]:
# Class counts of the label, as text and as a bar chart.
target_counts = train['target'].value_counts()
print(target_counts)
sns.countplot(x='target', data=train);

The distribution of the target values is balanced.

In [None]:
# Row-wise summary statistics, computed over the raw feature columns f0..f284 only.
# Reducing over the whole frame would fold `id` and `target` into the features (label
# leakage, and `max` dominated by `id`), and each later statistic would also include
# the ones added before it. The test frame has neither `id` nor `target`, so that
# would make train and test features inconsistent.
feature_cols = ['f' + str(i) for i in range(285)]

# Upcast so the statistics are not computed/stored at the reduced precision of the
# downcast columns.
row_features = train[feature_cols].astype(np.float32)
train['std'] = row_features.std(axis=1)
train['min'] = row_features.min(axis=1)
train['max'] = row_features.max(axis=1)
del row_features

gc.collect()

In [None]:
# f0..f241 are treated as continuous, except f22 and f43, which are grouped with the
# categorical columns f242..f284. The engineered `std` is continuous; `min`/`max` are
# listed with the categorical columns.
continous_cols = [f'f{i}' for i in range(242) if i not in (22, 43)] + ['std']
categorical_cols = [f'f{i}' for i in range(242, 285)] + ['f22', 'f43', 'min', 'max']
cols = continous_cols + categorical_cols

gc.collect()

In [None]:
# Draw a random temporary sample of rows to get an idea of how the data is distributed.
# Seeding numpy's global RNG makes the draw repeatable.
np.random.seed(2110)
tmp_train = train.sample(n=10000)

In [None]:
# Plot only the first 60 continuous features to give an idea of their distributions.
# One 15x4 grid is created and each histogram is drawn on its own axes; the previous
# extra plt.figure() call produced a second, empty figure.
fig, axes = plt.subplots(15, 4, figsize=(20, 22))
for ax, feature in zip(axes.ravel(), continous_cols[:60]):
    sns.histplot(tmp_train[feature], kde=True, bins=100, label='train_'+feature, ax=ax)
    #sns.histplot(tmp_test[feature],color="orange", kde=True,bins=100, label='test_'+feature, ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
plt.show()

In [None]:
# The plotting sample is no longer needed; drop it and collect.
del tmp_train
gc.collect()

Try using other scalers.

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the continuous columns of `train` and overwrite them with the scaled
# values. The fitted `scaler` is reused later to transform the test features.
# NOTE(review): the scaler is fitted before the train/validation split, so validation rows
# contribute to the scaling statistics — confirm this is acceptable.
scaler = RobustScaler()
train[continous_cols] = scaler.fit_transform(train[continous_cols])

In [None]:
# Force a garbage-collection pass after the scaling step.
gc.collect()

In [None]:
# Separate the label from the model inputs; `id` is an identifier, not a feature.
Y = train['target']
X = train.drop(columns=['target', 'id'])

# 70/30 train/validation split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

del train
gc.collect()

In [None]:
# Append a trailing length-1 axis, giving the (n_features, 1) per-sample layout that the
# model's Input layer is declared with.
X_train_expanded = tf.expand_dims(x_train, axis=-1)
X_val_expanded = tf.expand_dims(x_val, axis=-1)

del x_train, x_val
gc.collect()

In [None]:
# Sequential network: three convolutional stages, a flatten, three dense stages and a
# single sigmoid output unit for binary classification.
layers = tf.keras.layers

model = tf.keras.models.Sequential()
model.add(layers.Input(shape=(X_train_expanded.shape[1], 1)))

# Each convolutional stage: Conv1D(first_filters) -> Conv1D(128) -> MaxPooling1D -> Dropout -> BatchNorm.
# All convolutions use kernel_size=1 and the pooling uses pool_size=1, strides=1.
for first_filters, drop_rate in ((128, 0.3), (256, 0.3), (512, 0.5)):
    model.add(layers.Conv1D(first_filters, kernel_size=1, activation='relu', padding='same'))
    model.add(layers.Conv1D(128, kernel_size=1, activation='relu', padding='same'))
    model.add(layers.MaxPooling1D(pool_size=1, strides=1))
    model.add(layers.Dropout(drop_rate))
    model.add(layers.BatchNormalization())

model.add(layers.Flatten())

# Three identical regularised dense stages, each followed by 50% dropout.
for _ in range(3):
    model.add(
        layers.Dense(
            512,
            activation='relu',
            activity_regularizer=tf.keras.regularizers.l2(0.00001),
            bias_regularizer=tf.keras.regularizers.l2(0.0001),
        )
    )
    model.add(layers.Dropout(0.5))

model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# AUC metric; the name 'aucroc' is the key under which it appears in the training history
# (as 'aucroc' and 'val_aucroc').
auc = tf.keras.metrics.AUC(name='aucroc')

# Adam is the optimizer in use; the commented-out lines are alternatives that were tried.
#optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001, rho=0.9, epsilon=1e-08, decay=0.0)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
#optimizer = tf.keras.optimizers.Nadam( learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08) 

# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', auc])

In [None]:
# Stop once validation loss has not improved for 5 epochs, restoring the best weights.
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.3 after 3 epochs without a val_loss improvement of 1e-4.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=3,
    verbose=1,
    min_delta=1e-4,
)

callbacks = [earlystopping, reduce_lr]

history = model.fit(
    x=X_train_expanded,
    y=y_train,
    batch_size=1024,
    shuffle=True,
    epochs=20,
    validation_data=(X_val_expanded, y_val),
    callbacks=callbacks,
)

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save('TPS_Oct_Model.h5')

In [None]:
# Training is finished; release the expanded training and validation tensors.
del X_train_expanded
del X_val_expanded
gc.collect()

In [None]:
# Per-epoch metric values recorded by model.fit; the keys are shown as the cell output.
history_dict = history.history
history_dict.keys()

In [None]:
# Training vs. validation loss per epoch.
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

fig, ax = plt.subplots()
ax.plot(loss_values, 'b', label='Training loss')
ax.plot(val_loss_values, color='orange', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
acc_values = history_dict['accuracy']
val_acc_values = history_dict['val_accuracy']

plt.plot(acc_values, 'b', label='Training accuracy')
plt.plot(val_acc_values, color = 'orange', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
# The y-axis shows accuracy, not loss.
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation ROC AUC per epoch.
# Stored under their own names so that `auc`, the Keras metric object passed to
# model.compile, is not overwritten with a list.
auc_values = history_dict['aucroc']
val_auc_values = history_dict['val_aucroc']

plt.plot(auc_values, 'b', label='aucroc')
plt.plot(val_auc_values, color = 'orange', label='val_aucroc')
plt.title('Training and validation aucroc')
plt.xlabel('Epochs')
plt.ylabel('aucroc')
plt.legend()
plt.show()

In [None]:
# Force a garbage-collection pass before loading the test data.
gc.collect()

In [None]:
# Load the full test set and the sample submission; the latter supplies the `id` column
# for the submission frame built later.
test_dt = pd.read_csv("../input/tabular-playground-series-oct-2021/test.csv")
sample = pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")

In [None]:
# Keep only the feature columns; `id` is not a model input.
test = test_dt.drop(columns=['id'])

gc.collect()

In [None]:
# Row-wise summary statistics over the raw feature columns f0..f284 only. Reducing over
# the whole frame would make `min` include the freshly added `std` column and `max`
# include both `std` and `min`.
feature_cols = ['f' + str(i) for i in range(285)]
row_features = test[feature_cols]
test['std'] = row_features.std(axis=1)
test['min'] = row_features.min(axis=1)
test['max'] = row_features.max(axis=1)
del row_features

In [None]:
# The raw test frame has been replaced by `test`; drop the original name and collect.
del test_dt
gc.collect()

In [None]:
# Apply the scaler fitted on the training data (transform only, no refit) to the same
# continuous columns of the test frame.
test[continous_cols] = scaler.transform(test[continous_cols])

In [None]:
# Append the trailing length-1 axis, as was done for the training and validation inputs.
test_extended = tf.expand_dims(test, axis=-1)

del test
gc.collect()

In [None]:
# Build the submission: ids from the sample file, predicted probabilities from the model.
# model.predict returns one column per output unit; flatten it to a 1-D target column.
predictions = model.predict(test_extended)
sub = pd.DataFrame({'id': sample['id'], 'target': predictions.ravel()}).set_index('id')

In [None]:
# Preview the first rows of the submission.
sub.head()

In [None]:
# Write the submission file; `id` is the index, so it is written as the first column.
sub.to_csv('submission.csv')