In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install git+https://github.com/PyTorchLightning/lightning-flash
!pip install 'git+https://github.com/PyTorchLightning/lightning-flash.git#egg=lightning-flash[tabular]'
!pip install imblearn
!pip install https://github.com/PyTorchLightning/metrics/archive/refs/heads/master.zip

In [None]:
from flash.tabular.classification import TabularClassifier, TabularClassificationData
from flash import Trainer
from flash.core.classification import LabelsOutput

import torchmetrics

import torch
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from pytorch_lightning.utilities.seed import seed_everything

The first thing we will do is read both the train and test CSV files. However, since we do not have the actual answers for the test file, we will create our own training, validation and test datasets from the training set that is provided.

In [None]:
# Seed the random number generators up front so the run is reproducible.
seed_everything(seed=42)

# Plain string literals: the paths contain no placeholders, so no f-prefix is needed.
train_file = "/kaggle/input/santander-customer-transaction-prediction/train.csv"
test_file = "/kaggle/input/santander-customer-transaction-prediction/test.csv"
df_train = pd.read_csv(train_file)
df_predict = pd.read_csv(test_file)

# The row identifier is not a feature, so it is removed from the training frame.
df_train = df_train.drop(columns='ID_code')

df_train.head()

In [None]:
# Show (rows, columns) for the training frame first, then the prediction frame.
for frame in (df_train, df_predict):
    print(frame.shape)

Using `.describe()` we can check the summary statistics of every column in the training dataframe.

In [None]:
# Display summary statistics for the columns of the training frame
# (left as the last expression so the notebook renders the resulting table).
df_train.describe()

We can verify that there are no NaN values in the training dataframe (the check below returns `False` when nothing is missing).

In [None]:
# True if at least one cell of the training frame is null, False otherwise.
df_train.isnull().values.any()

Now we split the data into three dataframes: training, validation and test.

In [None]:
# Hold out 20% of the labelled rows, then divide that hold-out into validation
# (80% of it) and test (20% of it).
# - `stratify` keeps the proportion of each target class the same in every split,
#   which matters because the target is heavily imbalanced.
# - An explicit `random_state` makes the cell give the same split when re-run.
# The training split is deliberately NOT replaced by the full frame afterwards:
# doing so would put the validation and test rows into the training data and
# make the validation/test metrics meaningless.
train, rem = train_test_split(
    df_train, test_size=0.2, random_state=42, stratify=df_train['target']
)
validation, test = train_test_split(
    rem, test_size=0.2, random_state=42, stratify=rem['target']
)

# Sanity check: the three splits partition the labelled data.
assert len(train) + len(validation) + len(test) == len(df_train)

The training data is very unbalanced: most of the targets are 0 rather than 1. For this reason we will oversample the minority class with synthetic samples generated by SMOTE.

In [None]:
# Oversample the training data with SMOTE, then rebuild a single frame that has
# the target column first followed by the feature columns.
sm = SMOTE(sampling_strategy='auto', random_state=42)

train_features = train.drop(columns='target')
train_labels = train['target']
oversampled_X, oversampled_Y = sm.fit_resample(train_features, train_labels)

df_upsampled = pd.concat(
    [pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)],
    axis='columns',
)

Let us start working with some Lightning Flash code.

In [None]:
# The 200 numerical feature columns are named var_0 ... var_199.
feature_columns = [f"var_{idx}" for idx in range(200)]

# Bundle the oversampled training frame, the validation/test splits and the
# frame to predict on into a single Flash datamodule.
datamodule = TabularClassificationData.from_data_frame(
    target_fields="target",
    numerical_fields=feature_columns,
    train_data_frame=df_upsampled,
    val_data_frame=validation,
    test_data_frame=test,
    predict_data_frame=df_predict,
    batch_size=256,
)

We can check both the available schedulers and the available optimizers 

In [None]:
# Print each registry under its label; the lookups run in the same order as
# the printed lines (schedulers first, then optimizers).
registry_listers = {
    'lr_schedulers': TabularClassifier.available_lr_schedulers,
    'optimizers': TabularClassifier.available_optimizers,
}
for label, lister in registry_listers.items():
    print(label, lister())

We will use the `ExponentialLR` learning-rate scheduler and the `adamw` optimizer. Notice also the use of `AUROC` as the metric, which is different from
```
torchmetrics.functional.auroc()
```
in that it is a module (class-based) metric rather than a functional one.

In [None]:
# Metric and scheduler are spelled out first so the constructor call stays short.
auroc_metric = torchmetrics.AUROC(num_classes=datamodule.num_classes)
exponential_decay = ("ExponentialLR", {"gamma": 0.95})

model = TabularClassifier.from_data(
    datamodule,
    optimizer='adamw',
    lr_scheduler=exponential_decay,
    metrics=[auroc_metric],
)

# Attach a labels output to the model.
model.output = LabelsOutput()

In [None]:
# Train for at most 10 epochs, using every CUDA device torch reports (0 if none).
trainer = Trainer(max_epochs = 10, gpus=torch.cuda.device_count())

# Optional learning-rate search, currently disabled. It is kept as real comments
# (not as a bare string literal, which is an evaluated expression) so it is
# clearly inert; uncomment to run it.
# res = trainer.tuner.lr_find(model, datamodule=datamodule, min_lr=1e-5)
# print(f"Suggested learning rate: {res.suggestion()}")
# res.plot(show=True, suggest=True).show()

# Fixed learning rate used in place of res.suggestion().
model.learning_rate = 5e-3

In [None]:
# Fit the model using the data provided by `datamodule`.
trainer.fit(model, datamodule=datamodule)

Now we can validate and test the learned model

In [None]:
# Run validation; `datamodule` was built with `validation` as its validation frame.
trainer.validate(model, datamodule=datamodule)

In [None]:
# Run the test loop; `datamodule` was built with `test` as its test frame.
trainer.test(model, datamodule = datamodule)

Finally, we create the predictions

In [None]:
# Predict on the frame registered as `predict_data_frame` in `datamodule`.
# NOTE(review): `predictions` is reassigned further down by a second predict call
# on `predict_datamodule`, so this result is not the one written to the
# submission — confirm whether this call is still needed.
predictions = trainer.predict(model, datamodule=datamodule)

In [None]:
# Prediction-only datamodule: it reuses the parameters of the training
# datamodule and sets the batch size to the number of rows in df_predict.
n_predict_rows = df_predict.shape[0]
predict_datamodule = TabularClassificationData.from_data_frame(
    predict_data_frame=df_predict,
    numerical_fields=list(map("var_{}".format, range(200))),
    parameters=datamodule.parameters,
    batch_size=n_predict_rows,
)

In [None]:
# Predict again, this time through the prediction-only datamodule; this
# overwrites any earlier value of `predictions`.
predictions = trainer.predict(model, datamodule=predict_datamodule)

In [None]:
# `predictions` is indexed per batch. Concatenate every batch rather than
# reading only the first one, so the result does not depend on the whole frame
# having been predicted in a single batch.
# NOTE(review): assumes each batch is an iterable of per-row labels — confirm
# against the output type configured on the model.
flat_predictions = [label for batch in predictions for label in batch]
assert len(flat_predictions) == len(df_predict), (
    "number of predictions does not match the number of rows to predict"
)

df_predict["target"] = flat_predictions
# Row identifiers of the form test_0, test_1, ... in row order.
id_code = ['test_'+str(i) for i in range(len(flat_predictions))]
df_predict['ID_code'] = id_code

In [None]:
# Preview the first rows of the prediction frame.
df_predict.head()

In [None]:
# Write only the identifier and predicted target, in that order, without the index.
submission_columns = ['ID_code', 'target']
df_predict[submission_columns].to_csv('submission.csv', index=False)