# Config

In [None]:
# Relative paths to the training and test CSV files read by the import cell.
TRAIN_DATA_PATH = "../input/tabular-playground-series-may-2022/train.csv"
TEST_DATA_PATH  = "../input/tabular-playground-series-may-2022/test.csv"
# Default `figsize` passed to the pandas plotting calls in barchart() and pie().
# NOTE(review): the casing looks like a typo for FIG_SIZE; the name is kept
# because other cells reference it exactly as spelled here.
fIG_SIZE = (8, 5)

# Import Data

In [None]:
import pandas as pd
# Never truncate long cell values when a DataFrame is rendered.
pd.set_option('display.max_colwidth', None)

# Read both CSV files; the generator yields them in the same order as the
# unpacking targets (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH))

# EDA

In [None]:
import numpy as np


def display_data_info(dict_df):
    '''
        Display a summary table with one column per dataframe.

        For every dataframe the table reports:
        number of rows and columns,
        number of columns containing at least one missing value,
        total number of missing values,
        number of duplicated rows,
        and the column names grouped into float / integer / everything else.

        Parameters
        ----------
        dict_df : dict
            Maps a display name to the pandas DataFrame to summarise.

        Returns
        -------
        None
            The table is rendered with `display`; nothing is returned, so the
            summary is not shown twice when the call is the last line of a cell.
    '''

    # Row labels of the summary table (one row per statistic)
    index = [
        'No. rows'          ,
        'No. cols'          ,
        'No. null cols'     ,
        'No. missing values',
        'No. duplicate rows',
        'Float columns'     ,
        'Int columns'       ,
        'Object columns'    ,
    ]

    # one list of statistics per dataframe name
    data = {}

    for name, df in dict_df.items():

        num_rows, num_cols = df.shape
        null_counts        = df.isna().sum()                # missing values per column
        num_null_cols      = int((null_counts > 0).sum())   # columns with >= 1 missing value
        num_missing        = int(null_counts.sum())         # missing values overall
        num_duplic_rows    = int(df.duplicated().sum())     # rows repeating an earlier row

        # Group columns by dtype family. The pandas dtype predicates are used so
        # that every float/integer width (float32, int32, ...) is recognised,
        # not only float64 / int64; anything else goes to the last bucket.
        float_cols, int_cols, obj_cols = [], [], []
        for col, dtype in df.dtypes.items():
            if pd.api.types.is_float_dtype(dtype):
                float_cols.append(col)
            elif pd.api.types.is_integer_dtype(dtype):
                int_cols.append(col)
            else:
                obj_cols.append(col)

        data[name] = [
            num_rows        ,
            num_cols        ,
            num_null_cols   ,
            num_missing     ,
            num_duplic_rows ,
            float_cols      ,
            int_cols        ,
            obj_cols        ,
        ]

    new_df = pd.DataFrame(data=data, index=index)

    display(new_df)

In [None]:
# Summarise shape, missing values, duplicates and column dtypes of both raw frames.
display_data_info({'train': train, 'test': test})

In [None]:
def barchart(df, column):
    '''Plot the ten most frequent values of `column` as a bar chart with count labels.'''
    top_counts = df[column].value_counts().head(10)
    axes = top_counts.plot.bar(figsize=fIG_SIZE)
    # write each bar's height (its count) on top of the bar
    bars = axes.containers[0]
    axes.bar_label(bars)
    display(axes)
    
def pie(df, column):
    '''Plot the ten most frequent values of `column` as a pie chart with percentage labels.'''
    top_counts = df[column].value_counts().head(10)
    axes = top_counts.plot(kind='pie', autopct='%1.1f%%', figsize=fIG_SIZE)
    display(axes)

In [None]:
# Bar chart of the most frequent values (at most ten) of the target column.
barchart(train, 'target')

In [None]:
# int columns: f_07 ... f_18 plus f_29 and f_30, generated in the same order
train.hist(
    column=[f'f_{idx:02d}' for idx in (*range(7, 19), 29, 30)],
    figsize=(20, 15),
    bins=25,
);

In [None]:
# float columns: f_00 ... f_06, f_19 ... f_26 and f_28, generated in the same order
train.hist(
    column=[f'f_{idx:02d}' for idx in (*range(0, 7), *range(19, 27), 28)],
    figsize=(20, 15),
    bins=25,
);

In [None]:
# object columns: bar chart of the ten most frequent f_27 strings
barchart(train, 'f_27')

# Preprocess

## Add Features

In [None]:
from collections import Counter


def add_feature(df):
    '''
        Add three ternary interaction columns to `df` in place.

        Each new column is +1 when the sum of the listed features exceeds an
        upper threshold, -1 when it falls below a lower threshold and 0
        otherwise. Columns added, in order: i_02_21, i_05_22, i_00_01_26.
        Returns None.
    '''
    def ternary(total, upper, lower):
        # (above upper) minus (below lower) -> values in {-1, 0, +1}
        return (total > upper).astype(int) - (total < lower).astype(int)

    df['i_02_21'] = ternary(df['f_21'] + df['f_02'], 5.2, -5.3)
    df['i_05_22'] = ternary(df['f_22'] + df['f_05'], 5.1, -5.4)
    df['i_00_01_26'] = ternary(df['f_00'] + df['f_01'] + df['f_26'], 5.0, -5.0)
    
    
def encode(df):
    '''
        Derive numeric features from the string column `f_27`, in place.

        Columns added (in this order):
        uniq_chars : number of distinct characters in the string
        prefix     : sum of the character codes of the first four characters
        suffix     : sum of the character codes of the last four characters
        A ... T    : how often each letter occurs in the string
        chA ... chT: alphabet position of the letter (A=0, B=1, ...) when the
                     letter occurs in the string, 0 otherwise

        Afterwards `add_feature(df)` is applied and the head of the frame is
        displayed. Returns None.

        Every column is computed over the whole column at once instead of
        row by row with `df.at`, so `Series.iteritems()` (removed in
        pandas 2.0) is no longer needed. Only the letters A-T are counted;
        other characters are ignored.
        NOTE(review): assumes `f_27` holds strings with no missing values.
    '''
    letters = [chr(code) for code in range(ord('A'), ord('T') + 1)]
    text = df['f_27']

    df['uniq_chars'] = text.apply(lambda w: len(set(w)))
    df['prefix']     = text.apply(lambda w: sum(map(ord, w[:4])))
    # the suffix sums the last four characters themselves (the earlier loop
    # re-used the final prefix character for every suffix position)
    df['suffix']     = text.apply(lambda w: sum(map(ord, w[-4:])))

    for letter in letters:
        occurrences = text.str.count(letter)
        df[letter] = occurrences
        # letter position, only where the letter is present
        df[f'ch{letter}'] = (occurrences > 0).astype(int) * (ord(letter) - ord('A'))

    add_feature(df)
    display(df.head())

In [None]:
# encode() adds its columns in place, so work on a copy and leave the raw
# `train` frame exactly as it was loaded.
relevent_train = train.copy()
encode(relevent_train)

In [None]:
# Remove the raw f_27 string column and the id column in a single call.
relevent_train = relevent_train.drop(columns=['f_27', 'id'])

In [None]:
# Show the first 20 rows of three string-derived features next to the target.
relevent_train[['uniq_chars', 'prefix', 'suffix', 'target']].head(20)

## Split X and Y

In [None]:
from sklearn.model_selection import train_test_split


# Separate the label column from the predictor columns.
X = relevent_train.drop(columns='target')
y = relevent_train['target']

# Shuffled 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

## Add row-wise statistic features

### X Train

In [None]:
# Turn off pandas' chained-assignment warning while columns are added to the split frames.
pd.options.mode.chained_assignment = None

# Every column whose name starts with a lowercase "f".
f = [name for name in X_train.columns.values if name[0] == "f"]

# Row-wise summary statistics over the columns listed in `f`.
X_train['abs_sum'] = X_train[f].abs().sum(axis=1)
X_train['median']  = X_train[f].median(axis=1)
X_train['std']     = X_train[f].std(axis=1)
X_train['mean']    = X_train[f].mean(axis=1)

X_train.head()

In [None]:
# Column labels of the training frame at this point in the notebook.
relevent_features = X_train.columns

### X Valid

In [None]:
# Same row-wise statistics as for the training frame, over the columns in `f`.
X_valid['abs_sum'] = X_valid[f].abs().sum(axis=1)
X_valid['median']  = X_valid[f].median(axis=1)
X_valid['std']     = X_valid[f].std(axis=1)
X_valid['mean']    = X_valid[f].mean(axis=1)

# Restore pandas' default chained-assignment warning level.
pd.options.mode.chained_assignment = 'warn'
X_valid.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Module-level scaler: fitted inside robust_scale() and reused for the test data later on.
scaler = StandardScaler()

def robust_scale(X_t, X_v):
    '''
        Fit the module-level `scaler` on X_t, then transform X_t and X_v with it.

        NOTE: despite the name, the scaler used is a StandardScaler.
        Returns a pair (scaled X_t, scaled X_v); each is a new DataFrame built
        from the transformed array, so row and column labels are the default
        integer ranges.
    '''
    train_scaled = scaler.fit_transform(X_t)
    valid_scaled = scaler.transform(X_v)
    return pd.DataFrame(train_scaled), pd.DataFrame(valid_scaled)

X_train_st, X_val_st = robust_scale(X_train, X_valid)

In [None]:
# Summary of the scaled frames; note that the 'test' key holds the validation split here.
display_data_info({'train': X_train_st, 'test': X_val_st})

# Neural Network

## Model

In [None]:
import torch
import torch.nn as nn


class Network(nn.Module):
    '''
        Fully connected binary classifier.

        Hidden widths are 512, 256, 128, 128 and 64, each followed by SiLU;
        Dropout(0.3) follows the 512-, second 128- and 64-wide layers. The
        output layer is a single unit passed through a sigmoid, so `forward`
        returns one value in (0, 1) per input row as a flat 1-D tensor.
    '''

    def __init__(self, in_feature):
        super().__init__()

        # (output width, dropout after activation?) for each hidden layer
        hidden_spec = ((512, True), (256, False), (128, False), (128, True), (64, True))

        # Modules are created in forward order so the positions inside
        # `self.main` (and therefore the state_dict keys) stay the same.
        layers = []
        width = in_feature
        for out_width, with_dropout in hidden_spec:
            layers.append(nn.Linear(width, out_width))
            layers.append(nn.SiLU())
            if with_dropout:
                layers.append(nn.Dropout(0.3))
            width = out_width

        layers.append(nn.Linear(width, 1))
        layers.append(nn.Sigmoid())

        self.main = nn.Sequential(*layers)

    def forward(self, x):
        # flatten the (batch, 1) output to shape (batch,)
        return self.main(x).view(-1)

## Config

In [None]:
from torch import optim

# Train on the GPU when one is available.
device         = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
in_feature     = X_train.shape[1]

model          = Network(in_feature)
model.to(device)

epochs         = 100
batch_size     = 1024

# The network ends in a sigmoid, so plain binary cross-entropy on probabilities is used.
loss_function  = nn.BCELoss()

max_learning_rate = 0.01
optimizer      = optim.Adam(model.parameters(), lr=max_learning_rate, weight_decay=1e-6)

# OneCycleLR plans `epochs * steps_per_epoch` scheduler steps in total and
# expects one scheduler step per optimizer step. steps_per_epoch is therefore
# the number of mini-batches per epoch (ceil division: the last, smaller batch
# counts as a step too), not the number of training rows.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
scheduler      = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr = max_learning_rate,
                                              epochs = epochs,
                                              steps_per_epoch = steps_per_epoch,
                                              pct_start = 0.01,
                                              anneal_strategy = "cos")

## Dataset and DataLoaders

In [None]:
from sklearn.preprocessing import StandardScaler

class TPS2022(torch.utils.data.Dataset):
    '''
        In-memory dataset pairing each feature row with its label.

        X and Y are pandas objects; their `.values` arrays are copied into
        float32 tensors once at construction. Indexing returns the tuple
        (features, label) for the requested position.
    '''

    def __init__(self, X, Y):
        features, labels = X.values, Y.values
        self.X = torch.tensor(features, dtype=torch.float32)
        self.Y = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # number of samples = length of the label tensor's first dimension
        return self.Y.shape[0]

    def __getitem__(self, idx):
        sample = (self.X[idx], self.Y[idx])
        return sample

In [None]:
from torch.utils.data import DataLoader

# Wrap the scaled feature frames and their labels as torch datasets.
train_dataset, valid_dataset = TPS2022(X_train_st, y_train), TPS2022(X_val_st, y_valid)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=batch_size)

## Trainer

In [None]:
# Per-epoch mean losses, kept for the loss-curve plot.
train_losses, valid_losses = [], []
# float('inf') instead of np.Inf, which was removed in NumPy 2.0.
valid_loss_min = float('inf')

for i in range(epochs):
    # ---- training pass -------------------------------------------------
    tot_train_loss = 0.0
    model.train()

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        pred = model(x)
        loss = loss_function(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # OneCycleLR is a per-batch schedule: advance it after every optimizer step.
        scheduler.step()

        # `loss` is the mean over the batch; weight it by the batch size so
        # that dividing by the dataset size below gives the per-sample mean.
        tot_train_loss += loss.item() * y.size(0)

    # ---- validation pass -----------------------------------------------
    tot_valid_loss = 0.0
    model.eval()
    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            loss = loss_function(model(x), y)
            tot_valid_loss += loss.item() * y.size(0)

    # The training loss is computed from the training total (it was
    # previously derived from the validation total).
    train_loss = tot_train_loss / len(train_loader.dataset)
    valid_loss = tot_valid_loss / len(valid_loader.dataset)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print("Epoch: {}/{}.. ".format(i+1, epochs),
          "Training Loss: {:.6f}.. ".format(train_loss),
          "Test Loss: {:.6f}.. ".format(valid_loss),
          )

    # Checkpoint whenever the validation loss matches or beats the best so far.
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

# Results

In [None]:
import matplotlib.pyplot as plt

# Loss per epoch for both splits. The `label=` texts only appear once
# legend() is called, so the legend and axis labels are added explicitly.
plt.plot(train_losses, label='Train Loss')
plt.plot(valid_losses, label='Valid Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend();

# Predict on test

## Load best model

In [None]:
model = Network(in_feature)
# map_location lets a checkpoint written on a GPU be restored on whatever
# device this session runs on.
model.load_state_dict(torch.load('model.pt', map_location=device))

model.to(device)
# A freshly constructed module is in training mode; switch to evaluation
# mode so the Dropout layers are disabled for inference.
model.eval();

## Preprocess test dataset

In [None]:
# Drop the id column by name (as is done for the training frame). drop()
# returns a new frame, so encode() can add columns to it without writing
# into a slice of `test`.
relevent_test = test.drop(columns='id')

In [None]:
# Apply the same string-feature encoding as for the training data; this adds
# the columns to relevent_test in place and displays its first rows.
encode(relevent_test)

In [None]:
# The raw f_27 string column is no longer needed once it has been encoded.
relevent_test = relevent_test.drop(columns='f_27')

# Same row-wise statistics as for the training and validation frames, over the columns in `f`.
relevent_test['abs_sum'] = relevent_test[f].abs().sum(axis=1)
relevent_test['median']  = relevent_test[f].median(axis=1)
relevent_test['std']     = relevent_test[f].std(axis=1)
relevent_test['mean']    = relevent_test[f].mean(axis=1)

In [None]:
# Training column labels, shown for comparison with the test columns in the next cell.
relevent_features

In [None]:
# Test column labels after feature engineering.
relevent_test.columns

In [None]:
# Reuse the scaler fitted on the training split (it must not be refitted on
# test data). Columns are selected in the training order so every feature
# lines up with the statistics the scaler learned; a column missing from the
# test frame raises a KeyError here instead of being scaled with the wrong
# parameters.
X_test_st = pd.DataFrame(scaler.transform(relevent_test[relevent_features]))

In [None]:
# Summary of the scaled test frame.
display_data_info({'test': X_test_st})

In [None]:
class TestDataset(torch.utils.data.Dataset):
    '''
        In-memory, label-free dataset for inference.

        X is a pandas object; its `.values` array is copied into a float32
        tensor once at construction. Indexing returns the feature row only.
    '''

    def __init__(self, X):
        features = X.values
        self.X = torch.tensor(features, dtype=torch.float32)

    def __len__(self):
        # number of rows in the feature tensor
        return self.X.shape[0]

    def __getitem__(self, idx):
        row = self.X[idx]
        return row
    
    
# Inference loader: no shuffling, so batches are yielded in row order.
testset = TestDataset(X_test_st)
test_loader = DataLoader(dataset=testset, shuffle=False, num_workers=0, batch_size=1024)

## Predict

In [None]:
# Evaluation mode disables the Dropout layers; without it the predictions
# would be computed with random units dropped.
model.eval()

# Collect per-batch outputs and join them once at the end (repeated
# np.append copies the whole array on every batch).
batch_preds = []
with torch.no_grad():
    for x in test_loader:
        batch_preds.append(model(x.to(device)).cpu().numpy())

# Sigmoid outputs of the network, one per test row, in loader order.
# Stored as float64, which is what the previous np.append version produced.
if batch_preds:
    preds_class = np.concatenate(batch_preds).astype(np.float64)
else:
    preds_class = np.array([], dtype=np.float64)

# Submit

In [None]:
# Submission table: the test ids as index, the model outputs as `target`.
df = pd.DataFrame({'id': test.id, 'target': preds_class}).set_index('id')

# Written next to the notebook; the index (id) becomes the first CSV column.
df.to_csv('submission.csv')

In [None]:
# Preview the first rows of the submission frame.
df.head()