In [None]:
!pip install tab2img

# Introduction

So I am going to change the tabular data here into images and then apply a CNN to the result, which is quite a novel approach to the problem. I have seen this used successfully in a commercial project and I do believe that in general this approach can be effective, particularly as an additional means of analysing tabular data when also employing other methods. I saw @remekkinas's great notebook [here](https://www.kaggle.com/remekkinas/bacteria-image-conv2d-cv-grad-cam) and was inspired, but I wanted to try it with my usual approach.

In [None]:
import numpy as np 
import pandas as pd
from PIL import Image
from dateutil.parser import parse
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import optim
import torch.nn as nn

# Load Data

We will load the tabular data. Process it and transform the data to images.

In [None]:
# Read the train and test tables of the competition from the Kaggle input folder.
df, test_df = map(
    pd.read_csv,
    [
        "../input/tabular-playground-series-feb-2022/train.csv",
        "../input/tabular-playground-series-feb-2022/test.csv",
    ],
)

Change the target column to categorical numbers. This is necessary for training a neural network.

In [None]:
# Encode every target name as an integer category code.
df["target_code"] = pd.Categorical(df["target"]).codes

# Keep one row per (name, code) pair so predictions can be mapped back to names later.
saved_link_df = df[["target", "target_code"]].drop_duplicates()
saved_link_df

Let's check the class balance

In [None]:
# Set the x tick-label rotation first, then draw one bar per target value
# showing how many rows carry it. The order of the two calls is kept as-is.
plt.xticks(rotation = -80) 
sns.countplot(x=df["target"])

Split X and y

In [None]:
# Labels: the integer-encoded target.
y = df.loc[:, "target_code"]

# Features: everything except the target columns and the row identifier.
X = df.drop(labels=["target", "target_code", "row_id"], axis="columns")
X_test = test_df.drop(labels=["row_id"], axis="columns")

Check the column order. If the columns aren't in the same order in test and train, we will get a terrible result, as the images will be different.

In [None]:
# Compare the feature frames that are actually turned into images (X and
# X_test), position by position. `bad_cols` collects the train column names
# whose counterpart at the same position in the test frame differs.
train_cols = list(X.columns)
test_cols = list(X_test.columns)
bad_cols = [col for col, col2 in zip(train_cols, test_cols) if col != col2]

if len(train_cols) != len(test_cols):
    # zip() stops at the shorter frame, so a length mismatch must be reported separately.
    print("Column count differs: train has {}, test has {}".format(len(train_cols), len(test_cols)))
elif len(bad_cols) > 0:
    print("Columns are NOT in order:", bad_cols)
else:
    print("Columns are in order :)")

Separate training and validation sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Share of the rows kept for training; the remainder becomes the validation set.
train_ratio = 0.90

# Stratify on y so both splits keep the class proportions; fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=0,
    stratify=y,
)

Let's create polynomial features so that the images will be much larger

In [None]:
# Put the training features and the encoded target side by side.
temp_train_df = pd.concat((X_train, y_train), axis="columns")

# Column names ordered by their correlation with target_code, highest first.
train_corr = (
    temp_train_df
    .corr()
    .sort_values(by="target_code", ascending=False)
    .index
)

In [None]:
# Positions 1..10 of the ranking (position 0 is skipped), stored as a plain
# Series of column names with a fresh 0..9 index.
train_series = pd.Series(train_corr.to_series().iloc[1:11].values)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

def add_poly_features(X_temp, X_to_concat, poly, columns=None):
    """Append polynomial features of selected columns to a feature frame.

    Args:
        X_temp: DataFrame holding the raw columns the features are built from.
        X_to_concat: DataFrame the new features are appended to (column-wise).
            It must have the same number of rows, in the same order, as X_temp.
        poly: fitted transformer exposing ``transform``; it is applied to
            ``X_temp[columns]``.
        columns: column labels to feed to ``poly``. When omitted, the
            module-level ``train_series`` is used, which keeps existing
            three-argument calls working.

    Returns:
        A new DataFrame with the columns of ``X_to_concat`` followed by the
        transformed features, a fresh 0..n-1 row index and integer column labels.
    """
    if columns is None:
        columns = train_series
    polyfeatures = pd.DataFrame(data=poly.transform(X_temp[columns].copy()))
    # Drop the old row index so both frames line up on 0..n-1; otherwise
    # concat would align on mismatching labels and introduce NaN rows.
    base = X_to_concat.reset_index(drop=True)
    return pd.concat([base, polyfeatures], axis=1, ignore_index=True)

# Fit the interaction-only expansion on the training rows of the selected columns.
temp_X_train = X_train[train_series].copy()
poly = PolynomialFeatures(interaction_only=True).fit(temp_X_train)

# Apply the same fitted expansion to every split and append it to the raw features.
X_train_concat, X_val_concat, X_test_concat = (
    add_poly_features(frame.copy(), frame.copy(), poly)
    for frame in (X_train, X_val, X_test)
)

Do the next lot of polynomial features. This is broken up so that we don't use all the RAM of the Kaggle instance.

In [None]:
# Walk the correlation ranking in batches of ten columns (positions 11..90),
# fitting a separate interaction-only expansion for each batch and appending
# its output to the frames built so far.
ranked_columns = train_corr.to_series()
for start in range(11, 91, 10):
    # add_poly_features reads this module-level selection.
    train_series = pd.Series(ranked_columns.iloc[start:start + 10].values)
    temp_X_train = X_train[train_series].copy()
    poly = PolynomialFeatures(interaction_only=True).fit(temp_X_train)
    X_train_concat = add_poly_features(X_train.copy(), X_train_concat, poly)
    X_val_concat = add_poly_features(X_val.copy(), X_val_concat, poly)
    X_test_concat = add_poly_features(X_test.copy(), X_test_concat, poly)

Scale X (the scaler is fit on the training split only; y is left unchanged)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Learn the min/max of each column from the training split only ...
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train_concat)

# ... and apply that same scaling to all three splits.
X_scaled_train, X_scaled_val, X_scaled_test = (
    scaler.transform(frame)
    for frame in (X_train_concat, X_val_concat, X_test_concat)
)

Convert tabular data to images

In [None]:
from tab2img.converter import Tab2Img
# Fit the converter on the training split (features and labels), then reuse
# the fitted converter for the validation and test splits.
model = Tab2Img()
train_images = model.fit_transform(X_scaled_train, y_train.to_numpy())
val_images, test_images = map(model.transform, (X_scaled_val, X_scaled_test))

Let's visualize the images

In [None]:
def show_image_grid(images, n_rows=2, n_cols=5):
    """Plot the first ``n_rows * n_cols`` images of ``images`` in a grid.

    Args:
        images: indexable collection of square images; each entry may be
            2-D already or flat, as long as its size is a perfect square.
        n_rows: number of grid rows.
        n_cols: number of grid columns.

    The side length is derived from each image's size instead of being
    hard-coded, so the helper keeps working if the feature count changes.
    Images are placed column by column (index i goes to row ``i % n_rows``,
    column ``i // n_rows``).
    """
    # squeeze=False keeps `ax` two-dimensional even for a single row or column.
    fig, ax = plt.subplots(n_rows, n_cols, squeeze=False)
    for i in range(n_rows * n_cols):
        pixels = np.asarray(images[i])
        side = int(round(np.sqrt(pixels.size)))
        nparray = pixels.reshape(side, side)
        image = Image.fromarray(nparray * 255)
        ax[i % n_rows][i // n_rows].imshow(image)
    fig.show()


show_image_grid(test_images)
show_image_grid(train_images)

# Create the Custom Dataset Class

We need this to be able to load the image and label into the model we will create. So we will create a custom dataset to handle this

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Dataset pairing each image in ``X`` with the label at the same position in ``y``.

  Args:
    X: indexable collection of images (one entry per sample).
    y: indexable collection of labels, same length and order as ``X``.
    BatchSize: batch size remembered on the dataset; used by
      ``num_of_batches`` and read by callers when building a DataLoader.
    transform: callable applied to every image before it is returned.
  """

  def __init__(self, X, y, BatchSize, transform):
    super().__init__()
    self.BatchSize = BatchSize
    self.y = y
    self.X = X
    self.transform = transform

  def num_of_batches(self):
    """
    Return the number of complete batches (a trailing partial batch is not counted).
    """
    # Integer division on the sample count; no extra import is required.
    return len(self.X) // self.BatchSize

  def __getitem__(self, idx):
    """Return ``(transformed image, label tensor)`` for sample ``idx``."""
    class_id = self.y[idx]
    # NaN entries are replaced with numbers before the transform is applied.
    img = self.transform(np.nan_to_num(self.X[idx]))
    return img, torch.tensor(class_id)

  def __len__(self):
    """Return the number of samples."""
    return len(self.X)

# Instantiate the Datasets

We will form them into torch dataloaders to make the data easier to work with. The transform pipeline below currently only converts each image to a tensor; any image augmentation for the train dataset would be added to it.

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms

# Only conversion to a tensor is applied to the images.
transform = transforms.Compose([transforms.ToTensor()])

dataset_stages = ['train', 'val', 'test']

batch_size = 32

# One dataset per stage. The test stage has no real labels, so a running
# index is passed as a placeholder.
image_datasets = {
    'train': CustomDataset(train_images, y_train.values, batch_size, transform),
    'val': CustomDataset(val_images, y_val.values, batch_size, transform),
    'test': CustomDataset(test_images, range(0, len(test_df)), batch_size, transform),
}

# Train and val are shuffled; test keeps its order so predictions stay
# aligned with the submission rows.
dataloaders = {
    stage: DataLoader(
        image_datasets[stage],
        batch_size=image_datasets[stage].BatchSize,
        shuffle=(stage != 'test'),
        num_workers=0,
    )
    for stage in dataset_stages
}

dataset_sizes = {stage: len(image_datasets[stage]) for stage in dataset_stages}

Check an image from the dataset

In [None]:
# Pull one transformed training image (index 412), turn it back into a PIL
# image and render it in the notebook.
sample_tensor = image_datasets['train'][412][0].cpu()
image = transforms.ToPILImage()(sample_tensor).convert("RGB")
display(image)

Create a Training Function

In [None]:
import time

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=10, early_stop_value=0, categorical=True):
    """Train ``model`` and evaluate it on the validation loader once per epoch.

    The function reads the module-level ``dataloaders`` (keys ``'train'`` and
    ``'val'``), ``dataset_sizes`` and ``device``.

    Args:
        model: network to train; it is updated in place.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training phase.
        num_epochs: maximum number of epochs.
        early_stop_value: when greater than 0, training stops after the first
            epoch whose validation "Acc" exceeds this value.
        categorical: when True, "Acc" is the share of samples whose arg-max
            output equals the label. When False, it is the per-sample mean of
            ``| |output| - |label| |``.

    Returns:
        The same ``model`` object after training.
    """
    since = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        val_accuracy = 0.0

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                # Cast on the target device; no round-trip through the CPU.
                labels = labels.to(device).long()

                # Loading Bar (based on the samples actually seen so far)
                if is_train:
                    samples_seen += inputs.size(0)
                    percentage_complete = min(100.0, (samples_seen / dataset_sizes[phase]) * 100)
                    print("{:0.2f}".format(percentage_complete), "% complete", end='\r')

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs).float()
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        # TODO: try removal
                        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                        optimizer.step()

                # statistics
                # The batch loss is added exactly once, whatever the metric mode.
                running_loss += loss.item() * inputs.size(0)
                # Detach so the metric bookkeeping never holds on to the autograd graph.
                detached_outputs = outputs.detach()
                if categorical:
                    predicted = torch.max(detached_outputs, 1)[1]
                    running_corrects = running_corrects + (predicted == labels).sum()
                else:
                    targets = labels.unsqueeze(1).float()
                    for i in range(0, len(detached_outputs)):
                        running_corrects = running_corrects + abs(abs(detached_outputs[i]) - abs(targets[i]))

            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # as_tensor also covers the case where the loader yielded no batch
            # and running_corrects is still the plain integer 0.
            epoch_acc = torch.as_tensor(running_corrects, dtype=torch.double) / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc.item()))

            # Early Stop
            if early_stop_value > 0 and phase == 'val':
                val_accuracy = epoch_acc.item()

        if early_stop_value > 0 and val_accuracy > early_stop_value:
            print("*** EARLY STOP ***")
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model

# Train Model

In [None]:
from torchvision import models
from torch.optim import lr_scheduler

# Start from an untrained ShuffleNet V2 (x1.0) and adapt both ends of it:
# the first convolution takes a single input channel, and the final layer
# produces 10 outputs.
model_ft = models.shufflenet_v2_x1_0()
model_ft.conv1[0] = nn.Conv2d(in_channels=1, out_channels=24, kernel_size=(2, 2), stride=(1, 1))
model_ft.fc = nn.Linear(in_features=1024, out_features=10, bias=True)
shufflenet = model_ft

In [None]:
# Loss, optimiser and a step schedule that multiplies the learning rate by
# 0.01 every 7 epochs.
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(params=model_ft.parameters(), lr=0.01)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.01)

# Train for 16 epochs on the selected device.
model_ft = train_model(
    model=model_ft.to(device),
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=16,
)

# Run on Test Set

In [None]:
# Predicted class code for every test row, in loader order (the test loader
# is not shuffled, so this matches the row order of the test table).
predictions = []

# Switch to evaluation mode once, before the loop.
model_ft.eval()

# Inference only: disabling gradient tracking avoids building an autograd
# graph for every batch.
with torch.no_grad():
    for inputs, _placeholder_labels in dataloaders['test']:
        outputs = model_ft(inputs.to(device))
        # Index of the largest output per row is the predicted class code.
        predictions.extend(torch.max(outputs, 1)[1].cpu().tolist())

Convert back to names as required for submission

In [None]:
# Lookup table from integer class code back to the original target name.
label_dict = dict(zip(saved_link_df["target_code"], saved_link_df["target"]))

# Translate every predicted code into its name, keeping the prediction order.
labels = [label_dict[prediction] for prediction in predictions]

In [None]:
# Load the sample submission and overwrite its target column with our predictions.
submissiondf = (
    pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")
    .assign(target=labels)
)
submissiondf.head()

In [None]:
# Write the submission file without the DataFrame index column.
submissiondf.to_csv(path_or_buf="submission.csv", index=False)

# Conclusion

This is actually my first experiment with creating polynomial features to tabular data and then applying a neural network to the result. It would have been nice to see more accuracy out of this but I am happy whenever an experiment has been seen through to its end point. Perhaps with some experimentation much greater accuracy can be achieved with this process. I doubt this is the last time I attempt this approach.   