In [None]:
!pip install tab2img

# Introduction

I am going to convert the tabular data here into images and then apply a CNN to the result, which is quite a novel approach to the problem. I do not expect this to be very successful right from the start; however, it is an experiment I have always wanted to try, and an approach I really liked when I saw it applied to tabular data. I have seen this used successfully in a commercial project, and I do believe that in general this approach can be effective, particularly as an additional means of analysing tabular data alongside other methods.

In [None]:
import numpy as np 
import pandas as pd
from PIL import Image
from dateutil.parser import parse
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import optim
import torch.nn as nn

# Load Data

We will load the tabular data, process it, and transform it into images.

In [None]:
# Read the training table and one-hot encode every object-typed column.
# Encoding all of them in one call keeps the non-object columns first and
# appends the dummy columns in the original column order.
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
categorical_columns = df.select_dtypes(include='object').columns.tolist()
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

I will process the test set the same way and make sure it has the same columns as the training set.

In [None]:
# Read the test table and one-hot encode its object-typed columns, exactly as
# was done for the training table.
test_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
test_categorical_columns = test_df.select_dtypes(include='object').columns.tolist()
test_df = pd.get_dummies(test_df, columns=test_categorical_columns)

# Align the test features with the training features (everything in `df`
# except the target and the row identifier):
#   * columns present in train but missing in test are added, filled with 0;
#   * columns that only exist in test are dropped, since the scaler and model
#     are fitted on the training columns only;
#   * the column order is forced to match the training frame.
feature_columns = df.columns.drop(["SalePrice", "Id"])
test_df = test_df.reindex(columns=feature_columns, fill_value=0)
test_df.head()

Separate the tabular data into X and y

In [None]:
# Target is the sale price; features are every other column except the row id.
target_column = "SalePrice"
y = df[target_column]
X = df.drop(columns=[target_column, "Id"])

Separate the training and validation sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

train_ratio = 0.90
# Fraction held out for validation; kept as `1 - train_ratio` so the split
# size is computed from exactly the same floating-point value.
val_ratio = 1 - train_ratio

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=val_ratio,
    random_state=0,
)

Scale X and y

In [None]:
# Feature scaling: the min-max scaler is fitted on the training features only
# and then applied unchanged to the validation and test features.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_scaled_train, X_scaled_val, X_scaled_test = [
    scaler.transform(features) for features in (X_train, X_val, test_df)
]

# Target scaling: the scaler expects a 2-D input, so reshape to one column.
y_train_column = y_train.values.reshape(-1, 1)
y_val_column = y_val.values.reshape(-1, 1)
y_scaler = preprocessing.StandardScaler()
y_scaler.fit(y_train_column)
y_scaled_train = y_scaler.transform(y_train_column)
y_scaled_val = y_scaler.transform(y_val_column)

Convert tabular data to images

In [None]:
from tab2img.converter import Tab2Img
# Fit the tabular-to-image converter on the scaled training features together
# with the scaled training target, then reuse the fitted converter for the
# validation and test features.
model = Tab2Img()
train_images = model.fit_transform(X_scaled_train, y_scaled_train)
val_images, test_images = [
    model.transform(features) for features in (X_scaled_val, X_scaled_test)
]

Let's visualize the images

In [None]:
def show_image_grid(images, n_rows=2, n_cols=5):
    """Plot the first ``n_rows * n_cols`` entries of ``images`` in a grid.

    Args:
        images: indexable collection of per-sample arrays. An entry that is
            not already 2-D is reshaped to a square whose side is the square
            root of its element count.
        n_rows: number of grid rows.
        n_cols: number of grid columns.
    """
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    n_shown = min(n_rows * n_cols, len(images))
    for i in range(n_shown):
        pixels = np.asarray(images[i])
        if pixels.ndim != 2:
            side = int(round(pixels.size ** 0.5))
            pixels = pixels.reshape(side, side)
        picture = Image.fromarray(pixels * 255)
        # Fill column by column: i=0 -> (0, 0), i=1 -> (1, 0), i=2 -> (0, 1), ...
        axes[i % n_rows][i // n_rows].imshow(picture)
    # plt.show() renders on the notebook's inline backend; Figure.show() is
    # meant for GUI backends.
    plt.show()


show_image_grid(test_images)
show_image_grid(train_images)

# Create the Custom Dataset Class

We need a way to feed each image and its label into the model we will create, so we will write a custom dataset class to handle this.

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Dataset that pairs each sample in ``X`` with the target at the same index in ``y``.

  Args:
    X: indexable collection of per-sample arrays; ``len(X)`` is the dataset size.
    y: indexable collection of targets, aligned with ``X``.
    BatchSize: batch size, stored for callers and used by ``num_of_batches``.
    transform: callable applied to each sample after NaNs are replaced.
  """

  def __init__(self, X, y, BatchSize, transform):
    super().__init__()
    self.BatchSize = BatchSize
    self.y = y
    self.X = X
    self.transform = transform

  def num_of_batches(self):
    """
    Return the number of complete batches of size ``BatchSize`` in the dataset.
    """
    # Previously referenced an unimported `math` module and a non-existent
    # `list_IDs` attribute; the dataset length is simply len(self.X).
    return len(self.X) // self.BatchSize

  def __getitem__(self, idx):
    """Return ``(transformed sample, target as a tensor)`` for position ``idx``."""
    target = self.y[idx]
    # NaNs in the sample are replaced (np.nan_to_num) before the transform runs.
    img = self.transform(np.nan_to_num(self.X[idx]))
    return img, torch.tensor(target)

  def __len__(self):
    """Number of samples in the dataset."""
    return len(self.X)

# Instantiate the Datasets

We will wrap the datasets in torch DataLoaders to make the data easier to work with. Note that the transform below only converts each image to a tensor and normalises it; no image augmentation is applied to the train dataset.

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms

# Per-sample preprocessing: convert to a tensor, then normalise with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

dataset_stages = ['train', 'val', 'test']

batch_size = 32

# (images, targets) for each stage. The test set has no real targets, so a
# running index is used as a placeholder label.
stage_sources = {
    'train': (train_images, y_train.values),
    'val': (val_images, y_val.values),
    'test': (test_images, range(0, len(test_df))),
}
image_datasets = {
    stage: CustomDataset(images, targets, batch_size, transform)
    for stage, (images, targets) in stage_sources.items()
}

# Train and validation batches are shuffled; test order is preserved so the
# predictions line up with the submission rows.
stage_shuffle = {'train': True, 'val': True, 'test': False}
dataloaders = {
    stage: DataLoader(
        dataset,
        batch_size=dataset.BatchSize,
        shuffle=stage_shuffle[stage],
        num_workers=0,
    )
    for stage, dataset in image_datasets.items()
}
dataset_sizes = {stage: len(image_datasets[stage]) for stage in ['train', 'val', 'test']}

Check an image from the dataset

In [None]:
# Take the image tensor (first element of the pair) of one training sample,
# convert it back to a PIL image and show it.
sample_tensor = image_datasets['train'][412][0]
to_pil = transforms.ToPILImage()
image = to_pil(sample_tensor.cpu()).convert("RGB")
display(image)

Create a Training Function

In [None]:
import time

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=10):
    """Train ``model`` and evaluate it on the validation split once per epoch.

    Reads the notebook-level ``dataloaders``, ``dataset_sizes`` and ``device``
    globals. For every epoch it runs a 'train' phase (with gradient updates)
    followed by a 'val' phase (no updates) and prints the per-sample loss and
    an absolute-error metric for each phase.

    Args:
        model: network to optimise; called as ``model(inputs)``.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimiser; ``zero_grad`` is called for every batch and
            ``step`` for every training batch.
        scheduler: learning-rate scheduler, stepped once after each training phase.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object after training.
    """
    since = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            # Plain Python float: accumulating tensors that are still attached
            # to the autograd graph would keep every batch's graph alive for
            # the whole epoch.
            running_abs_error = 0.0
            samples_seen = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                # Loading Bar
                if phase == 'train':
                    samples_seen += inputs.size(0)
                    percentage_complete = (samples_seen / dataset_sizes[phase]) * 100
                    print("{:0.2f}".format(percentage_complete), "% complete", end="\r")

                inputs = inputs.to(device)
                # Add a trailing dimension so targets line up with the
                # model's (batch, 1) output.
                targets = labels.to(device).unsqueeze(-1)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs.float(), targets)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                with torch.no_grad():
                    # Sum over the batch of | |prediction| - |target| |,
                    # computed in one vectorised step instead of per sample.
                    batch_abs_error = (outputs.abs() - targets.float().abs()).abs().sum()
                running_abs_error += batch_abs_error.item()

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # NOTE: printed under the historical "Acc" label, but this is a
            # mean absolute error of magnitudes, so lower is better.
            epoch_acc = running_abs_error / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model

# Define the CNN

Rather than adapting a pretrained ShuffleNet, the cell below defines a small custom CNN: its first layer accepts the small single-channel images, and the final layer outputs a single value, which suits a regression problem.

In [None]:
from torchvision import models
from torch.optim import lr_scheduler

class Net(nn.Module):
    """Small CNN regressor.

    Two convolution / batch-norm / ReLU / max-pool stages operate on
    single-channel images; the result is flattened and passed through a stack
    of fully connected layers ending in a single output value per sample.

    The first fully connected layer expects 64 flattened features, which is
    what a 1x17x17 input produces (4 channels x 4 x 4 after the two pools).
    """

    def __init__(self):
        super().__init__()

        feature_stages = [
            # First stage: 1 -> 4 channels, 2x2 kernel.
            nn.Conv2d(1, 4, kernel_size=2, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Second stage: 4 -> 4 channels, 3x3 kernel.
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.cnn_layers = nn.Sequential(*feature_stages)

        # Four 64 -> 64 layers followed by the 64 -> 1 regression head.
        hidden_layers = [nn.Linear(64, 64) for _ in range(4)]
        self.linear_layers = nn.Sequential(*hidden_layers, nn.Linear(64, 1))

    def forward(self, x):
        """Run the forward pass and return one value per input sample."""
        features = self.cnn_layers(x)
        # Flatten everything except the batch dimension.
        flattened = features.view(features.size(0), -1)
        return self.linear_layers(flattened)

# Train Model

In [None]:
# Move the model to the target device *before* building the optimizer, so the
# optimizer is constructed over the parameters it will actually update.
model_ft = Net().to(device)

# Mean absolute error between predictions and targets.
criterion = nn.L1Loss()

optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.01)

# Multiply the learning rate by 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, 30)

# Make Submission

In [None]:
# Run the trained model over the test loader and collect one prediction per row.
model_ft.eval()  # evaluation mode only needs to be set once, not per batch
predictions = []
with torch.no_grad():  # inference only: no autograd bookkeeping needed
    # The test "labels" are placeholder indices and are not used here.
    for inputs, _unused_labels in dataloaders['test']:
        batch_outputs = model_ft(inputs.to(device))
        # reshape(-1) always yields a 1-D array, even for a final batch that
        # holds a single sample (squeeze() would give a 0-d array there, which
        # cannot be iterated).
        predictions.extend(batch_outputs.cpu().detach().numpy().reshape(-1))

In [None]:
# Start from the sample submission (it already carries the expected row ids),
# overwrite its SalePrice column with the predictions, and write the result.
submission_df = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
)
submission_df["SalePrice"] = predictions
submission_df.to_csv(path_or_buf="submission.csv", index=False)

# Conclusion

The accuracy here isn't going to win the leaderboard, but I think it is interesting how much was achieved given how experimental this approach is and the admittedly odd use of a CNN. I am very happy with the outcome. I must admit it took me quite a few revisions to get this off the ground, and there were many bugs and issues along the way. I think this is now a tool in my toolbelt for the future, though, and I hope it can help you too.