<a href="https://colab.research.google.com/github/sestys/aicrowd_blitz4/blob/main/dd2421ml_programming_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Copy every Drive file whose name starts with dd2421_programming into /content/,
# which is the directory the CSV files are later read from.
!cp /content/drive/My\ Drive/Colab\ Notebooks/dd2421_programming* /content/

Mounted at /content/drive


In [None]:
# NOTE(review): the `pytorch_lightning` import in the imports cell is commented out,
# so this install looks unused -- confirm before removing it.
!pip install pytorch-lightning

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
# import pytorch_lightning as pl

# Load and fix data

In [3]:
data_path = '/content/'
# Column -> dtype map: every feature is a float except x5 (bool) and x6 (string).
data_types = {'x{}'.format(n): float for n in range(1, 11)}
data_types.update(x5=bool, x6=str)
# The training file is read without a dtype map; its columns are coerced in a later cell.
train = pd.read_csv(data_path + 'dd2421_programming_challenge_train.csv')
test = pd.read_csv(
    data_path + 'dd2421_programming_challenge_test.csv',
    dtype=data_types,
)

In [4]:
# train_fixed = train.drop(606) # drop the "ooh" row

In [5]:
# Coerce the raw training columns to the dtypes declared in `data_types`.
for col in data_types:
  if col == 'x5': # bool
    # Compare the string form so that both the text "True" and an actual boolean True
    # map to True. The plain `x == "True"` test turned real booleans into False, which
    # wiped the column whenever this cell was re-run.
    train[col] = train[col].astype(str) == "True"
  elif col == 'x6': # categorical, converted after the corrupt rows are removed
    continue
  else: #float
    # Unparseable entries become NaN so the corrupt rows can be filtered out afterwards.
    train[col] = pd.to_numeric(train[col], errors='coerce')

In [6]:
# Rows with at least one missing value are set aside for inspection.
corrupt_rows = train[train.isnull().any(axis=1)]
# Keep only complete rows. The explicit copy makes `train` an independent frame, so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
train = train[train.notnull().all(axis=1)].copy()
train['x6'] = train['x6'].astype('category')

In [7]:
# Shape of the cleaned training frame, then of the rows that were set aside.
print(train.shape, corrupt_rows.shape, sep='\n')

(997, 12)
(4, 12)


In [8]:
labels = train['y'].unique()
# Two-way lookup between integer class ids and class names.
idx2name = dict(enumerate(labels))
name2idx = {name: idx for idx, name in idx2name.items()}
idx2name

{0: 'Bob', 1: 'Atsuto', 2: 'Jörg'}

In [9]:
# Targets as a one-column frame of names, plus their integer encoding.
y = train[['y']]
y_num = y['y'].map(lambda name: name2idx[name])
# Features: everything except the id, the target and the categorical column x6.
x_train = train.drop(columns=['id', 'y', 'x6'])

In [10]:
# np.asarray accepts both the Series built in the previous cell and an existing ndarray,
# so re-running this cell no longer fails with AttributeError on `.to_numpy()`.
y_num = np.asarray(y_num)
y_num

array([0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 1, 1, 0,
       2, 2, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 2, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 2, 1, 0, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 1, 0, 1, 2, 0, 2, 0, 2, 2, 0, 1, 2,
       1, 0, 0, 0, 1, 2, 2, 0, 1, 1, 0, 0, 1, 0, 1, 1, 2, 1, 2, 0, 0, 0,
       2, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 1, 0, 0, 2, 0, 1, 1, 2, 1, 2, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1,
       1, 0, 0, 1, 0, 1, 2, 0, 1, 2, 1, 0, 1, 0, 1, 2, 2, 2, 1, 0, 1, 2,
       0, 1, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 2, 2, 1, 0, 2, 1, 2, 1, 1, 2, 0, 1, 2, 1, 0, 0, 2, 2, 0, 0, 0,
       1, 1, 0, 0, 2, 2, 1, 2, 0, 0, 0, 0, 1, 0, 1,

# Data exploration



In [11]:
# Feature dtypes followed by the distinct class names.
print(x_train.dtypes, y['y'].unique(), sep='\n')

x1     float64
x2     float64
x3     float64
x4     float64
x5        bool
x7     float64
x8     float64
x9     float64
x10    float64
dtype: object
['Bob' 'Atsuto' 'Jörg']


In [12]:
# (rows, columns) of the feature frame, then a preview of its first 20 rows.
print((len(x_train), x_train.shape[1]))
x_train.iloc[:20]

(997, 9)


Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10
0,-0.78103,-0.63375,-0.03665,0.27517,False,-1.85755,0.22668,-2.3513,1.42092
1,0.42649,-0.50019,0.00696,0.11817,False,-1.73761,-1.61015,-5.12219,1.87723
2,1.86817,-0.45565,0.07658,0.09662,False,1.02798,-0.92326,-1.67111,2.07828
3,-1.80073,0.08535,-0.02467,0.14032,False,2.02793,-0.46318,2.41329,-0.62963
4,1.56943,-1.2586,-0.02092,0.22267,False,-0.43304,-1.99881,-1.58241,2.28423
5,0.14361,-0.37353,-1.01562,1.00364,False,-0.05208,0.2106,-0.29282,3.84481
6,-0.34399,-1.16045,0.21745,0.07107,False,-0.67029,0.03244,-3.17673,1.74463
7,-0.05407,-0.47607,-0.56841,0.65207,False,-1.85532,-0.63706,-0.44894,2.58572
8,-0.71673,-0.27256,-0.38471,0.50048,False,-1.4087,1.59564,-1.92357,0.05368
9,-0.13301,0.31355,-0.92791,0.88101,False,0.25196,-0.25724,-1.00409,1.15892


# Split data

In [13]:
# Hold out 10% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_train,
    y_num,
    random_state=42,
    test_size=0.1,
)
print('{} {}'.format(X_train.shape, y_train.shape))
print('{} {}'.format(X_test.shape, y_test.shape))

(897, 9) (897,)
(100, 9) (100,)


# Training

## Random Forest
~85% mean accuracy with 10-fold cross-validation

In [None]:
# Cross-validation setup shared by the loop in the next cell.
k_fold_splits = 10
kf = KFold(k_fold_splits)
# Ensemble sizes to compare.
n_estimators = [10, 20, 50, 75, 100, 200, 500]

In [None]:
# k-fold cross-validation of a random forest for each candidate forest size.
for n_trees in n_estimators:
  print("Using {} trees in Random Forest".format(n_trees))
  acc = np.zeros((k_fold_splits))
  for i, (train_i, test_i) in enumerate(kf.split(x_train)):
    # Fold-local names: the hold-out split (X_train, X_test, y_train, y_test) created
    # earlier is consumed by the neural-network cells and must not be overwritten here.
    X_fold_train, X_fold_test = x_train.iloc[train_i], x_train.iloc[test_i]
    y_fold_train, y_fold_test = y.iloc[train_i], y.iloc[test_i]
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by current
    # scikit-learn releases. The fixed seed makes the reported scores repeatable.
    rf = RandomForestClassifier(n_estimators=n_trees, max_features='sqrt', random_state=42)
    rf.fit(X_fold_train, y_fold_train.values.ravel())
    pred = rf.predict(X_fold_test)
    acc[i] = accuracy_score(y_fold_test, pred)

  print('Mean: {:.2f}, stdev: {:.2f}, media: {:.2f}'.format(np.mean(acc), np.std(acc), np.median(acc)))
  print()

Using 10 trees in Random Forest
Mean: 0.84, stdev: 0.05, media: 0.82

Using 20 trees in Random Forest
Mean: 0.86, stdev: 0.05, media: 0.86

Using 50 trees in Random Forest
Mean: 0.86, stdev: 0.05, media: 0.85

Using 75 trees in Random Forest
Mean: 0.85, stdev: 0.05, media: 0.85

Using 100 trees in Random Forest
Mean: 0.86, stdev: 0.04, media: 0.87

Using 200 trees in Random Forest
Mean: 0.85, stdev: 0.05, media: 0.86

Using 500 trees in Random Forest


KeyboardInterrupt: ignored

## AdaBoost with naive Bayes
< 70% mean accuracy with 10-fold cross-validation

In [None]:
# Same grid and fold count as the random-forest experiment.
k_fold_splits = 10
n_estimators = [10, 20, 50, 75, 100, 200, 500]
kf = KFold(k_fold_splits)

In [None]:
# k-fold cross-validation of AdaBoost over Gaussian naive Bayes for each ensemble size.
for n_trees in n_estimators:
  print("Using {} trees in AdaBoost".format(n_trees))
  acc = np.zeros((k_fold_splits))
  for i, (train_i, test_i) in enumerate(kf.split(x_train)):
    # Fold-local names: the hold-out split (X_train, X_test, y_train, y_test) created
    # earlier is consumed by the neural-network cells and must not be overwritten here.
    X_fold_train, X_fold_test = x_train.iloc[train_i], x_train.iloc[test_i]
    y_fold_train, y_fold_test = y.iloc[train_i], y.iloc[test_i]
    n_b = GaussianNB()
    # The weak learner is passed positionally because its keyword was renamed from
    # `base_estimator` to `estimator`; the first positional slot works with both names.
    ada = AdaBoostClassifier(n_b, n_estimators=n_trees)
    ada.fit(X_fold_train, y_fold_train.values.ravel())
    pred = ada.predict(X_fold_test)
    acc[i] = accuracy_score(y_fold_test, pred)

  print('Mean: {:.2f}, stdev: {:.2f}, media: {:.2f}'.format(np.mean(acc), np.std(acc), np.median(acc)))
  print()

Using 10 trees in AdaBoost
Mean: 0.56, stdev: 0.16, media: 0.62

Using 20 trees in AdaBoost
Mean: 0.60, stdev: 0.15, media: 0.63

Using 50 trees in AdaBoost
Mean: 0.64, stdev: 0.07, media: 0.64

Using 75 trees in AdaBoost
Mean: 0.63, stdev: 0.08, media: 0.65

Using 100 trees in AdaBoost
Mean: 0.61, stdev: 0.10, media: 0.59

Using 200 trees in AdaBoost
Mean: 0.68, stdev: 0.05, media: 0.68

Using 500 trees in AdaBoost
Mean: 0.64, stdev: 0.11, media: 0.64



## Neural Network

### Dataloaders

In [14]:
class MLDataset(Dataset):
    """Dataset wrapper around tabular features and optional labels.

    Args:
        data: feature table with one sample per row; a pandas DataFrame or any
            array-like that exposes ``.shape`` and integer row indexing.
        labels: optional per-sample labels, aligned with ``data`` by position.
            When omitted, items are the feature tensors alone.

    Indexing returns a float32 feature tensor, or a ``(features, label)`` pair
    when labels were supplied.
    """

    def __init__(self, data, labels=None):
        self.data = data
        # Stored as an array so that labels[index] is always positional, even when
        # a pandas object with a non-default index is passed in.
        self.labels = None if labels is None else np.asarray(labels)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        # Select the row by position for pandas input, by plain indexing otherwise.
        row = self.data.iloc[index] if hasattr(self.data, 'iloc') else self.data[index]
        # Convert explicitly instead of handing a pandas row to torch.tensor: a row that
        # mixes bool and float columns is turned into one numeric float32 vector here.
        d = torch.tensor(np.asarray(row, dtype=np.float32))
        if self.labels is None:
            return d
        return d, self.labels[index]

In [15]:
# Wrap the hold-out split in datasets, then in shuffled mini-batch loaders of 16 samples.
trainset, validationset = MLDataset(X_train, y_train), MLDataset(X_test, y_test)

train_loader, validation_loader = (
    DataLoader(split, batch_size=16, shuffle=True)
    for split in (trainset, validationset)
)

In [None]:
# Number of mini-batches per epoch in each loader.
print(len(train_loader), len(validation_loader), sep='\n')

57
7


In [16]:
# Pull one batch to sanity-check the shapes the loader produces. The built-in next()
# is used because DataLoader iterators no longer expose a `.next()` method.
data_batch, labels_batch = next(iter(train_loader))
print(data_batch.shape)
print(labels_batch.shape)

torch.Size([16, 9])
torch.Size([16])


### Model

In [17]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [30]:
class FFModel(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Args:
        nclass: number of output classes.
        input_size: number of input features per sample.
        nh: width of the hidden layer.
        leakyRelu: when True the hidden activation is leaky ReLU instead of ReLU.

    ``forward`` returns log-probabilities (log-softmax over dim 1), so pair the
    output with ``F.nll_loss``.
    """

    def __init__(self, nclass, input_size, nh, leakyRelu=False):
        super(FFModel, self).__init__()
        self.layer_1 = nn.Linear(input_size, nh)
        self.layer_2 = nn.Linear(nh, nclass)
        # Remember the flag; it was previously accepted but never used.
        self.leaky_relu = leakyRelu

    def forward(self, input):
        x = self.layer_1(input)
        x = F.leaky_relu(x) if self.leaky_relu else F.relu(x)
        x = self.layer_2(x)
        return F.log_softmax(x, dim=1)



In [31]:
# 3 classes, 9 input features, 64 hidden units. The model is moved to `device` because
# the training and evaluation loops move every batch there; without this a CUDA run
# fails with a device mismatch between the inputs and the weights.
model = FFModel(3, 9, 64).to(device)
# The optimizer is created after the move so it tracks the parameters on their final device.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [32]:
# Train for `max_epoch` epochs, recording the loss of every update and the mean per epoch.
max_epoch = 10
epoch_losses = []
iteration_losses = []
num_updates_epochs = []
# Make sure training mode is active even if the evaluation cell (model.eval()) ran before.
model.train()
for epoch in tqdm(range(1, max_epoch + 1)):
    epoch_loss_list = []
    num_updates_epoch = 0
    # Batch-local names: the label frame `y` built earlier is still used by the
    # cross-validation cells and must not be overwritten by a batch of targets.
    for batch_x, batch_y in tqdm(train_loader, leave=False):
        optimizer.zero_grad()
        batch_x = batch_x.float().to(device)
        batch_y = batch_y.to(device)
        y_hat = model(batch_x)
        # The model already outputs log-probabilities, so NLL is the matching loss
        # (the same one the evaluation cell uses); cross_entropy would apply
        # log-softmax a second time.
        loss = F.nll_loss(y_hat, batch_y)
        iteration_loss = loss.item()

        # Skip the update when the loss is not finite.
        if np.isnan(iteration_loss) or np.isinf(iteration_loss):
            continue

        num_updates_epoch += 1
        iteration_losses.append(iteration_loss)
        epoch_loss_list.append(iteration_loss)
        loss.backward()
        # Cap the gradient norm at 5 before stepping.
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

    # If every batch was skipped there is nothing to average; report NaN without a warning.
    epoch_loss = np.mean(epoch_loss_list) if epoch_loss_list else float('nan')
    print("Epoch:{}    Loss:{}    NumUpdates:{}".format(epoch, epoch_loss, num_updates_epoch))
    epoch_losses.append(epoch_loss)
    num_updates_epochs.append(num_updates_epoch)


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:1    Loss:0.9209292321874384    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:2    Loss:0.6281961513715878    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:3    Loss:0.525521643329085    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:4    Loss:0.48182019695901035    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:5    Loss:0.46765818303091483    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:6    Loss:0.4240732256551845    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:7    Loss:0.413910455086775    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:8    Loss:0.4153307622723412    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:9    Loss:0.38923199906160955    NumUpdates:57


HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))

Epoch:10    Loss:0.3925762579106448    NumUpdates:57



In [37]:
# Score the model on the validation loader: per-sample average NLL and accuracy.
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
    for data, target in validation_loader:
        data = data.float().to(device)
        target = target.to(device)
        output = model(data)
        # Batch losses are summed so the division below gives a per-sample average.
        test_loss += F.nll_loss(output, target, reduction='sum').item()
        # Predicted class is the index of the largest log-probability.
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()

test_loss /= len(validation_loader.dataset)

print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss,
    correct,
    len(validation_loader.dataset),
    100. * correct / len(validation_loader.dataset),
))


Test set: Average loss: 0.3456, Accuracy: 80/100 (80%)

