In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
class Dataset(Dataset):
    """Map-style dataset backed by a CSV file of numeric columns.

    The CSV is read once and converted to float32 tensors in ``__init__``,
    so ``__getitem__`` is a cheap tensor index rather than a per-sample
    ``DataFrame.drop`` (which copies the whole frame on every access).

    The class keeps the name ``Dataset`` (shadowing the imported
    ``torch.utils.data.Dataset`` it derives from) because the rest of the
    notebook instantiates it under that name.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).
        train: If True, the ``is_canceled`` column is split off as the label
            and items are ``(features, label)`` pairs. If False, every
            column is a feature and items are bare feature tensors.

    Attributes:
        data: The raw ``DataFrame`` as read from ``path``.
        train: The mode flag passed to the constructor.
        features: float32 tensor with one row per CSV row.
        labels: float32 tensor of ``is_canceled`` values, or None when
            ``train`` is False.

    Note:
        The tensors are a snapshot taken at construction time; later changes
        to ``self.data`` or ``self.train`` are not reflected in the items.
    """

    def __init__(self, path, train=True):
        self.data = pd.read_csv(path)
        self.train = train

        if self.train:
            feature_frame = self.data.drop(columns='is_canceled')
            self.labels = torch.tensor(
                self.data['is_canceled'].to_numpy(dtype='float32')
            )
        else:
            feature_frame = self.data
            self.labels = None

        # Explicit dtype so mixed int/bool/float columns all become float32.
        self.features = torch.tensor(feature_frame.to_numpy(dtype='float32'))

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` in train mode, else ``features``.

        The label is a 0-dim float32 tensor; features is a 1-D float32 tensor.
        """
        if self.train:
            return (self.features[idx], self.labels[idx])

        return self.features[idx]

In [3]:
# load dataset
dataset = Dataset('../../data/train_final.csv')

# calculate size for training and validation dataset (90% / 10%)
dataset_size = len(dataset)
train_size = int(dataset_size * 0.9)
valid_size = dataset_size - train_size

# split dataset randomly; a dedicated seeded generator makes the partition
# reproducible across kernel restarts without touching the global RNG state
split_generator = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(
    dataset, [train_size, valid_size], generator=split_generator
)

# create dataloaders (default batch_size=1); only the training data needs
# shuffling, since the validation loss/accuracy sums do not depend on order
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False)

In [4]:
class LogisticRegression(nn.Module):
    """A single linear layer followed by a sigmoid.

    Args:
        n_x: number of input features.
        n_y: number of outputs.
    """

    def __init__(self, n_x, n_y):
        super().__init__()
        self.linear = nn.Linear(in_features=n_x, out_features=n_y)
        self.init_param()

    def init_param(self):
        """Initialise the layer with Kaiming-normal weights and a zero bias.

        Torch's global RNG is reseeded with 42 first, so the drawn weights
        are the same on every call. This also resets the global random
        stream for whatever runs afterwards.
        """
        torch.manual_seed(42)

        layer = self.linear
        nn.init.kaiming_normal_(layer.weight)
        nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``sigmoid(linear(x))``."""
        return torch.sigmoid(self.linear(x))

In [5]:
# define model: 8 input features mapped to 1 output
model = LogisticRegression(n_x=8, n_y=1)

In [6]:
# define loss function: binary cross-entropy on probabilities,
# averaged over the elements of each batch
criterion = nn.BCELoss(reduction='mean')

In [7]:
# define optimizer: plain SGD over every model parameter
optimizer = optim.SGD(params=model.parameters(), lr=0.001)

In [8]:
# train model
# Each epoch: one pass of SGD over train_loader, then a validation pass.
# The weights are checkpointed to 'model.pth' whenever validation accuracy
# beats the best seen so far.
EPOCHS = 10
best_accuracy = 0

for epoch in range(EPOCHS):

    # ---- training pass ----
    model.train()
    for x, y in train_loader:
        optimizer.zero_grad()
        output = model(x).view(-1)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    test_loss = 0.0
    correct = 0

    model.eval()
    with torch.no_grad():
        for x, y in valid_loader:
            output = model(x).view(-1)
            # nn.BCELoss defaults to the batch mean; weight it by the batch
            # size so that sum / n_samples is a true per-sample average for
            # any batch size, and accumulate a float rather than a tensor.
            test_loss += criterion(output, y).item() * y.numel()
            predict = (output >= 0.5)
            correct += (predict == y).sum().item()

    n_valid = len(valid_loader.dataset)
    test_loss /= n_valid
    accuracy = (correct / n_valid) * 100

    print(f"Epoch: {epoch}. Average loss: {test_loss:.6f}. Accuracy: {accuracy:.6f}")

    if best_accuracy < accuracy:
        print("Accuracy increased. Saving model...")
        torch.save(model.state_dict(), 'model.pth')
        best_accuracy = accuracy

    print("-----------------------------------------------------")

Epoch: 0. Average loss: 0.397001. Accuracy: 79.811451
Accuracy increased. Saving model...
-----------------------------------------------------
Epoch: 1. Average loss: 0.140550. Accuracy: 92.136123
Accuracy increased. Saving model...
-----------------------------------------------------
Epoch: 2. Average loss: 1.799639. Accuracy: 77.397103
-----------------------------------------------------
Epoch: 3. Average loss: 0.109928. Accuracy: 96.458956
Accuracy increased. Saving model...
-----------------------------------------------------
Epoch: 4. Average loss: 6.807484. Accuracy: 76.454357
-----------------------------------------------------
Epoch: 5. Average loss: 0.036516. Accuracy: 98.850310
Accuracy increased. Saving model...
-----------------------------------------------------
Epoch: 6. Average loss: 0.024270. Accuracy: 99.126236
Accuracy increased. Saving model...
-----------------------------------------------------
Epoch: 7. Average loss: 0.038588. Accuracy: 98.781329
----------

In [9]:
# rebuild the architecture, then restore the weights stored in 'model.pth'
model = LogisticRegression(n_x=8, n_y=1)
best_state = torch.load('model.pth')
model.load_state_dict(best_state)

<All keys matched successfully>

In [10]:
# unlabeled test set: with train=False every CSV column is a feature
test_dataset = Dataset('../../test_final.csv', train=False)
# default DataLoader settings (batch_size=1, no shuffling) keep file order
test_loader = DataLoader(test_dataset)

In [11]:
predict = []

# inference only: eval mode, and no autograd bookkeeping
model.eval()
with torch.no_grad():
    for features in test_loader:
        output = model(features).view(-1)
        # Same decision rule as the validation pass (probability >= 0.5 is
        # class 1). extend() on the thresholded batch works for any batch
        # size, not just the default of 1.
        predict.extend((output >= 0.5).int().tolist())

In [12]:
# sanity check: number of predictions collected
len(predict)

23525

In [13]:
# assemble the submission table: a running row index plus the predicted class
data = dict(index=range(len(predict)), is_canceled=predict)

submission = pd.DataFrame(data=data)

In [14]:
# display the submission frame
submission

Unnamed: 0,index,is_canceled
0,0,1
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
23520,23520,0
23521,23521,0
23522,23522,0
23523,23523,1


In [15]:
# write the predictions to disk, omitting the DataFrame's own row index
submission.to_csv(path_or_buf='submission.csv', index=False)