In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U optimal_pytorch

In [None]:
from optimal_pytorch.coin_betting.torch import Cocob

In [None]:
# Print the built-in documentation of the Cocob optimizer class for reference.
help(Cocob)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn import preprocessing


def preprocess(data, train=True, transformers=None):
    """Turn a raw competition frame into a standardized numeric feature matrix.

    Steps, in order: drop ``PassengerId`` (and ``Survived`` when ``train``),
    reduce ``Cabin`` to its first five characters with missing values mapped
    to ``'X'``, impute every column with its most frequent value,
    ordinal-encode the text columns, and standardize all columns.

    Args:
        data: DataFrame holding the competition columns.
        train: when True, ``Survived`` is split off and returned as labels.
        transformers: optional dict used to share the fitted imputer, encoder
            and scaler between calls. Pass an empty dict on the training call
            (it is filled in place) and the same dict on later calls, which
            then only *transform* with the training statistics. With the
            default ``None`` every step is fit on ``data`` itself, as before.

    Returns:
        ``(X, Y)`` when ``train`` is True, otherwise ``X``. ``X`` is the array
        produced by ``StandardScaler``; ``Y`` is the ``Survived`` column.
    """
    if train:
        X = data.drop(['Survived', 'PassengerId'], axis=1)
        Y = data['Survived']
    else:
        X = data.drop(['PassengerId'], axis=1)

    # An empty (or missing) registry means "fit now"; a populated one means
    # "reuse what was fitted earlier".
    fit = not transformers
    if transformers is None:
        transformers = {}
    if fit:
        transformers['imputer'] = SimpleImputer(strategy="most_frequent")
        # Categories that were not present at fit time are encoded as -1
        # instead of raising (requires scikit-learn >= 0.24).
        transformers['encoder'] = preprocessing.OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=-1)
        transformers['scaler'] = preprocessing.StandardScaler()

    def _apply(step_name, values):
        # Fit-and-transform on the first use, transform-only when reusing.
        step = transformers[step_name]
        return step.fit_transform(values) if fit else step.transform(values)

    X['Cabin'] = X['Cabin'].fillna('X').map(lambda x: x[0:5].strip())
    X = pd.DataFrame(_apply('imputer', X), columns=X.columns)
    cols = ['Sex', 'Cabin', 'Name', 'Ticket', 'Embarked']
    X[cols] = _apply('encoder', X[cols])
    X = _apply('scaler', X)
    if train:
        return X, Y
    else:
        return X


# Fit the preprocessing steps on the training data only and reuse them for the
# test data, so both sets share the same imputation values, category codes and
# scaling statistics.
fitted_transformers = {}
df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
x_train, y_train = preprocess(df, transformers=fitted_transformers)
df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
x_test = preprocess(df, train=False, transformers=fitted_transformers)

In [None]:
# Show the dimensions of the preprocessed training features.
x_train.shape

In [None]:
# List the distinct values present in the training labels.
np.unique(y_train)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import KFold
from torch.optim import SGD


def log_loss(w, x):
    """Return ``log(1 + exp(z))`` where ``z`` is the sum of ``<w, x[k]>`` over all rows of ``x``.

    Args:
        w: 1-D floating-point tensor of weights.
        x: 2-D floating-point tensor whose last dimension matches ``w``.

    Returns:
        A 0-dim tensor holding the loss value.
    """
    # The empty output spec contracts both indices, so the per-row products
    # are summed into a single scalar.
    wTx = torch.einsum('i,ki ->', w, x)
    # softplus(z) equals log(1 + exp(z)) but does not overflow to inf when z
    # is large.
    return torch.nn.functional.softplus(wTx)

def predict(w, x):
    """Classify each row of ``x`` as 1. or 0. from the logistic of ``<w, row>``.

    Rows whose logistic value is strictly greater than 0.5 are labelled 1.,
    rows at or below 0.5 are labelled 0.
    """
    logits = torch.einsum('i,ji -> j', w, x)
    probs = 1. / (1. + torch.exp(-logits))
    # Threshold in place: first the positives, then everything at or below 0.5.
    probs.masked_fill_(probs > .5, 1.)
    probs.masked_fill_(probs <= .5, 0.)
    return probs


class synthanic_data(Dataset):
    """In-memory dataset pairing a feature array with a label array.

    Every item is a dict ``{'x': features, 'y': labels}`` read from the same
    position of both tensors.
    """

    def __init__(self, x, y):
        """
        Args:
            x (numpy.ndarray): features, indexed along the first axis.
            y (numpy.ndarray): labels, one entry per entry of ``x``.

        Raises:
            ValueError: if ``x`` and ``y`` do not have the same length.
        """
        # Fail early instead of surfacing an IndexError deep inside a DataLoader.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}")
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Number of samples, taken from the feature tensor."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the sample(s) at ``idx`` as ``{'x': ..., 'y': ...}``."""
        # Tensor indices are converted to plain Python ints/lists first.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        features = self.x[idx]
        labels = self.y[idx]
        return {'x': features, 'y': labels}
    
    
# Model
class LogisticRegression(nn.Module):
    """A single linear layer that emits one raw score per class."""

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output unchanged."""
        return self.linear(x)


# Wrap the preprocessed training arrays in a torch Dataset
synthanic_torch = synthanic_data(np.array(x_train), np.array(y_train))

# Configuration options
SEED = 42
k_folds = 5
num_epochs = 5

# For fold results
results = {}

# Set fixed random number seed
torch.manual_seed(SEED)

dim = x_train.shape[1]
criterion = nn.CrossEntropyLoss()


# Define the K-fold Cross Validator.
# shuffle=True without a random_state draws from NumPy's global RNG, which is
# never seeded in this notebook; pinning it keeps the folds identical between runs.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Start print
print('--------------------------------')

# K-fold Cross Validation model evaluation: for every fold a fresh model is
# trained on the training indices and scored on the held-out indices.
for fold, (train_ids, test_ids) in enumerate(kfold.split(x_train)):

    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
                      synthanic_torch, batch_size=256, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      synthanic_torch, batch_size=10000, sampler=test_subsampler)

    # Init the model and optimizer
    model = LogisticRegression(dim, 2)
    optimizer = Cocob(model.parameters())

    # Run the training loop for defined number of epochs
    for epoch in range(num_epochs):
        print(f'Starting epoch {epoch+1}')
        # Sum of the per-batch losses seen during this epoch
        current_loss = 0.0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            inputs, targets = batch['x'].float(), batch['y']
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            current_loss += loss.item()

        # NOTE(review): this count is taken over the whole of x_train, which
        # also contains this fold's held-out rows — confirm that is intended.
        with torch.no_grad():
            outputs = model(torch.tensor(x_train).float())
            _, predicted = torch.max(outputs, 1)
            train_correct = int((predicted.numpy() == y_train).sum())

        print(f"Train loss after {epoch + 1} epochs {current_loss:.3f}")
        print(f"Corrected classified {train_correct}")

    print('Training process has finished. Saving trained model.')
    print('Starting testing')

    # Saving the model
    save_path = f'./model-fold-{fold}.pth'
    torch.save(model, save_path)

    # Evaluation for this fold
    correct, total = 0, 0
    with torch.no_grad():
        for batch in testloader:
            inputs, targets = batch['x'].float(), batch['y']
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            # .item() keeps the counters (and therefore `results`) as plain
            # Python numbers instead of 0-dim tensors.
            correct += (predicted == targets).sum().item()
            total += targets.shape[0]

    # Print accuracy
    print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
    print('--------------------------------')
    results[fold] = 100.0 * (correct / total)

# Print fold results: one line per fold, then the mean over all folds.
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
print('--------------------------------')
# A dedicated accumulator name avoids shadowing the built-in `sum`.
accuracy_total = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value:.2f} %')
    accuracy_total += float(value)
# The average is only meaningful once every fold has been added, so it is
# printed once after the loop (and skipped when there are no results).
if results:
    print(f'Average: {accuracy_total / len(results):.2f} %')

In [None]:
# Build the submission file from the predicted class of every test row.
# NOTE(review): `model` is whatever the cross-validation loop left behind,
# i.e. the model of the last fold — confirm this is the one meant for submission.
# The absolute path matches the one used to read train.csv/test.csv, so the
# cell does not depend on the current working directory.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv')
with torch.no_grad():
    outputs = model(torch.tensor(x_test).float())
    # Index of the larger of the two class scores is the predicted label.
    predicted = outputs.argmax(dim=1)
sample_submission['Survived'] = predicted.numpy().astype(int)
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame as a sanity check.
sample_submission.head()