<div>
    <img src="https://storage.googleapis.com/kaggle-datasets-images/107706/256873/21d3eec8c2d5c04b7014f61ae3b516be/dataset-cover.jpg" />
</div>

In [None]:
import numpy as np
import math
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F 
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as dsets

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

<h1 id="dataset" style="color:black; background:white; border:0.5px dotted;"> 
    <center>Dataset Preparation
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Fixed seed: the rows are partitioned positionally further down, so an
# unseeded shuffle would yield a different test/train/unlabeled split
# (and different results) on every Restart & Run All.
SEED = 42

path = '../input/cardiovascular-disease-dataset/cardio_train.csv'
# The file is semicolon-separated; the `id` column becomes the row index.
df = pd.read_csv(path,
                 delimiter=';',
                 index_col='id')
df = shuffle(df, random_state=SEED)
df.head()

In [None]:
normalize_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Column name -> (mean, std) of the raw values, captured before the column
# is overwritten with its standardised version.
mean_std = {
    column: (df[column].mean(), df[column].std())
    for column in normalize_columns
}

# Standardise each listed column in place: (value - mean) / std.
for column, (mu, sigma) in mean_std.items():
    df[column] = (df[column] - mu) / sigma

In [None]:
# Pairwise correlation of every column in `df`, drawn on a 14x7 inch figure.
fig, heat = plt.subplots(figsize=(14, 7))
correlations = df.corr()
# Draw onto the axes created above; `heat` ends up bound to the axes
# object returned by seaborn.
heat = sns.heatmap(correlations, ax=heat)

In [None]:
# value_counts() orders by frequency, not by label, so without sort_index()
# the hard-coded tick labels below would be swapped whenever class 1 is the
# majority. Sorting by label pins bar 0 to class 0 and bar 1 to class 1.
# (The tick labels assume cardio == 0 means "No Disease".)
df['cardio'].value_counts().sort_index().plot(kind='bar')
plt.xticks([0, 1], ['No Disease', 'Disease'])
plt.ylabel('Count')

<h1 id="dataset_split" style="color:black; background:white; border:0.5px dotted;"> 
    <center>Dataset Split
        <a class="anchor-link" href="#dataset_split" target="_self">¶</a>
    </center>
</h1>

In [None]:
class CardioDataset(Dataset):
    """Map-style dataset yielding ``(features, one_hot_label)`` pairs.

    Args:
        features: indexable collection of per-sample feature vectors
            (tensor, array or nested list).
        labels: indexable collection of integer class ids, aligned with
            ``features``.
        num_classes: width of the one-hot label vector. Defaults to 2,
            which matches the previously hard-coded behaviour.

    Each item is a tuple ``(x, y)`` where ``x`` is a ``torch.float`` tensor
    and ``y`` is a ``torch.long`` one-hot vector of length ``num_classes``.
    """

    def __init__(self, features, labels, num_classes=2):
        self.features = features
        self.labels = labels
        self.num_classes = num_classes

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # as_tensor avoids the "copy construct from a tensor" warning that
        # torch.tensor(tensor) raises; copy=True keeps the old guarantee that
        # the returned sample never aliases the stored features.
        x = torch.as_tensor(self.features[idx]).detach().to(dtype=torch.float,
                                                             copy=True)

        # One-hot encode directly in torch instead of via a NumPy identity
        # matrix; works for 0-dim tensors, NumPy ints and Python ints alike.
        y = torch.as_tensor(self.labels[idx], dtype=torch.long)
        y = F.one_hot(y, num_classes=self.num_classes)
        return x, y

In [None]:
# Cumulative row offsets for the three partitions of the shuffled frame:
# 25% test, then 1% labeled training data, then 74% treated as unlabeled.
n_rows = len(df)
test_ind = round(n_rows * 0.25)
train_ind = test_ind + round(n_rows * 0.01)
unlabeled_ind = train_ind + round(n_rows * 0.74)

# Slice the frame positionally into consecutive, non-overlapping ranges.
test, train, unlabeled = (
    df.iloc[start:stop]
    for start, stop in ((None, test_ind),
                        (test_ind, train_ind),
                        (train_ind, unlabeled_ind))
)

In [None]:
def test_train_unlabeled(test, train, unlabeled):
    X_train = train.drop('cardio', axis=1)
    X_train = torch.tensor(X_train.values)

    y_train = train.cardio
    y_train = torch.tensor(y_train.values)

    X_unlabeled = unlabeled.drop('cardio', axis=1)
    X_unlabeled = torch.tensor(X_unlabeled.values)
    X_unlabeled = X_unlabeled.type(torch.float)

    X_test = test.drop('cardio', axis=1)
    X_test = torch.tensor(X_test.values)

    y_test = test.cardio
    y_test = torch.tensor(y_test.values)
    
    return X_train, y_train, X_unlabeled, X_test, y_test

In [None]:
# Turn the DataFrame partitions into the tensors used for training,
# pseudo-labelling and evaluation.
(X_train, y_train,
 X_unlabeled,
 X_test, y_test) = test_train_unlabeled(test, train, unlabeled)

<h1 id="model" style="color:black; background:white; border:0.5px dotted;"> 
    <center>Model
        <a class="anchor-link" href="#model" target="_self">¶</a>
    </center>
</h1>

In [None]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer mapping ``input_dim`` features to ``output_dim`` logits.

    ``forward`` returns the raw linear outputs; no sigmoid/softmax is applied
    here. Note that this class shadows the ``LogisticRegression`` imported
    from ``sklearn.linear_model`` at the top of the notebook.
    """

    def __init__(self, input_dim=11, output_dim=2):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim,
                                      out_features=output_dim)

    def forward(self, x):
        return self.linear(x)

In [None]:
# Linear classifier with its default dimensions (11 inputs, 2 output logits).
model = LogisticRegression()

# Cross-entropy over the logits, optimised with plain SGD.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

<h1 id="training" style="color:black; background:white; border:0.5px dotted;"> 
    <center>Training
        <a class="anchor-link" href="#training" target="_self">¶</a>
    </center>
</h1>

In [None]:
nr_episodes = 10   # pseudo-labelling rounds
nr_epochs = 100    # supervised epochs per round
log_every = 40     # print metrics every `log_every` epochs


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, mean_accuracy)``.

    Both values are the mean of the per-batch values. If ``optimizer`` is
    given, the model is updated after every batch; otherwise the pass is
    evaluation-only and runs with gradient tracking disabled.
    """
    training = optimizer is not None
    n_batches = len(loader)
    mean_loss = 0.0
    mean_acc = 0.0

    with torch.set_grad_enabled(training):
        for x, y in loader:
            x = x.float()
            # The dataset yields one-hot labels; the loss needs class indices.
            y_max = y.argmax(dim=1)

            outputs = model(x)
            loss = criterion(outputs, y_max)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            correct = (outputs.argmax(dim=1) == y_max).sum().item()
            mean_acc += correct / len(x) / n_batches
            # .item() detaches the value so no autograd graph is kept alive.
            mean_loss += loss.item() / n_batches

    return mean_loss, mean_acc


for ep in range(nr_episodes):

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    train_ds = CardioDataset(X_train.clone().detach(),
                             y_train.clone().detach())
    test_ds = CardioDataset(X_test.clone().detach(),
                            y_test.clone().detach())

    train_loader = DataLoader(train_ds, batch_size=8)
    # Fix: validation must read the held-out set; this loader was previously
    # built from train_ds, so the reported "val" metrics were train metrics.
    test_loader = DataLoader(test_ds, batch_size=8)

    for e in range(nr_epochs):
        epoch_loss, epoch_accs = _run_epoch(model, train_loader,
                                            criterion, optimizer)

        if (e + 1) % log_every == 0:
            # Validation metrics are only consumed by this print, so they are
            # computed only on logging epochs.
            val_epoch_loss, val_epoch_accs = _run_epoch(model, test_loader,
                                                        criterion)
            print('i:{:3d}, Epoch:{:4d}, train_loss:{:1.3f}, ' \
                  'epoch_acc:{:1.3f}, val_loss:{:1.3f}, val_accs:{:1.3f}'
                      .format(ep+1, e+1, epoch_loss, epoch_accs,
                              val_epoch_loss, val_epoch_accs))

    # Pseudo-label the unlabeled pool: rows whose predicted class probability
    # exceeds the threshold are moved into the training set with that class
    # as the label. The threshold rises by 0.01 per episode and stays above
    # 0.5, so with two classes each row can match at most once.
    with torch.no_grad():
        pred_probs = F.softmax(model(X_unlabeled), dim=1)

    threshold = 0.9 + ep * 0.01
    high_prob_indices = torch.where(pred_probs > threshold)
    rows, classes = high_prob_indices

    # Cast explicitly so concatenation does not rely on dtype promotion.
    X_train = torch.cat((X_train, X_unlabeled[rows].to(X_train.dtype)), 0)
    y_train = torch.cat((y_train, classes.to(y_train.dtype)), 0)

    # A boolean mask replaces the Python-level `a not in tensor` filter,
    # which did a membership scan for every remaining row.
    keep = torch.ones(len(X_unlabeled), dtype=torch.bool)
    keep[rows] = False
    X_unlabeled = X_unlabeled[keep]

    print('------------- Adding {:}# of features, ' \
          '{:}# of unlabeled remaining -------------'
            .format(len(rows), len(X_unlabeled)))