In [None]:
import random

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Notebook-wide configuration.
SEED = 42  # "Answer to the Ultimate Question of Life, the Universe, and Everything"
batch_size = 16


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs with the same value.

    Also switches cuDNN to its deterministic mode.
    """
    torch.backends.cudnn.deterministic = True
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)


seed_everything(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the raw diabetes table from the Kaggle input folder and preview its first 30 rows.
diabetes_csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
diabetes_data = pd.read_csv(diabetes_csv_path)
diabetes_data.head(30)

In [None]:
# Feature table used by the model: four derived ratio / product features, three raw
# columns carried over unchanged, and the target as the LAST column (the split cell
# below slices features with `[:, :-1]` and the target with `[:, -1]`).
model_data = pd.DataFrame({
    'Insulin/Age':              diabetes_data['Insulin'] / diabetes_data['Age'],
    'BMI/Age':                  diabetes_data['BMI'] / diabetes_data['Age'],
    'Pregnancies/Age':          diabetes_data['Pregnancies'] / diabetes_data['Age'],
    'Insulin*Glucose':          diabetes_data['Insulin'] * diabetes_data['Glucose'],
    'BloodPressure':            diabetes_data['BloodPressure'],
    'SkinThickness':            diabetes_data['SkinThickness'],
    'DiabetesPedigreeFunction': diabetes_data['DiabetesPedigreeFunction'],
    'DiabetesDetected':         diabetes_data['Outcome'],
})

In [None]:
# Preview the first five rows of the engineered feature table.
model_data.head(5)

In [None]:
# Ordered (unshuffled) split of model_data: first ~70% of the rows for training,
# next ~20% for validation, remainder for testing.
model_data_size = len(model_data)
train_stop = int(0.7*model_data_size)
valid_stop = int(0.7*model_data_size+0.2*model_data_size)

train = model_data.iloc[:train_stop].reset_index(drop=True)
valid = model_data.iloc[train_stop:valid_stop].reset_index(drop=True)
test  = model_data.iloc[valid_stop:].reset_index(drop=True)

# The scaler learns its min/max from the training features only, so no statistics
# from the validation or test rows leak into the preprocessing.
scaler = MinMaxScaler()
scaler.fit(train.iloc[:, :-1])

# Features become scaled NumPy arrays; targets stay pandas Series (last column).
x_train, y_train = scaler.transform(train.iloc[:, :-1]), train.iloc[:, -1]
x_valid, y_valid = scaler.transform(valid.iloc[:, :-1]), valid.iloc[:, -1]
x_test, y_test = scaler.transform(test.iloc[:, :-1]), test.iloc[:, -1]

In [None]:
# Show the first four validation rows after scaling.
x_valid[:4]

In [None]:
# Pairwise correlation of every column in model_data, target column included.
model_data.corr()

In [None]:
# Report which target values occur, and how often, in the full table and in each split.
split_frames = (
    ('Full dataset:', model_data),
    ('Train set:', train),
    ('Valid set:', valid),
    ('Test set:', test),
)

for position, (heading, frame) in enumerate(split_frames):
    if position > 0:
        print()  # blank line between consecutive reports
    target = frame['DiabetesDetected']
    print(heading)
    print(target.unique())
    print(target.value_counts())

In [None]:
# Bar chart of the class balance.
# Positions and heights are both taken from the SAME value_counts() result:
# unique() lists classes in order of first appearance while value_counts() sorts by
# frequency, so mixing the two can attach a count to the wrong class.
class_counts = model_data['DiabetesDetected'].value_counts().sort_index()

plt.bar(class_counts.index, class_counts.values)
plt.xticks(class_counts.index)
plt.xlabel('DiabetesDetected')
plt.ylabel('Number of samples')
plt.title('Class balance')
plt.show()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing a 2-D feature array with a 1-D label sequence.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this class
    reuses the name ``Dataset``: inheriting from the bare name would make the class
    subclass its own previous definition every time the cell is re-run.

    Args:
        x: 2-D array-like of features, one row per sample.
        y: 1-D array-like of labels, aligned with ``x`` by position.
    """

    def __init__(self, x, y):
        # Convert once to NumPy so that indexing is purely positional. A pandas
        # Series indexed with [] is label-based and only behaves positionally when
        # its index happens to be 0..n-1.
        self.x = np.asarray(x)
        self.y = np.asarray(y)

    def __len__(self):
        """Number of samples (rows of ``x``)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for position ``idx`` as NumPy arrays.

        Raises:
            IndexError: if ``idx`` is out of range (this also terminates plain
                ``for item in dataset`` iteration).
        """
        # The tolist() round trip rebuilds the values from plain Python scalars.
        features = np.array(self.x[idx, :].tolist())
        labels = np.array(self.y[idx].tolist())

        sample = features, labels
        return sample
    
def unbalanced_dataset_weights(instances, num_classes):
    """Compute one sampling weight per instance, inversely proportional to class size.

    Each instance receives ``total / count[label]``, so samples from rarer classes
    get larger weights (suitable for a weighted random sampler).

    Args:
        instances: iterable of ``(features, label)`` pairs; every label must be
            convertible to an int in ``range(num_classes)``.
        num_classes: number of distinct classes.

    Returns:
        A list of floats, one per instance and in the same order. An empty input
        yields an empty list.
    """
    # Read every label exactly once; each access may be a full __getitem__ call.
    labels = [int(item[1]) for item in instances]

    count = [0] * num_classes
    for label in labels:
        count[label] += 1

    total = float(len(labels))

    # A class without samples never appears in `labels`, so its weight is never
    # looked up; 0.0 simply avoids dividing by zero.
    class_weight = [total / occurrences if occurrences else 0.0 for occurrences in count]

    return [class_weight[label] for label in labels]

In [None]:
# Training loader: batches are drawn with replacement by a weighted sampler whose
# per-sample weights are inversely proportional to class frequency, so both classes
# are seen about equally often. shuffle must stay off when a sampler is supplied.
train_dataset = Dataset(x_train, y_train)
weights = torch.tensor(unbalanced_dataset_weights(train_dataset, 2), dtype=torch.float)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, sampler=sampler)

# Evaluation loaders: aggregate metrics do not depend on sample order, so shuffling
# is switched off to keep evaluation deterministic.
valid_dataset = Dataset(x_valid, y_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# No drop_last here: dropping the final partial batch would silently exclude test
# samples from the reported metrics.
test_dataset = Dataset(x_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Pull a single batch from the training loader and display it as a sanity check.
item = next(iter(train_dataloader))
item

In [None]:
class MLP(nn.Module):
    """Fully connected binary classifier: 7 inputs -> 16 -> 32 -> 64 -> 32 -> 16 -> 1.

    Hidden layers use GELU; the single output unit goes through a sigmoid, so the
    forward pass yields one value in (0, 1) per input row.
    """

    # Layer widths from the input through to the output.
    _WIDTHS = (7, 16, 32, 64, 32, 16, 1)

    def __init__(self):
        super().__init__()

        # Registers linear1 ... linear6 in order, one per consecutive pair of widths.
        for position, (fan_in, fan_out) in enumerate(zip(self._WIDTHS, self._WIDTHS[1:]), start=1):
            setattr(self, f'linear{position}', nn.Linear(fan_in, fan_out))

        self.gelu = nn.GELU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, 7) float tensor to a (batch, 1) tensor of probabilities."""
        hidden = x
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4, self.linear5):
            hidden = self.gelu(layer(hidden))

        return self.sigmoid(self.linear6(hidden))

In [None]:
# Build the network and move its parameters to the selected device.
# The bare `.to(device)` expression returns the module, so the cell also displays it.
model = MLP()
model.to(device)

In [None]:
# Binary cross-entropy, computed on the sigmoid probabilities the model outputs.
criterion = nn.BCELoss()

# Plain SGD with a learning rate of 0.01.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# Plateau scheduler with its default settings; it only changes the learning rate
# when its step() method is called with the metric to monitor.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [None]:
epochs = 5
# Kept for any later cell that reads them; the loop below no longer needs them.
train_size = len(train_dataset)
valid_size = len(valid_dataset)


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one full pass over ``dataloader``.

    With an ``optimizer`` the pass trains the model (weights are updated); without
    one it is a gradient-free evaluation pass in eval mode.

    Returns:
        ``(mean_loss, num_batches, labels, preds)``: the mean loss over all samples
        of all batches, the number of batches processed, and two 1-D integer NumPy
        arrays with the true labels and the thresholded 0/1 predictions.
    """
    training = optimizer is not None
    model.train(training)

    label_chunks, pred_chunks = [], []
    loss_sum, sample_count, num_batches = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device).type(torch.float)
            batch_labels = batch_labels.to(device).type(torch.float)

            # Remove only the trailing output axis: a bare squeeze() would also
            # collapse a batch holding a single sample into a 0-d tensor.
            outputs = model(batch_inputs).squeeze(-1)
            loss = criterion(outputs, batch_labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() yields a plain float, so no autograd graph is kept alive by
            # the running total. Weighting by batch size makes the final value the
            # mean over samples even when the last batch is smaller.
            batch_count = batch_labels.size(0)
            loss_sum += loss.item() * batch_count
            sample_count += batch_count
            num_batches += 1

            label_chunks.append(batch_labels.cpu().numpy())
            # Decision threshold: probability <= 0.5 -> class 0, otherwise class 1.
            pred_chunks.append((outputs.detach() > 0.5).float().cpu().numpy())

    if not label_chunks:
        empty = np.array([], dtype=int)
        return 0.0, 0, empty, empty

    labels = np.concatenate(label_chunks).astype(int)
    preds = np.concatenate(pred_chunks).astype(int)
    return loss_sum / sample_count, num_batches, labels, preds


def _print_metrics(phase, epoch, labels, preds):
    """Print accuracy, precision, recall and the confusion matrix of one pass."""
    if len(labels) == 0:
        print(f'{phase} Epoch {epoch}: no samples')
        return

    # labels=[0, 1] forces a 2x2 matrix even if only one class occurs, so the
    # four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()

    correct_preds_count = tn + tp
    total_count         = tn + fp + fn + tp

    accuracy  = correct_preds_count / total_count
    # Guard the ratios: no positive prediction / no positive label gives a 0 denominator.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    print(f'{phase} Epoch {epoch}:\n    Accuracy: {correct_preds_count} out of {total_count} ({100 * accuracy:.2f}%)\n    Precision: {precision}\n    Recall: {recall}\n    Conf. Mat:\n    [{tp}, {fp}]\n    [{fn}, {tn}]')


for epoch in range(epochs):
    # ---- training pass ----
    train_loss, train_steps, labels, preds = _run_epoch(model, train_dataloader, criterion, device, optimizer)
    print(f'Train Epoch: {epoch+1}, step: {train_steps}, mean training loss: {train_loss}')
    _print_metrics('Train', epoch+1, labels, preds)
    print()

    # ---- validation pass (no weight updates, no gradients) ----
    valid_loss, valid_steps, labels, preds = _run_epoch(model, valid_dataloader, criterion, device)
    print(f'Valid Epoch: {epoch+1}, mini_batch: {valid_steps}, mean validation loss: {valid_loss}')
    _print_metrics('Valid', epoch+1, labels, preds)
    print('\n\n')

    # The plateau scheduler acts only when it is fed the monitored metric.
    scheduler.step(valid_loss)