In [27]:
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Load the diabetes CSV into a DataFrame; the bare `data` expression below
# makes the notebook display the (truncated) frame as the cell output.
data = pd.read_csv('dataset/diabetes.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Count missing (NaN) entries in each column
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


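`BloodPressure` has a minimum of 0, which is not physiologically possible; the same is true of `Glucose`, `SkinThickness`, `Insulin` and `BMI`. These zeros indicate that the data contains invalid entries, most likely missing measurements recorded as 0.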

In [9]:
# Check class balance: number of rows for each value of the `Outcome` label
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [18]:
# Stratified 60/40 split: 60% of the rows become the training set, the
# remaining 40% form a hold-out group that is split again further down.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=7)

# Every column except the last is a feature; the last column is the label.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# n_splits=1, so the splitter yields exactly one (train, hold-out) index pair.
train_idx, test_idx = next(sss.split(x, y))
x_train, group_x = x[train_idx], x[test_idx]
y_train, group_y = y[train_idx], y[test_idx]

x_train.shape, group_x.shape

((460, 8), (308, 8))

In [19]:
# Split the 40% hold-out group evenly (and stratified) into test and
# validation sets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=7)

# split() yields positions relative to the arrays it was given, i.e. rows of
# group_x / group_y. They must therefore index the hold-out group; applying
# them to the full x / y would select unrelated rows, including training rows.
test_idx, val_idx = next(sss.split(group_x, group_y))
x_test, x_val = group_x[test_idx], group_x[val_idx]
y_test, y_val = group_y[test_idx], group_y[val_idx]

x_val.shape, x_test.shape

((154, 8), (154, 8))

In [21]:
# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to every split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test, x_valid = (scaler.transform(split) for split in (x_test, x_val))

In [23]:
# Convert the NumPy feature and label splits into torch tensors.
# NOTE(review): dtype is inferred from each array, so the scaled features are
# presumably float64 while torch.nn.Linear weights default to float32 --
# confirm a cast (e.g. .float()) happens before the forward pass.
x_train, x_test, x_valid = (torch.tensor(arr) for arr in (x_train, x_test, x_valid))
y_train, y_test, y_valid = (torch.tensor(arr) for arr in (y_train, y_test, y_val))

In [28]:
class Diabetes(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        x: Indexable collection of feature rows.
        y: Indexable collection of labels, aligned with ``x`` by position.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``x``."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.x[index]
        label = self.y[index]
        return features, label

In [29]:
# Wrap the training tensors in a dataset and serve them in batches of 16.
# Note: this rebinds `data`, which previously referred to the raw DataFrame.
# No `shuffle` argument is passed, so the DataLoader default (no shuffling,
# fixed batch order every epoch) applies.
data = Diabetes(x=x_train, y=y_train)
train_loader = DataLoader(data, batch_size=16)

In [30]:
class DiabetesPredictorNN(torch.nn.Module):
    """Fully connected network: two ReLU-activated hidden layers and a linear output.

    Args:
        n_inputs: Number of input features per sample.
        n_outputs: Size of the output layer.
        hidden_size: Width shared by both hidden layers.
    """

    def __init__(self, n_inputs, n_outputs, hidden_size):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.hidden_size = hidden_size

        # Layers are created in the same order they are applied in forward().
        self.fc1 = torch.nn.Linear(n_inputs, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, hidden_size)
        self.fc3 = torch.nn.Linear(hidden_size, n_outputs)

        self.relu = torch.nn.ReLU()

    def forward(self, features):
        """Return the raw output of the last linear layer (no final activation)."""
        hidden = self.relu(self.fc1(features))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)