In [36]:
import pandas
import torch
import torch.nn as nn 
import torch.optim as optim
import torch.onnx
import numpy

In [37]:
# Load the dataset into a dataframe
dataframe = pandas.read_csv("../datasets/diabetes_likelihood.csv")

In [38]:
dataframe = pandas.get_dummies(dataframe, columns=['Sex', 'HighBP','HighChol','CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Diabetes_012'])

In [39]:
print(dataframe.iloc[[0],:32]) # inputs

   BMI  GenHlth  MentHlth  PhysHlth  Sex_0  Sex_1  HighBP_0  HighBP_1  \
0   40        5        18        15   True  False     False      True   

   HighChol_0  HighChol_1  ...  Veggies_0  Veggies_1  HvyAlcoholConsump_0  \
0       False        True  ...      False       True                 True   

   HvyAlcoholConsump_1  AnyHealthcare_0  AnyHealthcare_1  NoDocbcCost_0  \
0                False            False             True           True   

   NoDocbcCost_1  DiffWalk_0  DiffWalk_1  
0          False       False        True  

[1 rows x 32 columns]


In [40]:
print(dataframe.iloc[[0],32:]) # outputs

   Diabetes_012_0  Diabetes_012_1  Diabetes_012_2
0            True           False           False


In [41]:
# split dataset to train (70%) and test (30%)
train, test = numpy.split(dataframe.to_numpy(dtype="float32"), [int(.7*len(dataframe))])

In [42]:
# further split training and testing to their corresponding inputs and outputs
train_torch = torch.from_numpy(train)
test_torch = torch.from_numpy(test)

train_x = train_torch[:,:32]
train_y = train_torch[:,32:]
test_x = test_torch[:,:32]
test_y = test_torch[:,32:]

In [43]:
print(train_x[0])
print(train_y[0])

tensor([40.,  5., 18., 15.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
         1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,
         1.,  0.,  0.,  1.])
tensor([1., 0., 0.])


In [44]:
model = nn.Sequential(
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 3)
)

In [45]:
# loss function and optimizer
loss_function = nn.MSELoss()  # mean square error
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [46]:
# training parameters
n_epochs = 15000   # number of epochs to run

In [47]:
for index in range(n_epochs):
    prediction_y = model(train_x)
    step_loss = loss_function(prediction_y, train_y)
    optimizer.zero_grad()
    step_loss.backward()
    optimizer.step()
    print ('epoch [{}], Loss: {:.2f}'.format(index, step_loss.item()))

epoch [0], Loss: 1.15
epoch [1], Loss: 1.15
epoch [2], Loss: 1.15
epoch [3], Loss: 1.15
epoch [4], Loss: 1.15
epoch [5], Loss: 1.15
epoch [6], Loss: 1.15
epoch [7], Loss: 1.15
epoch [8], Loss: 1.14
epoch [9], Loss: 1.14
epoch [10], Loss: 1.14
epoch [11], Loss: 1.14
epoch [12], Loss: 1.14
epoch [13], Loss: 1.14
epoch [14], Loss: 1.14
epoch [15], Loss: 1.14
epoch [16], Loss: 1.14
epoch [17], Loss: 1.14
epoch [18], Loss: 1.14
epoch [19], Loss: 1.14
epoch [20], Loss: 1.14
epoch [21], Loss: 1.14
epoch [22], Loss: 1.14
epoch [23], Loss: 1.14
epoch [24], Loss: 1.14
epoch [25], Loss: 1.13
epoch [26], Loss: 1.13
epoch [27], Loss: 1.13
epoch [28], Loss: 1.13
epoch [29], Loss: 1.13
epoch [30], Loss: 1.13
epoch [31], Loss: 1.13
epoch [32], Loss: 1.13
epoch [33], Loss: 1.13
epoch [34], Loss: 1.13
epoch [35], Loss: 1.13
epoch [36], Loss: 1.13
epoch [37], Loss: 1.13
epoch [38], Loss: 1.13
epoch [39], Loss: 1.13
epoch [40], Loss: 1.13
epoch [41], Loss: 1.13
epoch [42], Loss: 1.13
epoch [43], Loss: 1.1

In [50]:
prediction_y = model(test_x)
print(prediction_y)
print(test_y)

tensor([[ 1.0173, -0.0258, -0.0366],
        [ 0.7418,  0.0306,  0.2088],
        [ 1.1005,  0.0213, -0.0492],
        ...,
        [ 1.0277,  0.0290,  0.0034],
        [ 0.8310,  0.0370,  0.1157],
        [ 0.7039,  0.0542,  0.2158]], grad_fn=<AddmmBackward0>)
tensor([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.]])


In [51]:
# Accuracy when matching the interpretation of inputs and outputs
prediction_boolean = prediction_y.detach().numpy() > 0.5
test_boolean = test_y.numpy() > 0.5
correct_boolean = (prediction_boolean == test_boolean).sum()
total_boolean = test_boolean.size
print('Match: {}, Total: {}, Rate: {:.2f}%'.format(correct_boolean, total_boolean, correct_boolean/total_boolean * 100))

Match: 204452, Total: 228312, Rate: 89.55%


In [52]:
# Accuracy when EXACT label basis
prediction_labeled = numpy.argmax(prediction_y.detach().numpy(), axis=-1)
test_labeled = numpy.argmax(test_y.numpy(), axis=-1)
correct_answers = (prediction_labeled == test_labeled).sum()
total = test_labeled.size
print('Exact: {}, Total: {}, Rate: {:.2f}%'.format(correct_answers, total, correct_answers/total * 100))

Exact: 64195, Total: 76104, Rate: 84.35%


In [54]:
model.eval()
torch.onnx.export(model, test_x[0], "../onnx/diabetes_likelihood_model.onnx", export_params=True)