# CNN_Softmax - CS 598 DLH - Reproducable Paper Final Project

This contains the code required for the paper titled "A Novel Deep Similarity Learning Approach to Electronic Health Records Data"


## Environment setup + Data pre-processing

In [None]:
#Environment setup
!pip3 install gdown
!mkdir data

#Imports
import torch
import torch.nn.functional as F
import gdown
import pandas as pd
from tabulate import tabulate
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.nn as nn
from tqdm import tqdm
import numpy as np

mkdir: cannot create directory ‘data’: File exists


In [None]:
#https://drive.google.com/uc?id=1uBv9j602LGyN43wvbQDpvka49rR9eNUQ&export=download&confirm=t
'''from google_drive_downloader import GoogleDriveDownloader as gdd
gdd.download_file_from_google_drive(file_id='1uBv9j602LGyN43wvbQDpvka49rR9eNUQ',
                                    dest_path='./data/orbda.csv',
                                    unzip=True,
                                    showsize=True,
                                    overwrite=True)'''


url = "https://drive.google.com/uc?id=1uBv9j602LGyN43wvbQDpvka49rR9eNUQ"
output = "./data/orbda.csv"
gdown.download(url, output, quiet=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv('data/orbda.csv', low_memory=False) #read in csv file (~800mb)

Downloading...
From: https://drive.google.com/uc?id=1uBv9j602LGyN43wvbQDpvka49rR9eNUQ
To: /content/data/orbda.csv
100%|██████████| 818M/818M [00:04<00:00, 190MB/s]


In [None]:
#Sanity checks for pandas import + read_csv
#print(df.columns)
#print(sorted(df['ap_cidpri'].unique()))
#df.head()
#print(df['id_'].nunique())
#print(df['id_'].count())
#print(df['ap_coduni'].unique())
#print(df['ap_coduni'].count())

In [None]:
#Filtering out for specific values as outlined in Table 3
kidney_codes = ['E10 ', 'E14 ', 'I10 ', 'I120', 'N039', 'N088', 'N083', 'N180', 'N188', 'N189']
df = df.loc[df['ap_cidpri'].isin(kidney_codes)]

In [None]:
#Filtering out columns to match input features
#Features: an_hcv, an_hiv, an_hbsag, ap_nuidade, ap_coduni, owner_id, ap_pripal, ap_motsai, estado, an_tru, an_intfis, vol
column_features = ['an_hcv', 'an_hiv', 'an_hbsag', 'ap_nuidade', 'ap_coduni', 'ap_pripal', 'ap_motsai', 'estado', 'an_tru', 'an_intfis', 'ap_cidpri']
df_input_features = df.filter(items=column_features)
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))

+----+----------+----------+------------+--------------+------------------+-------------+-------------+----------+----------+-------------+-------------+
|    | an_hcv   | an_hiv   | an_hbsag   |   ap_nuidade | ap_coduni        |   ap_pripal |   ap_motsai | estado   |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+------------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 | N        | N        | N          |           25 | a1042cb8e9265d4e |   305010107 |          21 | MG       |     628  |          03 | N180        |
|  1 | N        | N        | N          |           47 | 69ba059ff91532d3 |   305010107 |          21 | RJ       |     0065 |          00 | N180        |
|  2 | N        | N        | N          |           15 | a2b516fa1aa3cce0 |   305010107 |          21 | PR       |     0    |          01 | N180        |
|  3 | N        | N        | N          |           37 | 72f15d07e504318f | 

In [None]:
#Converting ap_coduni values into int via dict
coduni_dict = {}
unique_coduni = df_input_features['ap_coduni'].unique()
for i in range(0, len(unique_coduni)):
  coduni_dict[unique_coduni[i]] = i
df_input_features = df_input_features.replace({'ap_coduni': coduni_dict})
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))

+----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------+
|    | an_hcv   | an_hiv   | an_hbsag   |   ap_nuidade |   ap_coduni |   ap_pripal |   ap_motsai | estado   |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 | N        | N        | N          |           25 |           0 |   305010107 |          21 | MG       |     628  |          03 | N180        |
|  1 | N        | N        | N          |           47 |           1 |   305010107 |          21 | RJ       |     0065 |          00 | N180        |
|  2 | N        | N        | N          |           15 |           2 |   305010107 |          21 | PR       |     0    |          01 | N180        |
|  3 | N        | N        | N          |           37 |           3 |   305010107 |          21 | RJ     

In [None]:
#Converting an_hcv, an_hiv, an_hbsag to int values from dict
yes_no_dict = {'N': 0, "P": 1}
df_input_features = df_input_features.replace({'an_hcv': yes_no_dict, 'an_hiv': yes_no_dict, 'an_hbsag': yes_no_dict})
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))

+----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------+
|    |   an_hcv |   an_hiv |   an_hbsag |   ap_nuidade |   ap_coduni |   ap_pripal |   ap_motsai | estado   |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 |        0 |        0 |          0 |           25 |           0 |   305010107 |          21 | MG       |     628  |          03 | N180        |
|  1 |        0 |        0 |          0 |           47 |           1 |   305010107 |          21 | RJ       |     0065 |          00 | N180        |
|  2 |        0 |        0 |          0 |           15 |           2 |   305010107 |          21 | PR       |     0    |          01 | N180        |
|  3 |        0 |        0 |          0 |           37 |           3 |   305010107 |          21 | RJ     

In [None]:
#Converting estado to int values from dict
estado_dict = {}
unique_estado = df_input_features['estado'].unique()
for i in range(0, len(unique_estado)):
  estado_dict[unique_estado[i]] = i
df_input_features = df_input_features.replace({'estado': estado_dict})
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))

+----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------+
|    |   an_hcv |   an_hiv |   an_hbsag |   ap_nuidade |   ap_coduni |   ap_pripal |   ap_motsai |   estado |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 |        0 |        0 |          0 |           25 |           0 |   305010107 |          21 |        0 |     628  |          03 | N180        |
|  1 |        0 |        0 |          0 |           47 |           1 |   305010107 |          21 |        1 |     0065 |          00 | N180        |
|  2 |        0 |        0 |          0 |           15 |           2 |   305010107 |          21 |        2 |     0    |          01 | N180        |
|  3 |        0 |        0 |          0 |           37 |           3 |   305010107 |          21 |        

In [None]:
#Converting an_tru to int values from dict 
df_input_features['an_tru'] = df_input_features['an_tru'].str.strip()
df_input_features['an_tru'] = df_input_features['an_tru'].str.extract('(\d+)', expand=False)


In [None]:
#Converting an_intfis to int values from dict 

intfis_dict = {}
unique_intfis = df_input_features['an_intfis'].unique()
for i in range(0, len(unique_intfis)):
  intfis_dict[unique_intfis[i]] = i
df_input_features = df_input_features.replace({'an_intfis': intfis_dict})
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))

+----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------+
|    |   an_hcv |   an_hiv |   an_hbsag |   ap_nuidade |   ap_coduni |   ap_pripal |   ap_motsai |   estado |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 |        0 |        0 |          0 |           25 |           0 |   305010107 |          21 |        0 |      628 |           0 | N180        |
|  1 |        0 |        0 |          0 |           47 |           1 |   305010107 |          21 |        1 |     0065 |           1 | N180        |
|  2 |        0 |        0 |          0 |           15 |           2 |   305010107 |          21 |        2 |        0 |           2 | N180        |
|  3 |        0 |        0 |          0 |           37 |           3 |   305010107 |          21 |        

In [None]:
#Converting pandas object types into integer
df_input_features = df_input_features.dropna()
df_input_features['an_hcv'] = df_input_features['an_hcv'].astype('str').astype('float')
df_input_features['an_hiv'] = df_input_features['an_hiv'].astype('str').astype('float')
df_input_features['an_hbsag'] = df_input_features['an_hbsag'].astype('str').astype('float')
df_input_features['an_tru'] = df_input_features['an_tru'].astype('str').astype('float')
df_input_features['an_intfis'] = df_input_features['an_intfis'].astype('str').astype('float')

In [None]:
#Sanity checks post filtering
#display(df.groupby('ap_cidpri')['ap_cidpri'].transform('count'))
print(tabulate(df['ap_cidpri'].value_counts().to_frame(), headers = 'keys', tablefmt = 'psql'))
print(tabulate(df_input_features.head(), headers = 'keys', tablefmt = 'psql'))
print(df_input_features.dtypes)

+------+-----------------+
|      |       ap_cidpri |
|------+-----------------|
| N180 |     3.94376e+06 |
| N189 | 38160           |
| I120 | 19460           |
| N039 |  7349           |
| I10  |  5979           |
| N083 |  5782           |
| N188 |  4670           |
| N088 |  1698           |
| E10  |   190           |
| E14  |   168           |
+------+-----------------+
+----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------+
|    |   an_hcv |   an_hiv |   an_hbsag |   ap_nuidade |   ap_coduni |   ap_pripal |   ap_motsai |   estado |   an_tru |   an_intfis | ap_cidpri   |
|----+----------+----------+------------+--------------+-------------+-------------+-------------+----------+----------+-------------+-------------|
|  0 |        0 |        0 |          0 |           25 |           0 |   305010107 |          21 |        0 |      628 |           0 | N180        |
|  1 |        0 |        0

In [None]:
#Converting categorical lables to one hot encoding
kidney_codes_dict = {'E10 ': 1, 'E14 ': 2, 'I10 ': 3, 'I120': 4, 'N039': 4, 'N088': 5, 'N083': 6, 'N180': 7, 'N188': 8, 'N189': 9}

cidpri_list = df_input_features['ap_cidpri'].tolist()
for i in range(0, len(cidpri_list)):
  cidpri_list[i] = kidney_codes_dict[cidpri_list[i]]

tensor_cidpri = torch.tensor(cidpri_list)
cidpri_one_hot = F.one_hot(tensor_cidpri)

#Drop cidpri column since we are now done with it
df_input_features = df_input_features.drop(['ap_cidpri'], axis=1)

In [None]:
#Dataloader calss
class NephrologyDataset(Dataset):
  def __init__(self, input_features, categorical_features):
    x = input_features.values
    y = categorical_features
    print(y)

    self.x_train = torch.tensor(x, device=device)
    self.y_train = y.to(device)#torch.tensor(y, device=device)

  def __len__(self):
    return len(self.y_train)
  
  def __getitem__(self, idx):
    return self.x_train[idx], self.y_train[idx]

In [77]:
#Dataloader initialization
dataloader = NephrologyDataset(df_input_features[:3190570], cidpri_one_hot[:3190570])
train_loader = DataLoader(dataloader,batch_size=10,shuffle=True)

tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])


In [78]:
print(len(cidpri_one_hot))

3190573


In [79]:
#CNN_Softmax model definition
class CNN_Softmax(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_a = torch.nn.Conv1d(in_channels=10, 
                                      out_channels=20, 
                                      kernel_size=1,
                                      stride=1)
        self.conv_b = torch.nn.Conv1d(in_channels=20, 
                                      out_channels=10, 
                                      kernel_size=1,
                                      stride=1)
        self.maxpool_a = torch.nn.MaxPool1d(kernel_size=1,
                                            stride=1)
        self.maxpool_b = torch.nn.MaxPool1d(kernel_size=1,
                                            stride=1)
        self.fc = torch.nn.Linear(in_features=10, 
                                  out_features=10)

    def forward(self, x):
        x = self.conv_a(x)
        x = F.relu(x)
        x = self.maxpool_a(x)
        #print(x.shape)
        #print('Finished first conv')
        x = self.conv_b(x)
        x = F.relu(x)
        x = self.maxpool_b(x)
        #print(x.shape)
        #print('Finished second conv')
        x = self.fc(x)
        x = F.relu(x)
        #print(x.shape)
        return x

In [None]:
#Train
import torch.optim as optim

print(device)
net = CNN_Softmax().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)


loss_values = []
for epoch in tqdm(range(20)):  # loop over the dataset multiple times

    running_loss = []
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.float()
        labels = labels.float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        #print(inputs.shape)
        outputs = net(inputs)
        loss = criterion(outputs, 
                         labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss.append(loss.item())
        #if i % 2000 == 1999:    # print every 2000 mini-batches
        #    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        #    running_loss = 0.0

    loss_values.append(np.mean(running_loss))


print('Finished Training')
print(loss_values)

cuda


  5%|▌         | 1/20 [09:54<3:08:11, 594.29s/it]

In [None]:
#Graphs + Analysis
plt.plot(np.squeeze(loss_values[1:]))