In [1]:
# - preparing the master dataframe
from typing import List, Tuple
import os
import random
import numpy
import pandas
import matplotlib.pyplot as plt
import torch
import torch.utils.data.dataset
import torch.utils.data.dataloader


dataset_url = 'https://figshare.com/ndownloader/files/35249488'
# - getting the dataset
# Read the dataset from the local pickle cache when it exists; otherwise
# download the CSV from `dataset_url` once and write the cache for later runs.
cache_path = os.path.abspath('../resources/raw_data/dataset.pkl')
if os.path.isfile(cache_path):
    df = pandas.read_pickle(cache_path)
else:
    # 'Unnamed: 0' is presumably the index column saved with the CSV -- it is
    # dropped so it does not end up as a feature.
    df = pandas.read_csv(dataset_url).drop(columns=['Unnamed: 0'])
    # `to_pickle` does not create missing parent directories, so make sure the
    # cache folder exists first; otherwise the first run on a fresh checkout
    # downloads the data and then fails while writing the cache.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    df.to_pickle(cache_path)


# Build a one-row-per-patient demographics table: keep only the identifier and
# the two demographic columns, then take the first non-null Gender / Ethnic
# value recorded for each PatientID.
df_meta = (
    df[['PatientID', 'Gender', 'Ethnic']]
    .groupby('PatientID')
    .first()
    .reset_index()
)
df_meta.head()

Unnamed: 0,PatientID,Gender,Ethnic
0,0,1.0,4.0
1,1,1.0,3.0
2,2,1.0,4.0
3,3,1.0,4.0
4,4,1.0,3.0


# Dataset Split

Given the nature of the problem, I will use the patient ID as the item identifier, so that each patient ends up in exactly one of the two sets, and perform stratified splitting based on the key demographics that are available (Gender and Ethnicity).

If the other features indicate things such as "pre-existing conditions", it could be better to stratify on those as well (so that we do not end up with, for example, all of the most severely diseased patients in the test set).

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Patient-level split: 20% of the rows of `df_meta` go to the test set, with
# the joint (Gender, Ethnic) columns used as stratification labels.
# The fixed `random_state` makes the shuffle reproducible across runs.
stratification_labels = df_meta[['Gender', 'Ethnic']]
df_train_meta, df_test_meta = train_test_split(
    df_meta,
    test_size=0.2,
    random_state=29,
    shuffle=True,
    stratify=stratification_labels,
)

In [4]:
# Leakage guard: no PatientID may appear in both the train and the test split.
assert not (
    set(df_train_meta['PatientID'].unique())
    & set(df_test_meta['PatientID'].unique())
)

In [7]:
import pickle

# Persist both halves of the split together as a (train, test) tuple so that
# other notebooks can reuse exactly the same patient assignment.
split_path = os.path.abspath('../resources/train_test_split.pkl')
with open(split_path, 'wb') as split_file:
    pickle.dump((df_train_meta, df_test_meta), split_file)