In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime
import os

In [27]:
from sklearn.model_selection import train_test_split

def load_dummy_and_split(path, dummy_cols, split_ratio=0.2, destination_path='data/',
                         random_state=None):
    """Load a CSV, dummy-encode selected columns, and save a train/test split.

    Parameters
    ----------
    path : str
        CSV file to read. Its ``ID`` column is dropped, then every row that
        still contains a missing value is dropped.
    dummy_cols : list of str
        Columns replaced by indicator columns via ``pd.get_dummies``.
    split_ratio : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    destination_path : str, default 'data/'
        Directory that receives ``train.csv`` and ``test.csv``. It is created
        when it does not exist yet.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(dataset, train, test)``: the full encoded frame followed by its
        two partitions.

    Raises
    ------
    KeyError
        If the CSV has no ``ID`` column.
    """
    dataset = pd.read_csv(path).drop(['ID'], axis=1).dropna()

    # Dummy encode
    print(f'[INFO] Dummy encoding {dummy_cols}')
    dataset = pd.get_dummies(dataset, columns=dummy_cols)

    # Announce the split before performing it so the log matches the order of work.
    print(f'[INFO] Splitting data by split ratio: {split_ratio}')
    train, test = train_test_split(dataset, test_size=split_ratio,
                                   random_state=random_state)

    # to_csv does not create missing folders. An empty destination means the
    # current directory, which os.makedirs would reject, so skip it then.
    if destination_path:
        os.makedirs(destination_path, exist_ok=True)

    # The row index is written as the first CSV column (pandas default).
    train.to_csv(os.path.join(destination_path, 'train.csv'))
    print(f'[INFO] train dataset saved to: {destination_path}')
    test.to_csv(os.path.join(destination_path, 'test.csv'))
    print(f'[INFO] test dataset saved to: {destination_path}')
    return dataset, train, test


# Load, encode and split the thyroid data. The full encoded frame is kept in
# `dataset` alongside the `train` and `test` partitions.
dataset, train, test = load_dummy_and_split(
    'data/thyroid.csv',
    dummy_cols=['ref_src'],
    split_ratio=0.1,
)



[INFO] Dummy encoding ['ref_src']
[INFO] Splitting data by split ratio: 0.1
[INFO] train dataset saved to: data/
[INFO] test dataset saved to: data/


In [39]:
# Target column name (spelled the way the loaded frame spells it).
y_colname = 'ThryroidClass'

# Whole-dataset view: feature frame plus integer-encoded class labels.
X = dataset.drop(columns=y_colname)
print(X.columns)
Y = LabelEncoder().fit_transform(dataset[y_colname])

# Per-split features and binary targets: 1 where the class equals 'sick', else 0.
x_train = train.drop(columns=y_colname)
y_train = (train[y_colname] == 'sick').astype(int)
x_test = test.drop(columns=y_colname)
y_test = (test[y_colname] == 'sick').astype(int)




Index(['patient_age', 'patient_gender', 'presc_thyroxine',
       'queried_why_on_thyroxine', 'presc_anthyroid_meds', 'sick', 'pregnant',
       'thyroid_surgery', 'radioactive_iodine_therapyI131',
       'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor',
       'hypopituitarism', 'psych_condition', 'TSH_measured', 'TSH_reading',
       'T3_measured', 'T3_reading', 'T4_measured', 'T4_reading',
       'thyrox_util_rate_T4U_measured', 'thyrox_util_rate_T4U_reading',
       'FTI_measured', 'FTI_reading', 'ref_src_STMW', 'ref_src_SVHC',
       'ref_src_SVHD', 'ref_src_SVI', 'ref_src_other'],
      dtype='object')


In [48]:
class ThyroidDataset(Dataset):
    """Map-style dataset over a feature table ``X`` and its labels ``Y``.

    Parameters
    ----------
    X : object with a ``copy()`` method (e.g. a pandas DataFrame)
        Feature table; a copy is stored on ``self.x``.
    Y : sequence or pandas Series
        One label per sample; stored on ``self.y``. Its length defines the
        length of the dataset.
    """

    def __init__(self, X, Y):
        # Work on a copy so the dataset does not share the caller's object.
        self.x = X.copy()
        self.y = Y

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the label at *position* ``idx``.

        A pandas Series produced by ``train_test_split`` keeps the shuffled
        row labels of the source frame, so plain ``self.y[idx]`` would look
        ``idx`` up as a label and either raise ``KeyError`` or return a
        different row. ``.iloc`` is used whenever it is available so that
        ``idx`` is always treated as a position.
        """
        # NOTE(review): only the label is served; self.x is stored but never
        # returned. Confirm whether (features, label) was intended.
        labels = self.y
        if hasattr(labels, 'iloc'):
            return labels.iloc[idx]
        return labels[idx]


# Wrap the train and test splits as ThyroidDataset objects.
# NOTE(review): `test_df` holds a ThyroidDataset, not a DataFrame.
train_ds = ThyroidDataset(X=x_train, Y=y_train)
test_df = ThyroidDataset(X=x_test, Y=y_test)



2475


In [50]:
# Make device GPU compatible
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Evaluate once so the cell output shows which device was selected.
get_default_device()

device(type='cuda')

In [51]:
def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to ``device``.

    Lists and tuples are processed element by element and always come back
    as a list; anything else is moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved


class DeviceDataLoader():
    """Thin wrapper that moves every batch of a dataloader to a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        """Lazily yield each batch of the wrapped loader after moving it to the device."""
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Return the number of batches in the wrapped loader."""
        return len(self.dl)