In [1]:
import os

import torch
import pandas as pd

from dataset import get_train_valid, get_dataloaders
from utils import plot_training, n_p, get_count
from vgg16 import VGG16
from train import train_model, get_metrics

In [2]:
# Load the training case descriptions and the download metadata file
# (the metadata supplies the on-disk 'File Location' used during preprocessing).
train_csv_path = r'calc_case_description_train_set.csv'
metadata_csv_path = r'/data0/NIH-CXR14/images/CBIS_DDSM/Calc-Training_full_mammogram_images_1-doiJNLP-PrQ05L6k (1)/metadata.csv'

df_train_set = pd.read_csv(train_csv_path)
df_metadata = pd.read_csv(metadata_csv_path)

In [3]:
### Files preprocess
# NOTE: this cell overwrites `df_train_set` (it needs the raw 'image file path'
# column), so re-run the CSV-loading cell before executing it a second time.
# `df_metadata` is deliberately left untouched so the path construction below
# gives the same result on every run.

# pick file name and set labels
# The first '/'-separated component of 'image file path' is used as the key
# that is matched against 'Subject ID' in the metadata file.
df_train_set['image_file_name'] = df_train_set['image file path'].str.split('/').str[0]
# Every pathology starting with 'BENIGN' becomes 0, 'MALIGNANT' becomes 1.
df_train_set.loc[df_train_set['pathology'].str.startswith('BENIGN'), 'pathology'] = 0
df_train_set.loc[df_train_set['pathology'] == 'MALIGNANT', 'pathology'] = 1

# train dataset does not have right file paths, take it from metadata
local_path = r'/data0/NIH-CXR14/images/CBIS_DDSM/Calc-Training_full_mammogram_images_1-doiJNLP-PrQ05L6k (1)'
# Work on an explicit copy: assigning into a column slice of `df_metadata`
# raises SettingWithCopyWarning, and overwriting `df_metadata` itself would
# strip two more characters from 'File Location' on every re-run.
metadata_paths = df_metadata[['Subject ID', 'File Location']].copy()
# Drop the first two characters of the recorded location (presumably a leading
# './' -- TODO confirm against metadata.csv) and point at the DICOM file.
metadata_paths['File Location'] = (
    local_path + '/' + metadata_paths['File Location'].str[2:] + '/' + '1-1.dcm'
)
metadata_paths = metadata_paths.rename(columns={'Subject ID': 'image_file_name'})
df_train_set = df_train_set.merge(metadata_paths, on=['image_file_name'], how='left')

# drop rows where file location is not available
df_train_set = df_train_set.dropna(subset=['File Location'])
# subset required columns
df_train_set = df_train_set[['patient_id', 'image_file_name', 'File Location', 'pathology']]

In [4]:
### Dataloaders

# Split the prepared table into a training part and a validation part.
train_data, valid_data = get_train_valid(df_train_set)

# Build the loaders for both parts and record how many samples each part holds.
dataloaders = get_dataloaders(train_data, valid_data, batch_size=10)
dataset_sizes = {
    split_name: len(split)
    for split_name, split in (('train', train_data), ('valid', valid_data))
}

In [5]:
# prepare & run model
data_cat = ['train', 'valid']
split_data = {'train': train_data, 'valid': valid_data}

# tai = total abnormal images (label 1), tni = total normal images (label 0)
tai = {x: get_count(split_data[x], 1) for x in data_cat}
tni = {x: get_count(split_data[x], 0) for x in data_cat}

# Per split: Wt1 is the share of label-0 samples and Wt0 the share of label-1
# samples, so the rarer label receives the larger weight.
totals = {x: tni[x] + tai[x] for x in data_cat}
Wt1 = {x: n_p(tni[x] / totals[x]) for x in data_cat}
Wt0 = {x: n_p(tai[x] / totals[x]) for x in data_cat}

print('tai:', tai)
print('tni:', tni, '\n')
for caption, value in (
    ('Wt0 train:', Wt0['train']),
    ('Wt0 valid:', Wt0['valid']),
    ('Wt1 train:', Wt1['train']),
    ('Wt1 valid:', Wt1['valid']),
):
    print(caption, value)

tai: {'train': 443, 'valid': 101}
tni: {'train': 794, 'valid': 208} 

Wt0 train: tensor([0.3581], device='cuda:0')
Wt0 valid: tensor([0.3269], device='cuda:0')
Wt1 train: tensor([0.6419], device='cuda:0')
Wt1 valid: tensor([0.6731], device='cuda:0')


In [6]:
class Loss(torch.nn.Module):
    """Binary cross-entropy with a per-sample weight chosen by the target label.

    Args:
        Wt1: mapping from phase key to the weight applied where the target is 1.
        Wt0: mapping from phase key to the weight applied where the target is 0.
    """

    def __init__(self, Wt1, Wt0):
        super().__init__()
        self.Wt0 = Wt0
        self.Wt1 = Wt1

    def forward(self, inputs, targets, phase):
        """Return the weighted binary cross-entropy for one batch.

        Args:
            inputs: predictions passed straight to ``binary_cross_entropy``
                (the non-logits variant, so probabilities are expected).
            targets: target tensor; also used to blend the two weights.
            phase: key used to look up the weights in ``Wt1`` / ``Wt0``.

        Returns:
            The loss tensor produced by ``binary_cross_entropy`` with its
            default reduction.
        """
        weight_for_ones = self.Wt1[phase]
        weight_for_zeros = self.Wt0[phase]
        # targets == 1 picks weight_for_ones, targets == 0 picks weight_for_zeros.
        sample_weights = weight_for_ones * targets + weight_for_zeros * (1 - targets)
        return torch.nn.functional.binary_cross_entropy(
            inputs, targets, weight=sample_weights
        )
    
# VGG16 built with num_classes=1, moved to the GPU.
model = VGG16(num_classes=1).cuda()

# Label-weighted loss using the per-split weights computed earlier.
criterion = Loss(Wt1, Wt0)

# Adam over all model parameters; the scheduler lowers the learning rate when
# the monitored value has stopped decreasing (mode='min') beyond `patience`.
learning_rate = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=1,
    verbose=True,
)



In [None]:
# #### Train model
# train_model returns the model, which replaces the untrained one here.
num_epochs = 15
model = train_model(
    model,
    criterion,
    optimizer,
    dataloaders,
    scheduler,
    dataset_sizes,
    num_epochs=num_epochs,
)

Train batches: 118
Valid batches: 37 

Epoch 1/15
----------
train Loss: 0.0305 Acc: 0.6144
Confusion Matrix:
 [[441 296]
 [159 284]]
valid Loss: 0.0247 Acc: 0.7459
Confusion Matrix:
 [[215  50]
 [ 43  58]]
Time elapsed: 4m 30s

Epoch 2/15
----------
train Loss: 0.0294 Acc: 0.6441
Confusion Matrix:
 [[427 310]
 [110 333]]
valid Loss: 0.0234 Acc: 0.5683
Confusion Matrix:
 [[119 146]
 [ 12  89]]
Time elapsed: 8m 59s

Epoch 3/15
----------
train Loss: 0.0258 Acc: 0.7008
Confusion Matrix:
 [[463 274]
 [ 79 364]]
valid Loss: 0.0247 Acc: 0.7842
Confusion Matrix:
 [[215  50]
 [ 29  72]]
Time elapsed: 13m 30s

Epoch 4/15
----------
train Loss: 0.0256 Acc: 0.6992
Confusion Matrix:
 [[460 277]
 [ 78 365]]
valid Loss: 0.0244 Acc: 0.4973
Confusion Matrix:
 [[ 91 174]
 [ 10  91]]
Epoch 00004: reducing learning rate of group 0 to 1.0000e-05.
Time elapsed: 18m 5s

Epoch 5/15
----------
train Loss: 0.0227 Acc: 0.7364
Confusion Matrix:
 [[472 265]
 [ 46 397]]
valid Loss: 0.0236 Acc: 0.6557
Confusion Ma

In [None]:
# Persist only the learned weights (state_dict). torch.save does not create a
# missing parent directory, so make sure 'models/' exists first; otherwise the
# save fails and the result of the long training run is lost.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/model.pth')

In [12]:
# Evaluation
# Rebuild the architecture on the GPU and restore the weights saved after
# training.
model = VGG16(num_classes=1)
model = model.cuda()
# A freshly constructed module starts in training mode; switch to eval mode so
# layers such as dropout / batch-norm (if the network has them) behave
# deterministically while metrics are computed.
# NOTE(review): harmless if get_metrics already sets eval mode itself -- that
# function is not visible from this notebook.
model.eval()
# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(torch.load(r'models/model.pth'))

<All keys matched successfully>

In [13]:
# Report metrics for the reloaded model; per the recorded output of this cell
# it prints a confusion matrix plus the validation loss and accuracy.
get_metrics(model, criterion, dataloaders, dataset_sizes)

Confusion Matrix:
 [[175  33]
 [ 31  70]]
valid Loss: 0.2169 Acc: 0.7929
