# Modeling - First Pass

## Setup

### Installs, Packages, Seeds

In [1]:
%pip install efficientnet_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16446 sha256=b58ce8c31162b56d77fe6ed44061c9009ea1131cff4d50e193eaf9283ed31ac8
  Stored in directory: /root/.cache/pip/wheels/0e/cc/b2/49e74588263573ff778da58cc99b9c6349b496636a7e165be6
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [2]:
%matplotlib inline

# python libraries
import os
import cv2
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import seaborn as sns

# import imblearn
import logging
from tqdm import tqdm
from glob import glob
from PIL import Image
import ipywidgets

# pytorch libraries
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms
from efficientnet_pytorch import EfficientNet

# sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# google drive
from google.colab import drive # Connect colab to google drive

In [3]:
# Set Seeds
seed = 99 # go Aaron Judge!
np.random.seed(seed)
# Seed the default (CPU) torch generator as well: new layers are initialized on
# the CPU before being moved to the GPU, and DataLoader shuffling draws from it.
# Seeding only the CUDA generator leaves both of those unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


# Logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

### Mount to Google drive to connect to data

In [5]:
# Mount Google Drive at /drive, then make the shared capstone folder the working
# directory so the relative paths used later (e.g. './Data/') resolve.
drive.mount('/drive') 
%cd /drive/MyDrive/W210 - Capstone/

Mounted at /drive
/drive/.shortcut-targets-by-id/1oLqejM9KnDiIgUupEGkxGM3_vdqboxlI/W210 - Capstone


In [None]:
%ls Data/

In [6]:
data_dir = './Data/'

## Data

### Load in full_csv which has all of the image paths

In [128]:
dev_state = False   # True -> work on a random subsample instead of the full data
dev_sample = 15000  # upper bound on the number of rows in the subsample

data = (
    pd.read_csv(data_dir + 'full_data.csv', index_col = 0)
    .rename(columns = {'duplicated': 'duplicate', 'class':'label'})
)
# NOTE(review): the excluded label is spelled 'Autoimmue Disorder'; confirm this
# matches the spelling in full_data.csv, otherwise the filter drops nothing.
# .copy() makes the filtered frame independent so the column assignment below
# does not write to a slice of the original frame.
data = data[data['label'] != 'Autoimmue Disorder'].copy()
data['label_idx'] = pd.Categorical(data['label']).codes
# Cap n at the number of rows: sampling more rows than exist (without
# replacement) raises ValueError.
small_data = data.sample(n = min(dev_sample, len(data)), random_state = seed)





In [129]:
split = 'split_1'
# Promote the chosen split column to `dataset` and keep the original assignment
# under `split_0`. The guard makes the cell safe to re-run: repeating the rename
# would turn the promoted column into a second `split_0` and leave no `dataset`.
if split in data.columns:
    data = data.rename(columns = {'dataset':'split_0', split:'dataset'})
elif 'split_0' not in data.columns:
    # Neither the requested split nor evidence of an earlier rename is present.
    raise KeyError(f"split column '{split}' not found in data")

In [131]:
if dev_state:
  # Sample from the current `data` rather than reusing `small_data`: that frame
  # was drawn before the split columns were renamed, so its `dataset` column
  # still holds the original split instead of the one selected by `split`.
  # The same seed on the same rows selects the same sample.
  data = data.sample(n = min(dev_sample, len(data)), random_state = seed)

### Get our final train, validation, and test sets

In [132]:
# One frame per value of the `dataset` column, each re-indexed from 0.
train_df, val_df, test_df = (
    data.loc[data['dataset'] == part].reset_index(drop = True)
    for part in ('train', 'val', 'test')
)

In [133]:
# logging.info("df train"+str(df_train.shape))
# logging.info("df val"+str(df_val.shape))

## Modeling

### Model Setup

In [134]:
# feature_extract is a boolean that defines if we are finetuning or feature extracting. 
# If feature_extract = False, the model is finetuned and all model parameters are updated. 
# If feature_extract = True, only the last layer parameters are updated, the others remain fixed.
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter of ``model`` when ``feature_extracting`` is truthy.

    With a falsy flag the model is left untouched. Returns None; parameters are
    modified in place by setting ``requires_grad = False``.
    """
    if not feature_extracting:
        return
    for weight in model.parameters():
        weight.requires_grad = False

In [135]:
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    """Build a classifier whose output layer has ``num_classes`` units.

    Args:
        model_name: one of "resnet", "vgg", "densenet", "inception",
            "efficientnet".
        num_classes: size of the replacement output layer.
        feature_extract: if True, all backbone parameters are frozen and only
            the newly created output layer(s) remain trainable; if False every
            parameter stays trainable.
        use_pretrained: if True, start from pretrained weights; otherwise use a
            randomly initialized network.

    Returns:
        ``(model, input_size)`` where ``input_size`` is the image side length
        this function pairs with the chosen architecture.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    # Both are set by the matching branch below; each is model specific.
    model_ft = None
    input_size = 0

    if model_name == "resnet":
        # ResNet-50
        model_ft = models.resnet50(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Layers created after freezing default to requires_grad=True, so the
        # new head stays trainable.
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "vgg":
        # VGG11 with batch norm
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "densenet":
        # DenseNet-201
        model_ft = models.densenet201(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        # Inception v3: expects (299,299) sized images and has an auxiliary output
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxiliary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs,num_classes)
        input_size = 299

    elif model_name == 'efficientnet':
        efficientnet_variant = 'efficientnet-b7'
        # Honor use_pretrained like the torchvision branches do.
        if use_pretrained:
            model_ft = EfficientNet.from_pretrained(efficientnet_variant, num_classes=num_classes)
        else:
            model_ft = EfficientNet.from_name(efficientnet_variant, num_classes=num_classes)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Unlike the other branches, the num_classes head is created by the
        # constructor, i.e. before freezing. Re-enable it so feature extraction
        # still has something to train.
        for head_param in model_ft._fc.parameters():
            head_param.requires_grad = True
        input_size = 600

    else:
        # Raise instead of calling exit(): exit() would tear down the notebook
        # session rather than report a recoverable error.
        raise ValueError(
            f"Invalid model name {model_name!r}; expected one of "
            "'resnet', 'vgg', 'densenet', 'inception', 'efficientnet'"
        )
    return model_ft, input_size

Set up the GPU

In [136]:
# get_device_name() raises when no CUDA device is present, so only query it
# when CUDA is actually available.
if torch.cuda.is_available():
    print('GPU Type:', torch.cuda.get_device_name())
    print('GPU Count:', torch.cuda.device_count())
else:
    print('GPU Type:', None)
    print('GPU Count:', 0)

GPU Type: A100-SXM4-40GB
GPU Count: 1


In [137]:
len(data.label.unique())

5

In [138]:
# See notes in ln 498-500: https://github.com/abajaj25/MNIST-Skin-Cancer-with-Jetson/blob/main/notebooks/Final_Model/modeling-images-only-efficient.py
# resnet,vgg,densenet,inception
model_name = 'resnet' # 'efficientnet'
num_classes = len(data.label.unique())
feature_extract = False
# Initialize the model for this run
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)
# Define the device: use the first GPU when present, otherwise fall back to the
# CPU so the notebook still runs on a runtime without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Put the model on the device:
model = model_ft.to(device)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


Transforms

In [139]:
# torch.cuda.empty_cache()

In [140]:
# # using torch vision 'transforms'
# train_transform = transforms.Compose([
#                                         transforms.Resize((input_size,input_size)),
#                                     #   transforms.RandomHorizontalFlip(),
#                                     #   transforms.RandomVerticalFlip(),transforms.RandomRotation(20),
#                                     #   transforms.RandomCrop(size=(input_size,input_size)),
# #                                       transforms.RandomInvert(), transforms.RandomPosterize(bits=2),
# #                                       transforms.RandomAdjustSharpness(sharpness_factor=2),
# #                                       transforms.RandomSolarize(threshold=192.0),
# #                                       transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
#                                         transforms.ToTensor(), 
#                                     #   transforms.Normalize(norm_mean, norm_std)
#                                       ])
# # define the transformation of the val images.
# val_transform = transforms.Compose([
#                                     transforms.Resize((input_size,input_size)), 
#                                     transforms.ToTensor(),
#                                     # transforms.Normalize(norm_mean, norm_std)
#                                     ])

In [141]:
# using torch vision 'transforms'
# Per-channel normalization statistics shared by all splits.
# NOTE(review): presumably computed from this project's images; confirm the source.
norm_mean = [.541, .414, .382]
norm_std = [.256, .215, .209]
# Crop to the side length returned by initialize_model instead of a fixed 224,
# and scale the pre-crop resize with it (255 for a 224 crop, as before).
resize_size = round(input_size * 255 / 224)

train_transform = transforms.Compose([
                                        transforms.Resize(resize_size),
                                        transforms.CenterCrop(input_size),
                                    #   transforms.RandomHorizontalFlip(),
                                    #   transforms.RandomVerticalFlip(),transforms.RandomRotation(20),
                                    #   transforms.RandomCrop(size=(input_size,input_size)),
#                                       transforms.RandomInvert(), transforms.RandomPosterize(bits=2),
#                                       transforms.RandomAdjustSharpness(sharpness_factor=2),
#                                       transforms.RandomSolarize(threshold=192.0),
#                                       transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
                                        transforms.ToTensor(),
                                        transforms.Normalize(mean=norm_mean, std=norm_std)
                                      ])
# define the transformation of the val images. also used for test
val_transform = transforms.Compose([
                                        transforms.Resize(resize_size),
                                        transforms.CenterCrop(input_size),
                                        transforms.ToTensor(),
                                        transforms.Normalize(mean=norm_mean, std=norm_std)
                                    ])

Pytorch Data Loader

In [142]:
class HAM10000(Dataset):
    """Image dataset backed by a DataFrame with ``path`` and ``label_idx`` columns.

    Each item is ``(image, label)``: the image at ``path`` opened as RGB (and
    passed through ``transform`` when one is given) and ``label_idx`` as a
    scalar integer tensor.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Look the row up by position so the dataset does not depend on the
        # frame having a 0..n-1 index (label lookup fails after filtering).
        row = self.df.iloc[index]

        # Load data and get label. convert() returns a new in-memory image, so
        # the file handle can be closed as soon as the conversion is done.
        with Image.open(row['path']) as img:
            X = img.convert('RGB')

        y = torch.tensor(int(row['label_idx']))

        if self.transform:
            X = self.transform(X)

        return X, y

In [143]:
# Only selecting the columns we need from train and val

In [144]:
# Image.open(train['path'][0])

In [145]:
model_cols = ['path', 'label', 'label_idx']

In [146]:
# next(iter(train_loader))

In [147]:
# Keep only the modeling columns in each split and re-index from 0.
train_df, val_df, test_df = (
    frame.loc[:, model_cols].reset_index(drop = True)
    for frame in (train_df, val_df, test_df)
)

In [148]:
batch_size = 64
# Never request more workers than the runtime has CPUs: asking for 24 on a
# smaller machine makes DataLoader warn about oversubscription and can slow
# loading down.
num_workers = min(24, os.cpu_count() or 1)

# Only the training loader shuffles; val/test keep file order so predictions
# line up with the rows of their frames.
training_set = HAM10000(train_df, transform = train_transform)
train_loader = DataLoader(training_set, batch_size= batch_size,
                          shuffle=True, num_workers=num_workers)

val_set = HAM10000(val_df, transform = val_transform)
val_loader = DataLoader(val_set, batch_size= batch_size,
                          shuffle=False, num_workers=num_workers)

test_set = HAM10000(test_df, transform = val_transform)
test_loader = DataLoader(test_set, batch_size= batch_size,
                          shuffle=False, num_workers=num_workers)


  cpuset_checked))


In [149]:
# Set model params
# SGD over all of `model`'s parameters with only a learning rate specified.
optimizer = optim.SGD(model.parameters(), lr=0.002)
# Cross-entropy loss, moved to the same device as the model.
criterion = nn.CrossEntropyLoss().to(device)

Class to Track metrics during training

In [150]:
class AverageMeter(object):
    """Running weighted average of a stream of values.

    Attributes:
        val: the most recent value passed to ``update``.
        sum: sum of ``val * n`` over all updates.
        count: sum of ``n`` over all updates.
        avg: ``sum / count`` after the latest update (0 before any update).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        # Return all statistics to their initial state.
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # Record `val` as observed `n` times and refresh the running mean.
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

Define train function

In [151]:
total_loss_train, total_acc_train = [],[]
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch and return ``(mean loss, mean accuracy)``.

    Args:
        train_loader: iterable of ``(images, labels)`` batches; must support ``len``.
        model: network to optimize; switched to train mode.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.

    Returns:
        Per-sample mean loss and accuracy over the epoch. Each batch is weighted
        by its size so a smaller final batch does not skew the averages.

    Side effects:
        Every 100 batches the running averages are printed and appended to the
        module-level ``total_loss_train`` / ``total_acc_train`` lists.
    """
    model.train()
    train_loss = AverageMeter()
    train_acc = AverageMeter()

    for i, (images, labels) in enumerate(train_loader):
        N = images.size(0)
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        prediction = outputs.argmax(dim=1)
        n_correct = prediction.eq(labels).sum().item()
        # Weight by batch size: the meters then hold true per-sample means.
        train_acc.update(n_correct / N, N)
        train_loss.update(loss.item(), N)
        if (i + 1) % 100 == 0:
            print(f'[epoch {epoch}], [iter {i+1} of {len(train_loader)}],[train loss {train_loss.avg:.5f}], [train acc {train_acc.avg:.5f}]')
            total_loss_train.append(train_loss.avg)
            total_acc_train.append(train_acc.avg)
    return train_loss.avg, train_acc.avg

Define val function

In [152]:
def validate(val_loader, model, criterion, optimizer, epoch):
    """Evaluate ``model`` on ``val_loader`` and return ``(mean loss, mean accuracy)``.

    Args:
        val_loader: iterable of ``(images, labels)`` batches.
        model: network to evaluate; switched to eval mode.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: unused; kept so the signature mirrors ``train``.
        epoch: epoch number, used only in the printed summary.

    Returns:
        Per-sample mean loss and accuracy. Each batch is weighted by its size
        so a smaller final batch does not skew the averages.
    """
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    with torch.no_grad():
        for images, labels in val_loader:
            N = images.size(0)
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            prediction = outputs.argmax(dim=1)
            n_correct = prediction.eq(labels).sum().item()

            # Weight by batch size: the meters then hold true per-sample means.
            val_acc.update(n_correct / N, N)
            val_loss.update(criterion(outputs, labels).item(), N)

    print('------------------------------------------------------------')
    print(f'[epoch {epoch}], [val loss {val_loss.avg:.5f}], [val acc {val_acc.avg:.5f}]')
    print('------------------------------------------------------------')
    return val_loss.avg, val_acc.avg

In [153]:
def test(test_loader, model, loss_fn=None):
    """Evaluate ``model`` on ``test_loader`` and collect its predictions.

    Args:
        test_loader: iterable of ``(images, labels)`` batches.
        model: network to evaluate; switched to eval mode.
        loss_fn: loss called as ``loss_fn(outputs, labels)``. Defaults to the
            notebook-level ``criterion`` so existing two-argument calls behave
            as before.

    Returns:
        ``(mean loss, mean accuracy, predictions_out, true_labels)`` where the
        means are per-sample (each batch weighted by its size),
        ``predictions_out`` is a list with one ``(batch, 1)`` array of predicted
        class indices per batch, and ``true_labels`` is a list with one array
        of labels per batch.
    """
    if loss_fn is None:
        loss_fn = criterion

    model.eval()
    true_labels = []
    predictions_out = []

    test_loss = AverageMeter()
    test_acc = AverageMeter()
    with torch.no_grad():
        for images, labels in test_loader:
            N = images.size(0)
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            # keepdim=True keeps the (batch, 1) shape callers already flatten.
            prediction = outputs.max(1, keepdim=True)[1]

            # Save test output
            predictions_out.append(prediction.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

            n_correct = prediction.eq(labels.view_as(prediction)).sum().item()
            # Weight by batch size: the meters then hold true per-sample means.
            test_acc.update(n_correct / N, N)
            test_loss.update(loss_fn(outputs, labels).item(), N)

    print('------------------------------------------------------------')
    print(f'[test loss {test_loss.avg:.5f}], [test acc {test_acc.avg:.5f}]')
    print('------------------------------------------------------------')
    return test_loss.avg, test_acc.avg, predictions_out, true_labels

Run the model

## Train Sequence

In [None]:
# From here on `model_name` is the checkpoint/run name, not the architecture key
# that was passed to initialize_model.
model_name = 'resnet_split1_3e'
epoch_num = 3
best_val_acc = 0
best_val_loss, best_epoch = None, None
total_loss_val, total_acc_val = [],[]
# torch.save does not create missing directories.
os.makedirs('./Models', exist_ok=True)
print("Starting Training")
total_since = time.time()
for epoch in range(1, epoch_num+1):

    # timing
    since = time.time()

    loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
    loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
    total_loss_val.append(loss_val)
    total_acc_val.append(acc_val)

    # Checkpoint only when validation accuracy improves, and remember which
    # epoch produced the checkpoint.
    if acc_val > best_val_acc:
        best_val_acc = acc_val
        best_val_loss, best_epoch = loss_val, epoch
        torch.save(model, f'./Models/{model_name}.pt')

    time_elapsed = time.time() - since

    print('\nEPOCH', epoch, ":")
    print('*****************************************************')
    print('Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print(f'this epoch: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]')
    # Report the best epoch so far rather than relabelling the current one.
    if best_epoch is not None:
        print(f'best record: [epoch {best_epoch}], [val loss {best_val_loss:.5f}], [val acc {best_val_acc:.5f}]')
    print('*****************************************************')

total_time_elapsed = time.time() - total_since
print('\nTotal run Complete in {:.0f}m {:.0f}s'.format(total_time_elapsed // 60, total_time_elapsed % 60))

Starting Training
[epoch 1], [iter 100 of 658],[train loss 0.96839], [train acc 0.62906]
[epoch 1], [iter 200 of 658],[train loss 0.80823], [train acc 0.69703]
[epoch 1], [iter 300 of 658],[train loss 0.73678], [train acc 0.72479]
[epoch 1], [iter 400 of 658],[train loss 0.68474], [train acc 0.74422]
[epoch 1], [iter 500 of 658],[train loss 0.65381], [train acc 0.75503]
[epoch 1], [iter 600 of 658],[train loss 0.62744], [train acc 0.76375]


#### Previous Runs

In [None]:
# model_name = 'resnet_split3_3e'
# epoch_num = 3
# best_val_acc = 0
# total_loss_val, total_acc_val = [],[]
# print("Starting Training")
# total_since = time.time()
# for epoch in range(1, epoch_num+1):

#     # timing
#     since = time.time()

#     loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
#     loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
#     total_loss_val.append(loss_val)
#     total_acc_val.append(acc_val)

#     if acc_val > best_val_acc:
#         best_val_acc = acc_val
#         torch.save(model, f'./Models/{model_name}.pt')
    
#     time_elapsed = time.time() - since

#     print('\nEPOCH', epoch, ":")
#     print('*****************************************************')
#     print('Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
#     print(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]')
#     print('*****************************************************')
#     # print(logging.info('*****************************************************'))
#     # print(logging.info(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]'))
#     # print(logging.info('*****************************************************'))

# total_time_elapsed = time.time() - total_since
# print('\nTotal run Complete in {:.0f}m {:.0f}s'.format(total_time_elapsed // 60, total_time_elapsed % 60))

In [None]:
# model_name = 'resnet_15k_3e'
# epoch_num = 3
# best_val_acc = 0
# total_loss_val, total_acc_val = [],[]
# print("Starting Training")
# total_since = time.time()
# for epoch in range(1, epoch_num+1):

#     # timing
#     since = time.time()

#     loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
#     loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
#     total_loss_val.append(loss_val)
#     total_acc_val.append(acc_val)

#     if acc_val > best_val_acc:
#         best_val_acc = acc_val
#         torch.save(model, f'./Models/{model_name}.pt')
    
#     time_elapsed = time.time() - since

#     print('\nEPOCH', epoch, ":")
#     print('*****************************************************')
#     print('Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
#     print(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]')
#     print('*****************************************************')
#     # print(logging.info('*****************************************************'))
#     # print(logging.info(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]'))
#     # print(logging.info('*****************************************************'))

# total_time_elapsed = time.time() - total_since
# print('\nTotal run Complete in {:.0f}m {:.0f}s'.format(total_time_elapsed // 60, total_time_elapsed % 60))

In [None]:
# model_name = 'model_resnet_full'
# epoch_num = 5
# best_val_acc = 0
# total_loss_val, total_acc_val = [],[]
# print("Starting Training")
# total_since = time.time()
# for epoch in range(1, epoch_num+1):

#     # timing
#     since = time.time()

#     loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
#     loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
#     total_loss_val.append(loss_val)
#     total_acc_val.append(acc_val)

#     if acc_val > best_val_acc:
#         best_val_acc = acc_val
#         torch.save(model, f'./Models/{model_name}.pt')
    
#     time_elapsed = time.time() - since

#     print('\nEPOCH', epoch, ":")
#     print('*****************************************************')
#     print('Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
#     print(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]')
#     print('*****************************************************')
#     # print(logging.info('*****************************************************'))
#     # print(logging.info(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]'))
#     # print(logging.info('*****************************************************'))

# total_time_elapsed = time.time() - total_since
# print('\nTotal run Complete in {:.0f}m {:.0f}s'.format(total_time_elapsed // 60, total_time_elapsed % 60))

In [None]:
best_val_acc

In [None]:
# model_name = 'resnet_split2_3e'
# epoch_num = 3
# best_val_acc = 0
# total_loss_val, total_acc_val = [],[]
# print("Starting Training")
# total_since = time.time()
# for epoch in range(1, epoch_num+1):

#     # timing
#     since = time.time()

#     loss_train, acc_train = train(train_loader, model, criterion, optimizer, epoch)
#     loss_val, acc_val = validate(val_loader, model, criterion, optimizer, epoch)
#     total_loss_val.append(loss_val)
#     total_acc_val.append(acc_val)

#     if acc_val > best_val_acc:
#         best_val_acc = acc_val
#         torch.save(model, f'./Models/{model_name}.pt')
    
#     time_elapsed = time.time() - since

#     print('\nEPOCH', epoch, ":")
#     print('*****************************************************')
#     print('Complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
#     print(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]')
#     print('*****************************************************')
#     # print(logging.info('*****************************************************'))
#     # print(logging.info(f'best record: [epoch {epoch}], [val loss {loss_val:.5f}], [val acc {acc_val:.5f}]'))
#     # print(logging.info('*****************************************************'))

# total_time_elapsed = time.time() - total_since
# print('\nTotal run Complete in {:.0f}m {:.0f}s'.format(total_time_elapsed // 60, total_time_elapsed % 60))

## Test/Eval

In [None]:
# model_name = 'model_resnet_full'

In [None]:

# # model_in = torch.load(f'./Models/{model_name}.pth')
# model.load_state_dict(torch.load(f'./Models/{model_name}.pth'))

In [None]:
# model_name = 'resnet_split3_3e'
model_name

In [None]:
# torch.save(model, './Models/test.pt')
# model_in = torch.load('./Models/model_resnet_full.pt')
# The checkpoint is a whole pickled model object: only load files you trust,
# since unpickling can execute arbitrary code.
# map_location moves the weights onto the current `device`, so a model saved on
# a GPU still loads on a runtime without one.
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True,
# which rejects whole-model pickles; confirm against the installed version.
model_in = torch.load(f'./Models/{model_name}.pt', map_location=device)

In [None]:
loss_test, acc_test, preds, labs = test(test_loader, model_in)

In [None]:
# Flatten the per-batch arrays returned by test() into one array each.
true_labels = np.array([label for batch in labs for label in batch])
predictions = np.array([pred for batch in preds for pred in batch])

In [None]:
# Element-wise hit mask; its mean is the fraction of correct predictions.
correct = np.equal(true_labels, predictions.flatten())
accur = correct.mean()
accur

In [None]:
model_name

In [None]:
# Sorted array of the distinct label codes present in `data`.
labels_idx = np.sort(data.label_idx.unique())
# One row per distinct (label, label_idx) pair, ordered by code.
label_map = data[['label', 'label_idx']].drop_duplicates().sort_values('label_idx')
# Lookup from label code to label name.
label_dict = dict(zip(label_map.label_idx, label_map.label))

In [None]:
# Translate label codes to label names via label_dict.
# NOTE(review): `labs` and `preds` are rebound here from the per-batch lists
# returned by test() to Series of names, so the earlier cell that flattens those
# lists cannot be re-run after this one without calling test() again.
labs = pd.Series(true_labels).map(label_dict)
preds = pd.Series(predictions.flatten()).map(label_dict)
# Label names ordered by label code.
labels = pd.Series(labels_idx).map(label_dict)

In [None]:
# One row per test sample: true/predicted label names and their integer codes.
pred_df = pd.DataFrame({
    'lab': labs,
    'pred': preds,
    'lab_idx': pd.Series(true_labels),
    'pred_idx': pd.Series(predictions.flatten()),
})

In [None]:
pred_df.to_pickle(f'./Models/{model_name}_preds.pkl')

In [None]:
d = pd.read_pickle(f'./Models/{model_name}_preds.pkl')

In [None]:
labs

In [None]:
# c_matrix = confusion_matrix(labs, preds, normalize = 'true')
# plt.title("Confusion matrix")
# sns.heatmap(c_matrix, cmap='Blues', annot=True, xticklabels=labels, yticklabels=labels, fmt='.1%', cbar=True)
# plt.xlabel('predictions')
# plt.ylabel('true labels')
# plt.show()