# Modeling - First Pass

### Installs, Packages, Seeds

In [1]:
# Install the package that provides `efficientnet_pytorch.EfficientNet`, which the
# imports cell of this notebook loads. `%pip` installs into the environment of the
# running kernel.
# NOTE(review): no version is pinned, so the installed release may differ between runs.
%pip install efficientnet_pytorch

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Make sure PyTorch is installed in the environment of the running kernel.
# NOTE(review): no version is pinned, so the installed release may differ between runs.
%pip install torch
# torchvision is imported by this notebook but its install line is disabled:
# %pip install torchvision

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
%matplotlib inline

# python libraries
import os
import cv2
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import seaborn as sns
from itertools import combinations, product

# import imblearn
import logging
from tqdm import tqdm
from glob import glob
from PIL import Image
import ipywidgets

# pytorch libraries
import torch
from torch import optim,nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
from torchvision import models,transforms
from efficientnet_pytorch import EfficientNet

# sklearn libraries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# google drive
# from google.colab import drive # Connect colab to google drive

# custom modeling libraries
from build_model2 import initialize_model, load_split_data, build_loader, evaluate, train_model, model_scores, eval_model, add_results

# other
import importlib

In [4]:
# Re-execute the project helper module so that edits made to build_model2.py after
# the kernel started are picked up, then rebind the helper names this notebook uses
# to the freshly loaded definitions.
build_model2 = importlib.reload(importlib.import_module('build_model2'))

from build_model2 import (
    initialize_model,
    load_split_data,
    build_loader,
    evaluate,
    train_model,
    model_scores,
    eval_model,
    add_results,
)

# import matplotlib as mpl
# print(mpl.get_cachedir())

## Grid Components

In [5]:
# epoch_search = [50]
# # optim_search = ['SGD']
# model_search = ['resnet', 'vgg']
# lr_search = [.003, .0035]
# split_search = ['split_8', 'split_9', 'split_10']

# prods = list(product(epoch_search, model_search, lr_search, split_search))

# es = pd.Series(list(zip(*prods))[0], name = 'epochs', dtype = 'int')
# mods = pd.Series(list(zip(*prods))[1], name = 'pretrained_model')
# learns = pd.Series(list(zip(*prods))[2], name = 'learning_rate')
# splits = pd.Series(list(zip(*prods))[3], name = 'data_split')

# g_search = pd.concat([es, mods, learns, splits], axis = 1)

Load custom grid

In [6]:
# Read the hand-written grid of configurations (one row per model to train).
# `custom_grid` and `g_search` are two names for the same DataFrame object.
# The CSV's first column has no header, so it is read as a regular column and
# shows up as "Unnamed: 0" in the display below.
g_search = custom_grid = pd.read_csv(
    '/home/ec2-user/SageMaker/teledermatologyAI_capstone/model/gridsearch5/custom_grid.csv'
)
g_search

Unnamed: 0,epochs,pretrained_model,learning_rate,data_split
0,50,resnet,0.003,split_8
1,50,resnet,0.003,split_9
2,50,resnet,0.003,split_10
3,50,vgg,0.003,split_8
4,50,vgg,0.003,split_9
5,50,vgg,0.003,split_10


In [7]:
# Offset the grid's row labels so that the ids of this run continue after those
# of the prior run (the row label ends up in the model name and in the index of
# the results row written for each configuration).
#
# The shifted frame is derived from the untouched `custom_grid` instead of being
# shifted in place, so re-running this cell always yields the same labels rather
# than adding another 500 on every execution.
GS_INDEX_OFFSET = 500  # index to add to prior run

g_search = custom_grid.copy()
g_search.index = custom_grid.index + GS_INDEX_OFFSET

## Parameters

In [8]:
# Shared run configuration.
#
# `model_dict` is the single mutable settings object used by the grid-search loop:
# entries marked "NEEDS UPDATE" are placeholders that the loop overwrites for every
# configuration; the remaining entries are fixed for the whole run.
model_dict = {'pretrained_model': None,
              'epochs': None, # NEEDS UPDATE
              'home_directory': '/home/ec2-user/SageMaker/teledermatologyAI_capstone',
              'mod_directory': '/home/ec2-user/SageMaker/teledermatologyAI_capstone/model/gridsearch5',
              'csv_name': 'full_data_final_diverse',
              'split': 'split_3',
              'cl': 'label_0',
              'dev_state': False,
              'dev_sample': 15000,
              'seed': 99,
              'lr': .0035,                  # from prior gridsearch
              'batch_size':64,
              'num_workers':24,
              'transform':3,
              'results_file':'gridsearch_results',
              'model':None, # NEEDS UPDATE
              # Use the first GPU when one is present; fall back to the CPU so that
              # this cell still runs on a machine without CUDA.
              'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'), # NEEDS UPDATE
              'optimizer': None, # NEEDS UPDATE
              'criterion': None, # NEEDS UPDATE
              'tuned_model_name': None, # NEEDS UPDATE
              'show_val_cm': False,
             }

# Seed the random number generators.
# `torch.cuda.manual_seed` only seeds the current CUDA device; `torch.manual_seed`
# is required as well to seed PyTorch's default CPU generator.
np.random.seed(model_dict['seed'])
torch.manual_seed(model_dict['seed'])
torch.cuda.manual_seed(model_dict['seed'])

# Check GPU (querying the device name raises when no CUDA device is available,
# so only ask for it when there is one).
if torch.cuda.is_available():
    print('GPU Type:', torch.cuda.get_device_name())
    print('GPU Count:', torch.cuda.device_count())
else:
    print('GPU Type:', None)
    print('GPU Count:', 0)

# Project root; used by the following cell to change the working directory.
HOME = model_dict['home_directory']

GPU Type: Tesla T4
GPU Count: 1


In [9]:
cd $HOME

/home/ec2-user/SageMaker/teledermatologyAI_capstone


## Data

## Grid search loop (data for each split is loaded inside the loop)

In [None]:
# Gridsearch
#
# Trains and scores one model per row of `g_search`. For every row the loop
#   1. copies the row's hyper-parameters into the shared `model_dict`,
#   2. loads the data frames for the row's data split,
#   3. builds the pretrained network, the loss and the optimizer,
#   4. builds the train / val / test loaders,
#   5. calls `train_model`, pickles the returned predictions and appends one row
#      of validation scores to the results file through `add_results`.

# Optimizer constructors keyed by the name stored in model_dict['optimizer_name'].
OPTIMIZERS = {'SGD': optim.SGD, 'Adam': optim.Adam, 'AdamW': optim.AdamW}

# Per-channel statistics used to normalise every image (train, val and test).
NORM_MEAN = [.541, .414, .382]
NORM_STD = [.256, .215, .209]

for gs_id, row in g_search.iterrows():

    # --- hyper-parameters of this configuration ------------------------------
    model_dict['epochs'] = row['epochs']
    model_dict['pretrained_model'] = row['pretrained_model']
    model_dict['optimizer_name'] = 'SGD'  # the optimizer is fixed for this grid
    model_dict['lr'] = row['learning_rate']
    model_dict['split'] = row['data_split']
    model_dict['alias'] = gs_id
    model_dict['tuned_model_name'] = (
        f"{model_dict['pretrained_model']}_{model_dict['epochs']}e_"
        f"{model_dict['optimizer_name']}_{model_dict['split']}_diverse_GS{gs_id}"
    )
    out_dir = model_dict['mod_directory']
    run_name = model_dict['tuned_model_name']
    print(f"Learning Rate: {model_dict['lr']}")

    # Fail before any data or model is loaded if the optimizer is unknown.
    # Without this check an unrecognised name would leave the optimizer of the
    # previous iteration in `model_dict`, still bound to the previous model's
    # parameters, and the new model would silently never be updated.
    optimizer_name = model_dict['optimizer_name']
    if optimizer_name not in OPTIMIZERS:
        raise ValueError(
            f"Unsupported optimizer {optimizer_name!r}; expected one of {sorted(OPTIMIZERS)}"
        )

    # --- data for this split ---------------------------------------------------
    data, train, test, val = load_split_data(directory = model_dict['home_directory'],
                                             csv_name = model_dict['csv_name'],
                                             data_split = model_dict['split'],
                                             label = model_dict['cl'],
                                             mode = 'all',
                                             dev_state = model_dict['dev_state'],
                                             dev_sample = model_dict['dev_sample'],
                                             seed = model_dict['seed']
                                             )

    # Label dictionary for evaluation: label_idx -> label, ordered by label_idx
    label_map = data[['label', 'label_idx']].drop_duplicates().sort_values('label_idx')
    label_dict = dict(zip(label_map.label_idx, label_map['label']))
    model_dict['label_dict'] = label_dict

    # EfficientNet is trained with a much smaller batch than the other models
    model_dict['batch_size'] = 3 if model_dict['pretrained_model'] == 'efficientnet' else 64

    # --- model, loss, optimizer ------------------------------------------------
    model_ft, input_size = initialize_model(model_name = model_dict['pretrained_model'],
                                            num_classes = len(data.label.unique()),
                                            feature_extract = False,
                                            use_pretrained=True)

    # Move model to GPU (or whichever device the configuration names)
    model = model_ft.to(model_dict['device'])

    model_dict.update({
                       'model': model,
                       'criterion': nn.CrossEntropyLoss().to(model_dict['device']),
    })
    model_dict['optimizer'] = OPTIMIZERS[optimizer_name](model.parameters(), lr=model_dict['lr'])

    # Images are first resized to input_size / 0.85 and then cropped to input_size
    model_dict['resize'] = int(input_size/.85)

    # --- transforms ------------------------------------------------------------
    def deterministic_steps():
        """Return fresh resize + centre-crop steps shared by train and val/test."""
        return [
                transforms.Resize(model_dict['resize']), #255
                transforms.CenterCrop(input_size)
                ]

    def tensor_steps():
        """Return fresh tensor-conversion + normalisation steps."""
        return [transforms.ToTensor(),
                transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)]

    # Random augmentations, applied to the training images only.
    augmentation_steps = [
                          transforms.RandomHorizontalFlip(), # a
                          transforms.RandomVerticalFlip(), # b
                          transforms.RandomRotation(20), # c
                          # NOTE(review): the image was already centre-cropped to
                          # input_size, so this crop has the same size as its input.
                          transforms.RandomCrop(size=(input_size,input_size)), # d
#                           transforms.RandomInvert(), transforms.RandomPosterize(bits=2), # e
#                           transforms.RandomAdjustSharpness(sharpness_factor=2), # f
#                           transforms.RandomSolarize(threshold=192.0), # g
#                           transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1) # h
                          ]

    val_transform = transforms.Compose(deterministic_steps() + tensor_steps())
    train_transform = transforms.Compose(deterministic_steps() + augmentation_steps + tensor_steps())

    # --- loaders (validation and test share the augmentation-free transform) ----
    loader_kwargs = {'batch_size': model_dict['batch_size'],
                     'num_workers': model_dict['num_workers']}
    test_loader = build_loader(mode = 'test', df = test, transform = val_transform, **loader_kwargs)
    val_loader = build_loader(mode = 'val', df = val, transform = val_transform, **loader_kwargs)
    train_loader = build_loader(mode = 'train', df = train, transform = train_transform, **loader_kwargs)

    loaders = {'train_loader': train_loader,
               'val_loader': val_loader,
               'test_loader': test_loader}
    model_dict['loader'] = loaders

    # --- train -----------------------------------------------------------------
    pred_df, val_scores, tot_time = train_model(model_dict = model_dict)

    # `val_scores` must hold exactly these eleven values; precision and recall
    # are unpacked but not written to the results file.
    acc, f1, f2, f5, prec, rec, d_0, d_1, d_2, d_3, d_4 = val_scores

    # --- persist predictions and scores ------------------------------------------
    pred_df.to_pickle(f'{out_dir}/{run_name}_preds.pkl')

    col_dict = {
             'tuned_model': model_dict['tuned_model_name'],
             'transform': model_dict['transform'],
             'lr': model_dict['lr'],
             'pretrained_model': model_dict['pretrained_model'],
             'optimizer': model_dict['optimizer_name'],
             'epochs': model_dict['epochs'],
             'batch_size': model_dict['batch_size'],
             'workers': model_dict['num_workers'],
             'train_time': tot_time,
             'data_split': model_dict['split'],
             'label_set': model_dict['cl'],
             'accur': acc,
             'F1': f1,
             'F0.5': f5,
             'F2': f2,
             'benign_accur': d_0,
             'noncancerous_accur': d_1,
             'malignant_accur': d_2,
             'infection_accur': d_3,
             'unclassified_accur': d_4
    }

    # One results row per configuration, indexed by the grid id
    add_results(model_dict['results_file'], out_dir, pd.DataFrame(col_dict, index = [gs_id]))
    print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')

Learning Rate: 0.003


  data, train, test, val = load_split_data(directory = model_dict['home_directory'],


Starting Training resnet_50e_SGD_split_8_diverse_GS500
[epoch 1], [iter 100 of 283],[train loss 1.27063], [train acc 0.49578]
[epoch 1], [iter 200 of 283],[train loss 1.12201], [train acc 0.54977]
------------------------------------------------------------
[epoch 1], [val loss 0.82244], [val acc 0.65832]
------------------------------------------------------------

EPOCH 1 :
*****************************************************
Complete in 3m 16s
best record: [epoch 1], [val loss 0.82244], [val acc 0.65832]
*****************************************************
[epoch 2], [iter 100 of 283],[train loss 0.83903], [train acc 0.65766]
[epoch 2], [iter 200 of 283],[train loss 0.82795], [train acc 0.66266]
------------------------------------------------------------
[epoch 2], [val loss 0.75270], [val acc 0.69196]
------------------------------------------------------------

EPOCH 2 :
*****************************************************
Complete in 3m 34s
best record: [epoch 2], [val loss 0

In [None]:
# # # 
# pred_df = evaluate(model_name = 'gridsearch4/vgg_50e_SGD_split_10_GS220', 
#                    model_source = 'pt', 
#                    model_dict = model_dict, 
#                    label_dict = label_dict, 
#                    show_cm = True)

In [None]:
# new = pd.read_csv('full_data_final_not_diverse.csv', index_col = 0)

In [None]:
# new.iloc[0].path

In [None]:
# new[new.image_id == 'ISIC_0000000'].path[0]

In [None]:
# ipath = './Data/ISIC_2019/ISIC_2019_Training_Input/ISIC_0000000.jpg'
# image = mp_image.imread(ipath)
# # imshow(image)
# fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 15))
# axes.imshow(image)

In [None]:
# import matplotlib.pyplot.imshow