In [1]:

import os
from convnet import ConvNet
import cv2

import pandas as pd

import numpy as np
import torch


from CustomDataset import CustomTensorDataset
from torch.utils.data import DataLoader


from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from caxton_model.network_module import ParametersClassifier


import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from lightning_model import LightningModel

from utils import get_images_and_targets

import torch.multiprocessing
# Use the file-system strategy for sharing tensors between processes
# (presumably to avoid file-descriptor exhaustion with the multi-worker
# DataLoaders used below -- TODO confirm the original motivation).
torch.multiprocessing.set_sharing_strategy('file_system')


# Seed every RNG this notebook relies on. Seeding NumPy alone is not enough:
# DataLoader shuffling and the model's weight initialisation draw from torch's
# generator, so without torch.manual_seed two runs would differ.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Data locations. NOTE(review): ROOT_DATA_PATH is an absolute, machine-specific
# path; adjust it when running elsewhere.
ROOT_DATA_PATH = '/s/babbage/e/nobackup/nkrishna/m3x/others/data/'
SAVING_OUTPUTS = './'
train_set = ROOT_DATA_PATH + 'train.csv'
test_set = ROOT_DATA_PATH + 'test.csv'
images = ROOT_DATA_PATH + 'images/'
# Hugging Face model id. It is used to load the feature extractor and is also
# embedded in checkpoint, logger, and result file names further down.
MODEL_NAME = 'microsoft/resnet-50'


# HYPERPARAMETERS
LEARNING_RATE = 1e-03
EPOCHS = 15
BATCH_SIZE = 100

# Per-layer settings forwarded to the ConvNet constructor in the training and
# inference cells (one entry per layer; exact semantics are defined in convnet.py).
n_hiddens_per_conv_layer = [10, 10, 10]
n_hiddens_per_fc_layer = [10, 10, 10]
patch_size_per_conv_layer = [50, 10, 5]
stride_per_conv_layer = [7, 3, 1]

In [2]:
# Feature extractor published alongside MODEL_NAME; it is handed to
# get_images_and_targets in the training and inference cells.
image_processor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Training label table, read from the CSV configured in the first cell.
train_labels = pd.read_csv(filepath_or_buffer=train_set)

    


In [5]:
# Training for images greater than cutoff.
# NOTE(review): the cutoff is applied inside get_images_and_targets via
# lesser=False; what it measures is not visible from this notebook.
trainX, trainY = get_images_and_targets(train_labels, images, image_processor, lesser=False)

dataset_train = CustomTensorDataset(trainX, trainY)
trainloader = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=6)

model_greater = ConvNet(224*224, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, 2, patch_size_per_conv_layer, stride_per_conv_layer, learning_rate=LEARNING_RATE)

print("model loaded")

# Create the checkpoint directory. exist_ok=True makes this idempotent and
# avoids the check-then-create race of os.path.exists + os.makedirs.
model_dir = '{}{}/'.format(SAVING_OUTPUTS, 'output/model')
os.makedirs(model_dir, exist_ok=True)


# Keep the five checkpoints with the highest 'train_f1'.
# NOTE: MODEL_NAME contains a '/', so the checkpoint files end up in a
# 'microsoft/' sub-directory of model_dir; the inference cell's
# CHECKPOINT_MODEL_PATH relies on that layout.
checkpoint_callback = ModelCheckpoint(
    monitor='train_f1',
    dirpath=model_dir,
    filename='{}-{}-{}-{}'.format(MODEL_NAME, '3dprint_convnet', LEARNING_RATE, BATCH_SIZE)+'-{epoch:02d}-{train_f1:.4f}_greater_than_cutoff',
    save_top_k=5,
    mode='max',
)
# Only a training loader is passed to fit() below, so both checkpoint
# selection and early stopping track a training metric, not a validation one.
early_stopping = EarlyStopping(monitor="train_f1", min_delta=0.00, patience=10, verbose=False, mode="max")


logger = TensorBoardLogger('lightning_logs', name=f'{MODEL_NAME}_convnet_lr_{LEARNING_RATE}_greater_than_cutoff_{EPOCHS}')


# Single-GPU (device 0) training with 16-bit mixed precision.
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    precision=16,
    accelerator='gpu', devices=[0],
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback, early_stopping],
    logger=logger,
)

print('Start Training...')
trainer.fit(model_greater, trainloader)


print("training done")




100%|██████████| 61791/61791 [22:01<00:00, 46.75it/s]
Using 16bit None Automatic Mixed Precision (AMP)


model loaded


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: lightning_logs/microsoft/resnet-50_convnet_lr_0.001_greater_than_cutoff_15
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Start Training...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type             | Params
-------------------------------------------------
0 | conv_layers | ModuleList       | 87.5 K
1 | fc_layers   | ModuleList       | 652   
2 | criterion   | CrossEntropyLoss | 0     
3 | softmax     | Softmax          | 0     
-------------------------------------------------
88.2 K    Trainable params
0         Non-trainable params
88.2 K    Total params
0.176     Total estimated model params size (MB)
2023-04-03 15:48:57.959807: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Training: 0it [00:00, ?it/s]

training done


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [6]:
from tqdm import tqdm


# Checkpoint written by the "greater than cutoff" training cell. The
# 'microsoft/' path component comes from the '/' inside MODEL_NAME.
CHECKPOINT_MODEL_PATH='./output/model/microsoft/resnet-50-3dprint_convnet-0.001-100-epoch=02-train_f1=0.9727_greater_than_cutoff.ckpt'

test_labels = pd.read_csv(test_set)

# With test=True the helper's second return value is used as image paths, not targets.
testX, imgpathsY = get_images_and_targets(test_labels, images, image_processor, test=True, lesser=False)
# The dataset wants a target tensor; all-ones placeholders are supplied and
# ignored during prediction.
dataset_test = CustomTensorDataset(testX, torch.ones(len(imgpathsY)).reshape(-1,1))
# shuffle=False keeps predictions aligned row-for-row with imgpathsY.
testloader = DataLoader(dataset=dataset_test, batch_size=100, shuffle=False, num_workers=6)

# load_from_checkpoint is a classmethod: call it on the class rather than on a
# freshly constructed (and immediately discarded) instance. Newer Lightning
# releases reject the instance-call form.
model_greater = ConvNet.load_from_checkpoint(
    CHECKPOINT_MODEL_PATH,
    n_inputs=224*224,
    n_hiddens_per_conv_layer=n_hiddens_per_conv_layer,
    n_hiddens_per_fc_layer=n_hiddens_per_fc_layer,
    n_outputs=2,
    patch_size_per_conv_layer=patch_size_per_conv_layer,
    stride_per_conv_layer=stride_per_conv_layer,
    learning_rate=LEARNING_RATE,
)

device = torch.device('cuda:0')
model_greater = model_greater.cuda(device).eval()

softmax = torch.nn.Softmax(dim=1)

pred_max = []
with torch.no_grad():

    # Iterating the loader directly (instead of enumerate) lets tqdm show the
    # total number of batches.
    for img_seq, _ in tqdm(testloader):

        img_seq = img_seq.cuda(device)

        logits = model_greater(img_seq)

        probs = softmax(logits)
        # Index of the most probable class, kept as an (N, 1) int tensor on CPU.
        preds = torch.max(probs, 1, keepdim=True)[1].int().cpu()

        pred_max.append(preds)


test_predictions = torch.vstack(pred_max)

# One prediction per image path, in loader order.
assert len(test_predictions) == len(imgpathsY), "prediction / image-path count mismatch"

# Build the frame column by column so the predictions stay integers; stacking
# them next to the path strings with np.hstack would coerce them to strings.
results_df = pd.DataFrame({
    'img_path': np.asarray(imgpathsY).reshape(-1),
    'has_under_extrusion': test_predictions.numpy().reshape(-1),
})
saving_name=MODEL_NAME.replace('/','_')
results_df.to_csv(SAVING_OUTPUTS+f'results_{saving_name}_convnet_greater_than_cutoff.csv', index=False)


100%|██████████| 7831/7831 [02:57<00:00, 44.24it/s]
79it [00:03, 20.69it/s]


In [3]:
# Training for images lesser than cutoff.
# NOTE(review): the cutoff is applied inside get_images_and_targets via
# lesser=True; what it measures is not visible from this notebook.
trainX, trainY = get_images_and_targets(train_labels, images, image_processor, lesser=True)

dataset_train = CustomTensorDataset(trainX, trainY)
trainloader = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=6)

model_lesser = ConvNet(224*224, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, 2, patch_size_per_conv_layer, stride_per_conv_layer, learning_rate=LEARNING_RATE)

print("model loaded")

# Create the checkpoint directory. exist_ok=True makes this idempotent and
# avoids the check-then-create race of os.path.exists + os.makedirs.
model_dir = '{}{}/'.format(SAVING_OUTPUTS, 'output/model')
os.makedirs(model_dir, exist_ok=True)


# Keep the five checkpoints with the highest 'train_f1'.
# NOTE: MODEL_NAME contains a '/', so the checkpoint files end up in a
# 'microsoft/' sub-directory of model_dir; the inference cell's
# CHECKPOINT_MODEL_PATH relies on that layout.
checkpoint_callback = ModelCheckpoint(
    monitor='train_f1',
    dirpath=model_dir,
    filename='{}-{}-{}-{}'.format(MODEL_NAME, '3dprint_convnet', LEARNING_RATE, BATCH_SIZE)+'-{epoch:02d}-{train_f1:.4f}_lesser_than_cutoff',
    save_top_k=5,
    mode='max',
)
# Only a training loader is passed to fit() below, so both checkpoint
# selection and early stopping track a training metric, not a validation one.
early_stopping = EarlyStopping(monitor="train_f1", min_delta=0.00, patience=10, verbose=False, mode="max")


logger = TensorBoardLogger('lightning_logs', name=f'{MODEL_NAME}_convnet_lr_{LEARNING_RATE}_lesser_than_cutoff_{EPOCHS}')


# Single-GPU (device 0) training with 16-bit mixed precision.
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    precision=16,
    accelerator='gpu', devices=[0],
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback, early_stopping],
    logger=logger,
)

print('Start Training...')
trainer.fit(model_lesser, trainloader)


print("training done")




100%|██████████| 19269/19269 [08:33<00:00, 37.49it/s]
Using 16bit None Automatic Mixed Precision (AMP)


model loaded


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: lightning_logs/microsoft/resnet-50_convnet_lr_0.001_lesser_than_cutoff_15
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Start Training...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type             | Params
-------------------------------------------------
0 | conv_layers | ModuleList       | 87.5 K
1 | fc_layers   | ModuleList       | 652   
2 | criterion   | CrossEntropyLoss | 0     
3 | softmax     | Softmax          | 0     
-------------------------------------------------
88.2 K    Trainable params
0         Non-trainable params
88.2 K    Total params
0.176     Total estimated model params size (MB)
2023-04-03 16:58:40.578417: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Training: 0it [00:00, ?it/s]

training done


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [4]:
from tqdm import tqdm


# Checkpoint written by the "lesser than cutoff" training cell. The
# 'microsoft/' path component comes from the '/' inside MODEL_NAME.
CHECKPOINT_MODEL_PATH='./output/model/microsoft/resnet-50-3dprint_convnet-0.001-100-epoch=00-train_f1=0.8893_lesser_than_cutoff.ckpt'

test_labels = pd.read_csv(test_set)

# With test=True the helper's second return value is used as image paths, not targets.
testX, imgpathsY = get_images_and_targets(test_labels, images, image_processor, test=True, lesser=True)
# The dataset wants a target tensor; all-ones placeholders are supplied and
# ignored during prediction.
dataset_test = CustomTensorDataset(testX, torch.ones(len(imgpathsY)).reshape(-1,1))
# shuffle=False keeps predictions aligned row-for-row with imgpathsY.
testloader = DataLoader(dataset=dataset_test, batch_size=100, shuffle=False, num_workers=6)

# load_from_checkpoint is a classmethod: call it on the class rather than on a
# freshly constructed (and immediately discarded) instance. Newer Lightning
# releases reject the instance-call form.
model_lesser = ConvNet.load_from_checkpoint(
    CHECKPOINT_MODEL_PATH,
    n_inputs=224*224,
    n_hiddens_per_conv_layer=n_hiddens_per_conv_layer,
    n_hiddens_per_fc_layer=n_hiddens_per_fc_layer,
    n_outputs=2,
    patch_size_per_conv_layer=patch_size_per_conv_layer,
    stride_per_conv_layer=stride_per_conv_layer,
    learning_rate=LEARNING_RATE,
)

device = torch.device('cuda:0')
model_lesser = model_lesser.cuda(device).eval()

softmax = torch.nn.Softmax(dim=1)

pred_max = []
with torch.no_grad():

    # Iterating the loader directly (instead of enumerate) lets tqdm show the
    # total number of batches.
    for img_seq, _ in tqdm(testloader):

        img_seq = img_seq.cuda(device)

        logits = model_lesser(img_seq)

        probs = softmax(logits)
        # Index of the most probable class, kept as an (N, 1) int tensor on CPU.
        preds = torch.max(probs, 1, keepdim=True)[1].int().cpu()

        pred_max.append(preds)


test_predictions = torch.vstack(pred_max)

# One prediction per image path, in loader order.
assert len(test_predictions) == len(imgpathsY), "prediction / image-path count mismatch"

# Build the frame column by column so the predictions stay integers; stacking
# them next to the path strings with np.hstack would coerce them to strings.
results_df = pd.DataFrame({
    'img_path': np.asarray(imgpathsY).reshape(-1),
    'has_under_extrusion': test_predictions.numpy().reshape(-1),
})
saving_name=MODEL_NAME.replace('/','_')
results_df.to_csv(SAVING_OUTPUTS+f'results_{saving_name}_convnet_lesser_than_cutoff.csv', index=False)


