# Pneumothorax binary classification with Pytorch 

### Tasks
##### 1. Using a small subsample of the Pneumothorax dataset, fine-tune pre-trained PyTorch models to reach a sufficient level of accuracy on Pneumothorax classification
##### 2. Explore Pytorch API

##### Inspired by https://www.kaggle.com/abhishek and his book *Approaching (Almost) Any Machine Learning Problem* https://www.amazon.com/Approaching-Almost-Machine-Learning-Problem-ebook/dp/B089P13QHT

In [None]:
pip install pretrainedmodels

In [None]:
import albumentations
import torch
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pretrainedmodels
import torch.nn as nn
from PIL import Image
from PIL import ImageFile
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:
# Class that reads data in a map-style way
# To read more https://pytorch.org/docs/stable/data.html

class ClassificationDataset:
    """
    A general map-style image classification dataset.

    Images are read from disk lazily, one per ``__getitem__`` call, so only
    the paths and targets are kept in memory.
    """
    def __init__(self, image_paths, targets, resize=None, augmentations=None):
        """
         image_paths: list of paths to the image files
         targets: numpy array (or any indexable) with one target per image
         resize: tuple or None. When not None the image is resized; the tuple
                 is handed to PIL as (resize[1], resize[0]), and PIL expects
                 (width, height), so resize is read as (height, width)
         augmentations: albumentations-style callable or None. It is called as
                 augmentations(image=array) and must return a mapping with
                 an "image" key
        """
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        """
        Return the total number of samples in the dataset
        """
        return len(self.image_paths)

    def __getitem__(self, item):
        """
        Load the sample stored at index ``item``.

        Returns a dict with:
         "image": float tensor in channel-first (CHW) layout
         "targets": long tensor built from the target at the same index
        """
        # Open through a context manager so the file handle is released as
        # soon as the pixels are decoded; convert() returns an independent
        # in-memory RGB copy that stays valid after the file is closed.
        with Image.open(self.image_paths[item]) as source:
            image = source.convert("RGB")
        # target that belongs to this image
        targets = self.targets[item]
        # resize only when a size was requested
        if self.resize is not None:
            image = image.resize((self.resize[1], self.resize[0]), resample=Image.BILINEAR)
        # PIL image -> HWC numpy array
        image = np.array(image)
        # apply the augmentation pipeline, if any
        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]
        # pytorch expects CHW instead of HWC
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        # tensors of the image and its target
        return {"image": torch.tensor(image, dtype=torch.float), "targets": torch.tensor(targets, dtype=torch.long)}

In [None]:
# function to get a model
# more about pretrained models  https://pytorch.org/vision/stable/models.html

def load_model_from():
    """
    Build the classifier: an ImageNet-pretrained resnet18 from the
    pretrainedmodels package whose ``last_linear`` layer is swapped for a
    small fully connected head ending in a single output unit.
    """
    # pretrained backbone looked up by name in the pretrainedmodels package
    backbone = getattr(pretrainedmodels, "resnet18")(pretrained='imagenet')

    # replacement head: 512 input features -> 2048 hidden units -> 1 output
    head_layers = [
        nn.BatchNorm1d(512),  # https://pytorch.org/docs/master/generated/torch.nn.BatchNorm1d.html#batchnorm1d
        nn.Dropout(p=0.25),
        nn.Linear(in_features=512, out_features=2048),
        nn.ReLU(),
        nn.BatchNorm1d(2048, eps=1e-05, momentum=0.1),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=2048, out_features=1),
    ]
    backbone.last_linear = nn.Sequential(*head_layers)

    return backbone

# You could play with BatchNorm1d, Dropout

In [None]:
# Pick the device, set the paths, build the model, the data loaders and the optimizer

EPOCHS = 10  # number of epochs to run
NUM_WORKERS = 12  # number of worker processes used by each DataLoader
RANDOM_STATE = 11  # seed that makes the train/validation split reproducible

# directory that holds the images
data_path = "../input/pneumothorax-binary-classification-task/small_train_data_set/small_train_data_set"

# use cuda when it is available, otherwise fall back to the cpu
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# dataframe with image file names and targets
df = pd.read_csv("../input/pneumothorax-binary-classification-task/train_data.csv")
# new column holding the full path of every image
df["full_path_to_images"] = data_path + "/" + df["file_name"].values

# image paths and target values
images = df["full_path_to_images"].values.tolist()
targets = df["target"].values

# pretrained model with the custom head
model = load_model_from()
print(model)

# move model to device https://pytorch.org/docs/stable/notes/cuda.html
model.to(device)

# per-channel RGB mean and std of the imagenet dataset
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
# albumentations pipeline: normalization only
aug = albumentations.Compose(
    [
        albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
    ]
)

# stratified train/validation split of the data
train_images, valid_images, train_targets, valid_targets = train_test_split(
    images,
    targets,
    stratify=targets,
    random_state=RANDOM_STATE,
)

# training dataset and its shuffled loader
train_dataset = ClassificationDataset(
    image_paths=train_images,
    targets=train_targets,
    resize=(227, 227),
    augmentations=aug,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

# validation dataset and its loader (order preserved)
valid_dataset = ClassificationDataset(
    image_paths=valid_images,
    targets=valid_targets,
    resize=(227, 227),
    augmentations=aug,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# simple Adam optimizer over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [None]:
# Train and evaluate functions 

def train(data_loader, model, optimizer, device):
    """
    Train the model for one epoch with binary cross-entropy on logits.

     data_loader: iterable of batches; each batch is a dict with an "image"
                  tensor and a "targets" tensor
     model: pytorch model; its output must have the same shape as the targets
            reshaped to (batch, 1)
     optimizer: optimizer that updates the model parameters
     device: cuda/cpu

    Returns nothing; the model parameters are updated in place.
    """
    # set training mode
    model.train()
    # the loss module holds no per-batch state, so build it once per epoch
    # instead of once per batch
    criterion = nn.BCEWithLogitsLoss()
    # go over every batch of data in data loader
    for data in data_loader:
        # move inputs/targets to cuda/cpu device; targets become float
        # because BCEWithLogitsLoss needs floating point targets
        inputs = data["image"].to(device, dtype=torch.float)
        targets = data["targets"].to(device, dtype=torch.float)
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        # forward step of the model
        outputs = model(inputs)
        # targets are reshaped to a column so they line up with the outputs
        loss = criterion(outputs, targets.view(-1, 1))
        # backward step of the loss
        loss.backward()
        # update the parameters
        optimizer.step()
        
def evaluate(data_loader, model, device):
    """
    Run the model over every batch without computing gradients.

    data_loader: iterable of batches; each batch is a dict with an "image"
                 tensor and a "targets" tensor
    model: pytorch model
    device: cuda/cpu

    Returns (outputs, targets): two python lists accumulated over all
    batches, holding the raw model outputs and the targets cast to float.
    """
    # switch the model to evaluation mode
    model.eval()
    all_outputs, all_targets = [], []
    # gradients are not needed for evaluation
    with torch.no_grad():
        for batch in data_loader:
            images = batch["image"].to(device, dtype=torch.float)
            labels = batch["targets"].to(device, dtype=torch.float)
            # raw predictions for this batch
            logits = model(images)
            # bring everything back to the cpu as plain python lists
            all_targets += labels.detach().cpu().numpy().tolist()
            all_outputs += logits.detach().cpu().numpy().tolist()

    return all_outputs, all_targets

In [None]:
# run every epoch: train, predict on the validation set, print the ROC AUC
for epoch in tqdm(range(EPOCHS)):
    # one pass over the training data
    train(data_loader=train_loader, model=model, optimizer=optimizer, device=device)
    # raw model outputs and targets for the validation data
    predictions, valid_targets = evaluate(data_loader=valid_loader, model=model, device=device)
    # area under the ROC curve of the validation predictions
    roc_auc = metrics.roc_auc_score(y_true=valid_targets, y_score=predictions)
    print(f"Epoch={epoch}, Valid ROC AUC={roc_auc}")