## Setup

In [None]:
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import random_split, DataLoader

import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

import seaborn as sns

from neural_net_template import ModelTrainer
from csv_image_reader import CSVClassImageFolder
from resize_tiff_file import resize_image

In [None]:
# Training-loop settings.
N_EPOCHS = 10
BATCH_SIZE = 50

# Value passed as num_workers to every DataLoader in this notebook.
NUM_WORKERS = 2

# Folder that holds the competition's train/ and test/ images and CSV files.
ROOT_IMAGE_DIR = '/kaggle/input/mayo-clinic-strip-ai'

In [None]:
# Seed every random number generator this notebook imports so that repeated
# runs draw the same random numbers.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
# NumPy keeps its own global RNG state, separate from `random` and torch.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

## Read in datasets

In [None]:
# Per-channel mean/std normalisation, required for torchvision's pretrained
# models as described here: https://pytorch.org/vision/stable/models.html
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Preprocessing steps in application order: resize, centre-crop to 224,
# convert to a tensor, then normalise.
base_transforms = [transforms.Resize(256), transforms.CenterCrop(224)]
base_transforms += [transforms.ToTensor(), normalize]

In [None]:
# Training images with only the base preprocessing (no augmentation).
# CSVClassImageFolder is pointed at train.csv and told which columns hold the
# image id and the label.
dataset_without_augmentation = CSVClassImageFolder(
    ROOT_IMAGE_DIR + '/train',
    transform=transforms.Compose(base_transforms),
    pre_transform=resize_image,
    data_csv_label="image_id",
    class_csv_label="label",
    class_file=ROOT_IMAGE_DIR + '/train.csv',
)

# The length of element 0 of find_classes() is used as the number of labels,
# which sizes the model's output layer.
N_LABELS = len(dataset_without_augmentation.find_classes(ROOT_IMAGE_DIR)[0])

len(dataset_without_augmentation)

In [None]:
# Display element 0 of find_classes() -- the same collection whose length
# defines N_LABELS.
dataset_without_augmentation.find_classes(ROOT_IMAGE_DIR)[0]

In [None]:
# No augmented variant is built: the un-augmented dataset is used as the full
# dataset for the train/test split.
full_dataset = dataset_without_augmentation

In [None]:
# Roughly 70% of the samples go to training; everything left over is held out.
percent_train = 0.7
full_data_size = len(full_dataset)
train_size = int(full_data_size * percent_train)
test_size = full_data_size - train_size

# A dedicated generator seeded with RANDOM_SEED is handed to random_split.
split_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=split_generator,
)

In [None]:
# Display the total number of samples in the full dataset.
full_data_size

In [None]:
# Training batches are reshuffled; the held-out split does not need shuffling,
# so it is read in a fixed order (matching the shuffle=False loader used for
# the submission data later in this notebook).
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

## Train baseline model

In [None]:
class ObviouslyLameCnnModel(nn.Module):
    """Baseline classifier: flatten the input and apply one linear layer.

    Despite the name, the model contains no convolutional layers.

    Args:
        in_features: Number of values in one flattened input. Defaults to
            150528 (3 * 224 * 224), the value that was previously hard-coded.
        n_labels: Number of output scores per input. When ``None`` the
            module-level ``N_LABELS`` is read at construction time, which is
            the original behaviour.
    """

    def __init__(self, in_features=3 * 224 * 224, n_labels=None):
        super().__init__()
        if n_labels is None:
            # Fall back to the notebook-level label count.
            n_labels = N_LABELS
        self.linear_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, n_labels),
        )

    def forward(self, x):
        """Return one raw score per label for every item in the batch ``x``."""
        return self.linear_layer(x)

In [None]:
def create_model():
    """Build a fresh ObviouslyLameCnnModel and move it onto ``device``."""
    return ObviouslyLameCnnModel().to(device)

In [None]:
# Loss function handed to the trainer for every run.
criterion = nn.CrossEntropyLoss()

# One entry per optimizer configuration to try: the optimizer class under
# "type" and its constructor keyword arguments under "params".
optimizers = [
    dict(
        type=torch.optim.SGD,
        params=dict(lr=0.01, momentum=0.9, weight_decay=1e-4),
    ),
]

In [None]:
# Train one freshly created model per optimizer configuration and collect
# whatever train_model returns for each run.
results = []
model_trainer = ModelTrainer(device, N_LABELS, N_EPOCHS)
for optimizer_spec in optimizers:
    optimizer_cls, optimizer_kwargs = optimizer_spec["type"], optimizer_spec["params"]
    model = create_model()
    optimizer = optimizer_cls(model.parameters(), **optimizer_kwargs)
    print("Training optimizer", optimizer)
    result = model_trainer.train_model(
        model, criterion, optimizer, train_loader, test_loader
    )
    results.append(result)

In [None]:
# Display the training history stored in the first run's result.
results[0]['training_history']

## Create results file

In [None]:
# Pick the run with the highest final accuracy.  np.argmax needs a real
# sequence: given a generator expression it sees one opaque object and never
# compares the individual accuracies.
final_accuracies = [x["final_accuracy"] for x in results]
best_idx = int(np.argmax(final_accuracies))
result = results[best_idx]
print("best", best_idx, result["final_accuracy"])


In [None]:
# Model taken from the selected training result.
model = result["model"]

# Dataset over the competition's test images.  "image_id" is passed as both
# the data column and the class column (presumably so that every image forms
# its own class -- confirm against CSVClassImageFolder).
test_dataset = CSVClassImageFolder(
    ROOT_IMAGE_DIR + '/test',
    transform=transforms.Compose(base_transforms),
    pre_transform=resize_image,
    data_csv_label="image_id",
    class_csv_label="image_id",
    class_file=ROOT_IMAGE_DIR + '/test.csv',
)
len(test_dataset)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Run the trainer's validation pass over the test loader.
predicted_values, expected_targets = model_trainer._validate_model(
    model, test_loader, device
)

In [None]:
# Print the targets and predicted values returned by _validate_model.
print(expected_targets, predicted_values)

In [None]:
names = test_dataset.find_classes(ROOT_IMAGE_DIR + '/test')[0]

# Softmax over the last (class) axis so that each row of scores becomes a
# probability distribution summing to 1.  Dividing by the sum over the whole
# array would instead spread a total of 1 across all rows.  Subtracting the
# row maximum first avoids overflow in exp() without changing the result.
# NOTE(review): assumes predicted_values converts to a (samples, classes)
# float array -- confirm against ModelTrainer._validate_model.
scores = np.asarray(predicted_values, dtype=float)
exp_scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
predictions = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

# Pair every target with its probability row and order the pairs by target.
index_to_prediction = sorted(
    zip(expected_targets, predictions), key=lambda pair: pair[0]
)

In [None]:
# Load the sample submission, indexed by patient_id.  The path is built by
# concatenation, like the other dataset paths in this notebook, because `os`
# is never imported here and os.path.join would raise NameError.
sample_df = pd.read_csv(ROOT_IMAGE_DIR + '/sample_submission.csv', index_col='patient_id')
sample_df

In [None]:
# Load the test metadata table.  The path is built by concatenation, like the
# other dataset paths in this notebook, because `os` is never imported here
# and os.path.join would raise NameError.
test_df = pd.read_csv(ROOT_IMAGE_DIR + '/test.csv')
test_df

In [None]:
# One row of class probabilities per sorted (target, probabilities) pair,
# indexed by `names`.
prediction_df = pd.DataFrame(
    [probabilities for _target, probabilities in index_to_prediction],
    columns=['CE', 'LAA'],
    index=names,
)
prediction_df

In [None]:
# Attach each image's probabilities to its row of test.csv, drop the
# image-level columns, then average per patient so that a patient with several
# images still yields exactly one submission row.  (drop_duplicates() only
# merged rows whose probabilities were exactly equal.)
# NOTE(review): assumes test.csv has a patient_id column, as
# sample_submission.csv does -- confirm.
submission_df = (
    test_df.join(prediction_df, on='image_id')
    .drop(['image_id', 'center_id', 'image_num'], axis=1)
    .groupby('patient_id', as_index=False, sort=False)[['CE', 'LAA']]
    .mean()
)
submission_df

In [None]:
# Shell escape: delete the files matching *.* in /kaggle/working.
!rm /kaggle/working/*.* 

In [None]:
# Write the submission table to the working directory without the index column.
submission_df.to_csv('/kaggle/working/submission.csv', index=False)