In [1]:
import numpy as np
import pandas as pd

import cv2
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.cuda.amp import GradScaler, autocast
import torchvision as tv

import os
from os import listdir
from os.path import isfile, join

import wandb
import gc
from tqdm.notebook import tqdm
import ipywidgets as widgets

In [2]:
VERSION = 'DenseNet_V2'
BATCH_SIZE = 2
MAX_TRAIN_BATCHES = 305
MAX_VAL_BATCHES = 35
MAX_TEST_BATCHES = 155
NB_EPOCHS = 3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
#Data Labelling
train_img_dir_path = 'images/train'
valid_img_dir_path = 'images/valid'
test_img_dir_path = 'images/test'

train_normal_images = [f'{train_img_dir_path}/normal/{i}' for i in listdir(f'{train_img_dir_path}/normal')]
train_adenocarcinoma_images = [f'{train_img_dir_path}/adenocarcinoma/{i}' for i in listdir(f'{train_img_dir_path}/adenocarcinoma')]
train_largecell_carcinoma_images = [f'{train_img_dir_path}/large.cell.carcinoma/{i}' for i in listdir(f'{train_img_dir_path}/large.cell.carcinoma')]
train_squamouscell_carcinoma_images = [f'{train_img_dir_path}/squamous.cell.carcinoma/{i}' for i in listdir(f'{train_img_dir_path}/squamous.cell.carcinoma')]

valid_normal_images = [f'{valid_img_dir_path}/normal/{i}' for i in listdir(f'{valid_img_dir_path}/normal')]
valid_adenocarcinoma_images = [f'{valid_img_dir_path}/adenocarcinoma/{i}' for i in listdir(f'{valid_img_dir_path}/adenocarcinoma')]
valid_largecell_carcinoma_images = [f'{valid_img_dir_path}/large.cell.carcinoma/{i}' for i in listdir(f'{valid_img_dir_path}/large.cell.carcinoma')]
valid_squamouscell_carcinoma_images = [f'{valid_img_dir_path}/squamous.cell.carcinoma/{i}' for i in listdir(f'{valid_img_dir_path}/squamous.cell.carcinoma')]

test_normal_images = [f'{test_img_dir_path}/normal/{i}' for i in listdir(f'{test_img_dir_path}/normal')]
test_adenocarcinoma_images = [f'{test_img_dir_path}/adenocarcinoma/{i}' for i in listdir(f'{test_img_dir_path}/adenocarcinoma')]
test_largecell_carcinoma_images = [f'{test_img_dir_path}/large.cell.carcinoma/{i}' for i in listdir(f'{test_img_dir_path}/large.cell.carcinoma')]
test_squamouscell_carcinoma_images = [f'{test_img_dir_path}/squamous.cell.carcinoma/{i}' for i in listdir(f'{test_img_dir_path}/squamous.cell.carcinoma')]

In [4]:
train_cancer_df = pd.DataFrame(columns=['image_id','label'])
val_cancer_df = pd.DataFrame(columns=['image_id','label'])
test_cancer_df = pd.DataFrame(columns=['image_id','label'])

train_cancer_df['image_id'] = train_normal_images + train_adenocarcinoma_images + train_largecell_carcinoma_images + train_squamouscell_carcinoma_images
train_cancer_df['label'] = np.zeros_like(train_normal_images,dtype=int).tolist()+np.ones_like(train_adenocarcinoma_images,dtype=int).tolist()+np.full_like(train_largecell_carcinoma_images,2,dtype=int).tolist()+np.full_like(train_squamouscell_carcinoma_images,3,dtype=int).tolist()
val_cancer_df['image_id'] = valid_normal_images + valid_adenocarcinoma_images + valid_largecell_carcinoma_images + valid_squamouscell_carcinoma_images 
val_cancer_df['label'] = np.zeros_like(valid_normal_images,dtype=int).tolist() + np.ones_like(valid_adenocarcinoma_images,dtype=int).tolist() + np.full_like(valid_largecell_carcinoma_images,2,dtype=int).tolist() + np.full_like(valid_squamouscell_carcinoma_images,3,dtype=int).tolist()
test_cancer_df['image_id'] = test_normal_images + test_adenocarcinoma_images + test_largecell_carcinoma_images + test_squamouscell_carcinoma_images
test_cancer_df['label'] = np.zeros_like(test_normal_images,dtype=int).tolist() + np.ones_like(test_adenocarcinoma_images,dtype=int).tolist() + np.full_like(test_largecell_carcinoma_images,2,dtype=int).tolist() + np.full_like(test_squamouscell_carcinoma_images,3,dtype=int).tolist()

In [5]:
train_cancer_df.head()

Unnamed: 0,image_id,label
0,images/train/normal/19 - Copy.png,0
1,images/train/normal/11 - Copy (2) - Copy.png,0
2,images/train/normal/n6 - Copy.jpg,0
3,images/train/normal/7.png,0
4,images/train/normal/16 - Copy.png,0


In [6]:
print(len(train_cancer_df))
print(len(val_cancer_df))
print(len(test_cancer_df))

613
72
315


In [7]:
val_cancer_df.head(15)

Unnamed: 0,image_id,label
0,images/valid/normal/7.png,0
1,images/valid/normal/6 - Copy (3).png,0
2,images/valid/normal/004007_01_01_519.png,0
3,images/valid/normal/7 - Copy (2).png,0
4,images/valid/normal/6 - Copy.png,0
5,images/valid/normal/4 (2).png,0
6,images/valid/normal/003828_02_01_174.png,0
7,images/valid/normal/4 - Copy (2).png,0
8,images/valid/normal/5.png,0
9,images/valid/normal/004162_01_01_150.png,0


In [8]:
class LungsCancerDetectionDataset(Dataset):
    def __init__(self, annotations_file,transform=None, target_transform=None):
        self.img_labels = annotations_file
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = str(self.img_labels.iloc[idx].image_id)
        image = cv2.imread(img_path)
        image = cv2.resize(image, (400,400))
        image = np.transpose(image, (2, 0, 1))
        image = torch.as_tensor(image)
        image = image.float()
        image = image/255
        label = torch.as_tensor(self.img_labels.iloc[idx].label)
        return image, label

In [9]:
class LungsCancerDetectionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = tv.models.densenet121()
        self.nn_detection = torch.nn.Sequential(
            torch.nn.Linear(1000,4)
        )

    def forward(self, x):
        x = self.model(x)
        logits = self.nn_detection(x)
        return logits
    
    def predict(self,logits):
        preds = self.forward(logits)
        preds = torch.sigmoid(preds)
        return preds

In [10]:
def save_model(name, model):
    torch.save(model.state_dict(), f'{name}.tph')

def load_model(model, name, path='.'):
    data = torch.load(os.path.join(path, f'{name}.tph'))
    model.load_state_dict(data)
    return model

def gc_collect():
    gc.collect()
    torch.cuda.empty_cache()

In [11]:
def train_model(ds_train, logger, name):
    
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,num_workers=os.cpu_count())
    model = LungsCancerDetectionModel().to(DEVICE)
    optim = torch.optim.Adam(model.parameters())
    
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optim, max_lr=0.001, epochs=NB_EPOCHS,
                                                steps_per_epoch=min(MAX_TRAIN_BATCHES, len(dl_train)),
                                                pct_start=0.3)
    model.train()
    scaler = GradScaler()
    
    for epoch in tqdm(range(NB_EPOCHS), desc='Epoch'):
        with tqdm(dl_train, desc='Train') as progress:

            for batch_idx, (X, y) in enumerate(progress):

                if batch_idx >= MAX_TRAIN_BATCHES:
                    save_model(name, model)
                    break

                optim.zero_grad()
                with autocast():
                    pred = model.forward(X.to(DEVICE))
                    pred = pred.squeeze()
                    loss = torch.nn.functional.cross_entropy(pred,y.to(DEVICE))

                    if np.isinf(loss.item()) or np.isnan(loss.item()):
                        print(f'Bad loss, skipping the batch {batch_idx}')
                        del loss, pred
                        gc_collect()
                        continue

                scaler.scale(loss).backward()
                scaler.step(optim)
                scaler.update()
                scheduler.step()

                logger.log({'training loss': (loss.item()),
                            'learning rate': scheduler.get_last_lr()[0],
                            'epoch': epoch})
                
    save_model(name, model)
    return model

In [12]:
with wandb.init(project='DS50', name=VERSION) as run:
    gc_collect()
    ds_train = LungsCancerDetectionDataset(train_cancer_df)
    model = train_model(ds_train,run,VERSION)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrbizet[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train:   0%|          | 0/307 [00:00<?, ?it/s]



Train:   0%|          | 0/307 [00:00<?, ?it/s]

Train:   0%|          | 0/307 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
learning rate,▁▁▂▂▃▄▅▆▇▇███████▇▇▇▇▆▆▅▅▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁
training loss,▃▁▇▃▅▂█▄▄▅▃▄▆▃▄▃▃▇▅▄▅▂▃▄▁▄▄▃▃▄▃▄▅▃▁▃▃▃▃▃

0,1
epoch,2.0
learning rate,0.0
training loss,1.49206


In [13]:
def evaluate_model(model, ds, max_batches):
    model = model.to(DEVICE)
    dl_val = torch.utils.data.DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
    preds = []
    labels = []
    with torch.no_grad():
        model.eval()
        losses = []
        with tqdm(dl_val, desc='Val') as progress:
            for i, (X, y) in enumerate(progress):
                with autocast():
                    pred = model.forward(X.to(DEVICE))
                    pred = pred.squeeze()
                    loss = torch.nn.functional.cross_entropy(pred,y.to(DEVICE))
                    pred = torch.sigmoid(pred)
                    preds.append(pred.cpu())
                    losses.append(loss)
                    labels.append(y.cpu().numpy())
                    
                if i >= max_batches:
                    break
        preds = torch.concat(preds).cpu().numpy()
        labels = np.concatenate(labels)
        return np.mean(losses), preds

In [14]:
def gen_model_predictions(model, val_cancer_df, max_batches):
    ds_eval = LungsCancerDetectionDataset(val_cancer_df)
    loss, preds = evaluate_model(model, ds_eval, max_batches)
    df_pred = pd.DataFrame(data=preds,columns=['normal','adenocarcinoma','largecell_carcinoma','squamouscell_carcinoma'])
    df_train_predictions = pd.concat(
            [val_cancer_df.head(len(df_pred)).reset_index(drop=True),df_pred],axis=1
    )
    return df_train_predictions ,loss

val_df_pred, val_loss = gen_model_predictions(model,val_cancer_df,MAX_VAL_BATCHES)
print(f'Val loss {val_loss}')
print(val_df_pred)
val_df_pred.to_csv(f'val_predictions_{VERSION}.csv', index=False)

Val:   0%|          | 0/36 [00:00<?, ?it/s]

Val loss 0.9335488080978394
                                             image_id  label    normal   
0                           images/valid/normal/7.png      0  1.000000  \
1                images/valid/normal/6 - Copy (3).png      0  1.000000   
2            images/valid/normal/004007_01_01_519.png      0  1.000000   
3                images/valid/normal/7 - Copy (2).png      0  1.000000   
4                    images/valid/normal/6 - Copy.png      0  1.000000   
..                                                ...    ...       ...   
67  images/valid/squamous.cell.carcinoma/000115 (5...      3  0.089464   
68  images/valid/squamous.cell.carcinoma/000118 (5...      3  0.274495   
69  images/valid/squamous.cell.carcinoma/000118 (4...      3  0.212546   
70  images/valid/squamous.cell.carcinoma/000116 (2...      3  0.131298   
71  images/valid/squamous.cell.carcinoma/000108 (3...      3  0.099356   

    adenocarcinoma  largecell_carcinoma  squamouscell_carcinoma  
0         0.00016

In [15]:
test_df_pred, test_loss = gen_model_predictions(model,test_cancer_df,MAX_TEST_BATCHES)
print(f'Test loss {test_loss}')
print(test_df_pred)
test_df_pred.to_csv(f'test_predictions_{VERSION}.csv', index=False)

Val:   0%|          | 0/158 [00:00<?, ?it/s]



Test loss 0.8809480667114258
                                              image_id  label    normal   
0          images/test/normal/11 - Copy (2) - Copy.png      0  1.000000  \
1                             images/test/normal/7.png      0  1.000000   
2                 images/test/normal/11 - Copy (3).png      0  1.000000   
3                  images/test/normal/6 - Copy (3).png      0  1.000000   
4                            images/test/normal/23.png      0  1.000000   
..                                                 ...    ...       ...   
307  images/test/squamous.cell.carcinoma/000114 (3)...      3  0.196547   
308  images/test/squamous.cell.carcinoma/000139 (6)...      3  0.188168   
309  images/test/squamous.cell.carcinoma/000163 (6)...      3  0.052829   
310  images/test/squamous.cell.carcinoma/000168 (2)...      3  0.066711   
311  images/test/squamous.cell.carcinoma/000126 (4)...      3  0.141513   

     adenocarcinoma  largecell_carcinoma  squamouscell_carcinoma  
0  