In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
TRAIN_DF_PATH = "/kaggle/input/rsna-breast-cancer-detection/train.csv"

df = pd.read_csv(TRAIN_DF_PATH)
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54706 entries, 0 to 54705
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   site_id                  54706 non-null  int64  
 1   patient_id               54706 non-null  int64  
 2   image_id                 54706 non-null  int64  
 3   laterality               54706 non-null  object 
 4   view                     54706 non-null  object 
 5   age                      54669 non-null  float64
 6   cancer                   54706 non-null  int64  
 7   biopsy                   54706 non-null  int64  
 8   invasive                 54706 non-null  int64  
 9   BIRADS                   26286 non-null  float64
 10  implant                  54706 non-null  int64  
 11  density                  29470 non-null  object 
 12  machine_id               54706 non-null  int64  
 13  difficult_negative_case  54706 non-null  bool   
dtypes: bool(1), float64(2)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
df.age = imputer.fit_transform(df.age.values.reshape(-1, 1))

label_encode_cols = ["laterality", "view"]
for column in label_encode_cols:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,0,1,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,0,5,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,1,5,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,1,1,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,0,1,55.0,0,0,0,0.0,0,,21,True


In [5]:
df["fold"] = 0

from sklearn.model_selection import StratifiedGroupKFold

N_FOLDS = 5
skf = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
for fold, (train_index, valid_index) in enumerate(skf.split(df, df.cancer,groups=df.patient_id),\
                                                  start=1):
    df.loc[valid_index, "fold"] = fold
    
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,fold
0,2,10006,462822612,0,1,61.0,0,0,0,,0,,29,False,2
1,2,10006,1459541791,0,5,61.0,0,0,0,,0,,29,False,2
2,2,10006,1864590858,1,5,61.0,0,0,0,,0,,29,False,2
3,2,10006,1874946579,1,1,61.0,0,0,0,,0,,29,False,2
4,2,10011,220375232,0,1,55.0,0,0,0,0.0,0,,21,True,1


In [6]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from albumentations import (RandomResizedCrop, HorizontalFlip, VerticalFlip,
                            ShiftScaleRotate, Resize)

IMG_SIZE_HEIGHT, IMG_SIZE_WIDTH = 1024, 512
def getTransforms(mode="TRAIN"):
    if mode == "TRAIN":
        transforms = A.Compose([
                RandomResizedCrop(IMG_SIZE_HEIGHT, IMG_SIZE_WIDTH),
                ShiftScaleRotate(rotate_limit=90, scale_limit = [0.8, 1.2]),
                HorizontalFlip(p=.5),
                VerticalFlip(p=.5),
                ToTensorV2()
            ])
    else:
        transforms = A.Compose([
            Resize(IMG_SIZE_HEIGHT, IMG_SIZE_WIDTH),
            ToTensorV2(),
        ])
    return transforms

In [7]:
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from PIL import Image

class RSNADataset(Dataset):
    def __init__(self, df, other_features, img_dir, mode="TRAIN", transforms_mode="TRAIN"):
        self.df = df
        self.img_dir = img_dir
        self.mode = mode
        self.other_features = other_features
        self.transforms = getTransforms(transforms_mode)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, i):
        row = df.loc[i]
        image_name = "{}/{}.png".format(row.patient_id, row.image_id)
        image_path = os.path.join(self.img_dir, image_name)
        img = Image.open(image_path)
        img = np.array(img)
        img = self.transforms(image=img)["image"]
        img = np.concatenate([img, img, img], axis=0).astype(np.float32)
        label = row.cancer.astype(np.float32)
        other_features = np.array(row[self.other_features].values,
                                  dtype=np.float32)
        if self.mode == "TRAIN":
            return img, other_features, label
        else:
            return img, other_features

In [8]:
from sklearn.utils import compute_class_weight

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(df.cancer),
                                     y=df.cancer)
class_weights = dict(zip(np.unique(df.cancer), class_weights))
class_weights

{0: 0.510812728766714, 1: 23.620898100172713}

In [9]:
IMAGES_DIR = "/kaggle/input/rsna-cut-off-empty-space-from-images"
BATCH_SIZE = 8
OTHER_FEATURES = ['laterality', 'view', 'age', 'implant']

def getDataloader(df, mode="TRAIN", transforms_mode="TRAIN"):
    dataset = RSNADataset(df, OTHER_FEATURES, IMAGES_DIR, mode=mode, transforms_mode=transforms_mode)
    if mode == "TRAIN":
        sample_weights = [0] * len(dataset)
        loop = tqdm(enumerate(dataset), total=len(dataset))
        for idx, (_, _, y) in loop:
            class_weight = class_weights[y]
            sample_weights[idx] = class_weight
            if idx == len(dataset)-1: break
            sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        return DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler, pin_memory=True)
    else:
        return DataLoader(dataset, batch_size=BATCH_SIZE)

In [10]:
!pip install efficientnet_pytorch -q
!pip install torchmetrics -q

import torch
import torch.nn as nn
import torch.nn.functional as F

from efficientnet_pytorch import EfficientNet
from torchmetrics import Accuracy, F1Score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
accuracy = Accuracy(task="binary").to(device)
f1_score = F1Score(task="binary").to(device)

[0m

In [11]:
class Model(nn.Module):
    def __init__(self, num_in_other_features, hidden_state_features):
        super(Model, self).__init__()
        self.num_in_other_features = num_in_other_features
        self.eff_feature_extractor = EfficientNet.from_pretrained("efficientnet-b4")
        self.num_eff_feature_extractor = 1792
        
        self.classification_network = nn.Sequential(
            nn.Linear(self.num_eff_feature_extractor+num_in_other_features, hidden_state_features),
            nn.BatchNorm1d(hidden_state_features),
            nn.ReLU(),
            nn.Dropout(.2),
            nn.Linear(hidden_state_features, 1),
        )
        
    def forward(self, X, other_features):
        X = self.eff_feature_extractor.extract_features(X)
        X = F.avg_pool2d(X, X.size()[2:]).reshape(-1, self.num_eff_feature_extractor)
        X_concat = torch.cat((X, other_features), dim=1)
        
        return self.classification_network(X_concat)

In [12]:
import torch.optim as optim

LR = 3e-4
LR_PATIENCE = 1
LR_FACTOR = 0.4

model = Model(len(OTHER_FEATURES), 1024).to(device)
model.load_state_dict(torch.load("/kaggle/input/rsna-models/fold-1 epoch-3.pth"))
weight = torch.Tensor([float(class_weights[1]), ]).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=weight)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=LR_PATIENCE,
                                                 verbose=True, factor=LR_FACTOR)
scaler = torch.cuda.amp.GradScaler()

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b4-6ed6700e.pth


  0%|          | 0.00/74.4M [00:00<?, ?B/s]

Loaded pretrained weights for efficientnet-b4


In [13]:
import csv

cols = ["fold", "epoch", "training_loss", "training_acc", "training_f1",
        "validation_loss","validation_acc", "validation_f1"]
with open("log.csv", "w") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(cols)
    
def writeCSVLog(vals):
    with open("log.csv", "a") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(vals)

In [14]:
from tqdm import tqdm
N_ACCUMULATION_STEPS = 4

def train(model, dataloader, loss_fn, optimizer, epoch, fold):
    model.train()
    losses = []
    mean_accuracy = 0.
    mean_f1_score = 0.
    
    loop = tqdm(enumerate(dataloader), total=len(dataloader))
    
    for batch_idx, (X, other_features, y) in loop:
        loop.set_description("Fold [{}/{}] Epoch [{}/{}]".format(fold, N_FOLDS, epoch, EPOCHS))
        
        #model = model.to(device)
        X = X.to(device)
        other_features = other_features.to(device)
        y = y.to(device)
        
        with torch.cuda.amp.autocast():
            logits = model(X, other_features)
            loss = loss_fn(logits, y.unsqueeze(1))
        losses.append(loss.item())
        loss = loss / N_ACCUMULATION_STEPS
        scaler.scale(loss).backward()
        
        if ((batch_idx+1) % N_ACCUMULATION_STEPS == 0) or (batch_idx + 1 == len(dataloader)):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        probabilities = torch.sigmoid(logits).squeeze()
        accuracy_value = accuracy(probabilities, y).item()
        mean_accuracy += accuracy_value
        f1_score_value = f1_score(probabilities, y).item()
        mean_f1_score += f1_score_value
        
        loop.set_postfix(loss=loss.item(), acc=accuracy_value, f1=f1_score_value)
        
    mean_loss = sum(losses) / len(losses)
    mean_accuracy /= len(losses)
    mean_f1_score /= len(losses)
    
    return [mean_loss, mean_accuracy, mean_f1_score]

In [15]:
def valid(model, dataloader, loss_fn, scheduler, epoch, fold):
    model.eval()
    losses = []
    mean_accuracy = 0.
    mean_f1_score = 0.
    
    with torch.inference_mode():
        loop = tqdm(enumerate(dataloader), total=len(dataloader))
        for batch_idx, (X, other_features, y) in loop:
            loop.set_description("Fold [{}/{}] Epoch [{}/{}]".format(fold, N_FOLDS, epoch, EPOCHS))
            
            #model = model.to(device)
            X = X.to(device)
            other_features = other_features.to(device)
            y = y.to(device)
            
            
            logits = model(X, other_features)
            loss = loss_fn(logits, y.unsqueeze(1))
            losses.append(loss.item())
            
            probabilities = torch.sigmoid(logits).squeeze()
            accuracy_value = accuracy(probabilities, y).item()
            mean_accuracy += accuracy_value
            f1_score_value = f1_score(probabilities, y).item()
            mean_f1_score += f1_score_value
            
            loop.set_postfix(loss=loss.item(), acc=accuracy_value, f1=f1_score_value)
            
        mean_loss = sum(losses) / len(losses)
        mean_accuracy /= len(losses)
        mean_f1_score /= len(losses)
        scheduler.step(mean_loss)
        
    return [mean_loss, mean_accuracy, mean_f1_score]

In [16]:
for key in class_weights.keys():
    item = class_weights[key]
    class_weights[key] = item.astype(np.float32)

In [17]:
EPOCHS = 5

fold = 1
train_df = df.query('fold != @fold')
valid_df = df.query('fold == @fold')

train_dataloader = getDataloader(train_df, mode="TRAIN", transforms_mode="TRAIN")
valid_dataloader = getDataloader(valid_df, mode="TRAIN", transforms_mode="VALID")

for epoch in range(4, EPOCHS+4):
    train_vals = train(model, train_dataloader, loss_fn, optimizer, epoch, fold)
    valid_vals = valid(model, valid_dataloader, loss_fn, scheduler, epoch, fold)
    torch.save(model.state_dict(), "fold-{} epoch-{}.pth".format(fold, epoch))
    vals = [fold, epoch, ] + train_vals + valid_vals
    writeCSVLog(vals)

100%|█████████▉| 43726/43727 [13:34<00:00, 53.66it/s]
100%|█████████▉| 10978/10979 [02:35<00:00, 70.71it/s]
Fold [1/5] Epoch [4/5]: 100%|██████████| 5466/5466 [1:23:25<00:00,  1.09it/s, acc=0.429, f1=0.6, loss=0.498]
Fold [1/5] Epoch [4/5]: 100%|██████████| 1373/1373 [07:57<00:00,  2.88it/s, acc=0.667, f1=0.8, loss=1.28]
Fold [1/5] Epoch [5/5]: 100%|██████████| 5466/5466 [1:23:48<00:00,  1.09it/s, acc=0.714, f1=0.833, loss=0.248]
Fold [1/5] Epoch [5/5]: 100%|██████████| 1373/1373 [08:03<00:00,  2.84it/s, acc=0.667, f1=0.8, loss=0.528]
Fold [1/5] Epoch [6/5]: 100%|██████████| 5466/5466 [1:23:35<00:00,  1.09it/s, acc=0.429, f1=0.6, loss=0.579]
Fold [1/5] Epoch [6/5]: 100%|██████████| 1373/1373 [07:49<00:00,  2.92it/s, acc=0.667, f1=0.8, loss=1.38]
Fold [1/5] Epoch [7/5]: 100%|██████████| 5466/5466 [1:23:11<00:00,  1.10it/s, acc=0.571, f1=0.727, loss=0.346]
Fold [1/5] Epoch [7/5]: 100%|██████████| 1373/1373 [07:53<00:00,  2.90it/s, acc=0.667, f1=0.667, loss=1.79]


Epoch 00004: reducing learning rate of group 0 to 1.2000e-04.


Fold [1/5] Epoch [8/5]: 100%|██████████| 5466/5466 [1:22:51<00:00,  1.10it/s, acc=0.714, f1=0.833, loss=0.431]
Fold [1/5] Epoch [8/5]: 100%|██████████| 1373/1373 [07:59<00:00,  2.86it/s, acc=0.667, f1=0.8, loss=0.862]
