# TRAIN notebook
competition : RSNA Screening Mammography Breast Cancer Detection  
url : https://www.kaggle.com/competitions/rsna-breast-cancer-detection  

## import

In [2]:
!python -m pip install --no-index --find-links=/kaggle/input/dicom-whls pydicom pylibjpeg
!python -m pip install --no-index --find-links=/kaggle/input/rsna-datasets/ENV python_gdcm

import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import datetime
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import cv2
import PIL
import pydicom
import gdcm
import pylibjpeg
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Looking in links: /kaggle/input/dicom-whls
[0mLooking in links: /kaggle/input/rsna-datasets/ENV
[0m

In [3]:
# Show the pretrained timm model names that contain the substring "effi".
pretrained_names = timm.list_models(pretrained=True)
[name for name in pretrained_names if "effi" in name]

['efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b1_pruned',
 'efficientnet_b2',
 'efficientnet_b2_pruned',
 'efficientnet_b3',
 'efficientnet_b3_pruned',
 'efficientnet_b4',
 'efficientnet_el',
 'efficientnet_el_pruned',
 'efficientnet_em',
 'efficientnet_es',
 'efficientnet_es_pruned',
 'efficientnet_lite0',
 'efficientnetv2_rw_m',
 'efficientnetv2_rw_s',
 'efficientnetv2_rw_t',
 'gc_efficientnetv2_rw_t',
 'tf_efficientnet_b0',
 'tf_efficientnet_b0_ap',
 'tf_efficientnet_b0_ns',
 'tf_efficientnet_b1',
 'tf_efficientnet_b1_ap',
 'tf_efficientnet_b1_ns',
 'tf_efficientnet_b2',
 'tf_efficientnet_b2_ap',
 'tf_efficientnet_b2_ns',
 'tf_efficientnet_b3',
 'tf_efficientnet_b3_ap',
 'tf_efficientnet_b3_ns',
 'tf_efficientnet_b4',
 'tf_efficientnet_b4_ap',
 'tf_efficientnet_b4_ns',
 'tf_efficientnet_b5',
 'tf_efficientnet_b5_ap',
 'tf_efficientnet_b5_ns',
 'tf_efficientnet_b6',
 'tf_efficientnet_b6_ap',
 'tf_efficientnet_b6_ns',
 'tf_efficientnet_b7',
 'tf_efficientnet_b7_ap',
 'tf_effi

## Config

In [4]:
class Config:
    """Plain container for the settings used throughout this notebook.

    Args:
        model_path: Checkpoint path; loaded by the setup cell when
            ``train_new`` is False.
        model_name: Architecture name handed to the timm-backed model.
        pretrained: Stored as-is (no code visible in this notebook reads it).
        epochs: Maximum number of training epochs.
        batch_size: Batch size for the training/validation loaders.
        size: Stored as-is (no code visible in this notebook reads it).
        seed: Seed used for sampling, splitting and CUDA seeding.
        train_new: True to start from a fresh model, False to resume from
            ``model_path``.
    """
    def __init__(self, model_path:str, model_name:str='resnet26d', pretrained:bool=True,
                 epochs:int=10, batch_size:int=32, size:int=512,
                 seed:int=0, train_new:bool=True):
        # Model selection / checkpoint handling.
        self.model_path = model_path
        self.model_name = model_name
        self.pretrained = pretrained
        self.train_new = train_new

        # Training hyper-parameters.
        self.epochs = epochs
        self.batch_size = batch_size
        self.size = size
        self.seed = seed

# Session settings. MODEL_PATH is only read when TRAIN_NEW is False.
# NOTE(review): MODEL_PATH names a resnet18d checkpoint while MODEL_NAME is
# efficientnet_b0 -- confirm they match before switching TRAIN_NEW to False.
TRAIN_NEW = True
MODEL_NAME = 'efficientnet_b0'
MODEL_PATH = '/kaggle/input/rsna-datasets/resnet18d_230216.pth'
config = Config(
    model_path=MODEL_PATH,
    model_name=MODEL_NAME,
    batch_size=16,
    train_new=TRAIN_NEW,
)

## Image Processing

In [5]:
def transform_image(paths, side='left', size=512, threshold=0.05):
    """Load one DICOM file and return a normalized, foreground-cropped image.

    Steps: read the pixel data, min-max scale it to [0, 1], invert
    MONOCHROME1 images, trim a 5-pixel border, mirror the image so that the
    half containing more foreground pixels lies on `side`, crop to the
    bounding box of the largest connected foreground region, and resize.

    Args:
        paths: Path of a single DICOM file (passed to ``pydicom.dcmread``).
        side: 'left' or 'right' -- the half on which the foreground should
            end up after the optional horizontal flip.
        size: Output width in pixels; the output height is ``int(size * 1.8)``.
        threshold: Scaled intensity above which a pixel counts as foreground.
            Used for both the side detection and the crop mask.

    Returns:
        2-D float64 array of shape ``(int(size * 1.8), size)``.
    """
    dicom_data = pydicom.dcmread(paths)

    # Convert to float64 first so subtracting the minimum cannot wrap around
    # for integer pixel types.
    data = np.asarray(dicom_data.pixel_array, dtype=np.float64)
    data = data - data.min()
    peak = data.max()
    if peak > 0:  # a constant image would otherwise divide by zero
        data = data / peak
    if dicom_data.PhotometricInterpretation == "MONOCHROME1":
        data = 1.0 - data
    image = data[5:-5, 5:-5]

    # Decide which half holds the breast by counting foreground pixels in the
    # left and right halves of the image.
    foreground = image > threshold
    half_width = image.shape[1] // 2
    if foreground[:, :half_width].sum() > foreground[:, half_width:].sum():
        image_side = 'left'
    else:
        image_side = 'right'

    if image_side != side:
        image = cv2.flip(image, 1)
        foreground = image > threshold

    # Rows of `stats` are (left, top, width, height, area); row 0 is the background.
    mask = np.ascontiguousarray(foreground, dtype=np.uint8)
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)

    if num_labels > 1:
        idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
        x1, y1, w, h = stats[idx][:4]
        image = image[y1: y1 + h, x1: x1 + w]
    # With no foreground component at all, the uncropped image is resized as-is.

    image = cv2.resize(image, (size, int(size*1.8)))
    return image


## Pytorch Model Tools

In [6]:
class EarlyStopping:
    """Stop training once the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss improves on (or ties) the best value seen so far, the
    model's ``state_dict`` is saved to `path` and the counter is reset;
    otherwise the counter is incremented. After `patience` consecutive
    non-improving calls ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: Print a message on every counter increment and checkpoint.
        path: File the best ``state_dict`` is written to.
    """
    def __init__(self, patience=3, verbose=False, path='checkpoint_model.pth'):
        self.patience = patience  # configured stop threshold
        self.verbose = verbose  # whether to print progress messages
        self.counter = 0  # current count of non-improving calls
        self.best_score = None  # best score so far (negated loss)
        self.early_stop = False  # set to True once patience is exhausted
        self.val_loss_min = np.inf  # loss at the last checkpoint (np.Inf was removed in NumPy 2.0)
        self.path = path  # where the best model is stored
    
    def __call__(self, val_loss, model):
        """Record one validation loss; checkpoint `model` or advance the counter."""
        score = -val_loss
        
        if self.best_score is None:
            self.best_score = score
            self.checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.checkpoint(val_loss, model)
            self.counter = 0
    
    def checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember `val_loss`."""
        if self.verbose:
            print(f'validation loss decreased({self.val_loss_min:.6f} ---> {val_loss:.6f}). saving model....')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class TrainDataset(Dataset):
    """Dataset yielding one preprocessed image and its cancer label per row.

    Args:
        df: DataFrame with a ``filename`` column (path handed to
            ``transform_image``) and a ``cancer`` column (label).
    """
    def __init__(self, df):
        super().__init__()
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        """Return ``{'image': float32 tensor, 'target': 0-dim float32 tensor}``."""
        # Positional scalar lookups: the previous version converted the whole
        # column to a list (and built a full label tensor) for every sample.
        path = self.df['filename'].iloc[index]
        label = self.df['cancer'].iloc[index]

        image = torch.as_tensor(transform_image(path), dtype=torch.float32)
        target = torch.tensor(float(label), dtype=torch.float32)
        return {'image': image, 'target': target}

## Custom Pytorch Model

In [7]:
class Model_from_timm(nn.Module):
    """timm backbone followed by a small fully connected head with sigmoid output.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """
    def __init__(self, model_name:str, pretrained:bool=True):
        super().__init__()
        # The backbone is created with in_chans=1 and num_classes=0; its output
        # feeds a Linear layer expecting `num_features` inputs.
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            in_chans=1,
            num_classes=0,
        )
        self.in_features = self.backbone.num_features

        # Layer order is kept exactly so state_dict keys stay head.0.* / head.3.*.
        head_layers = [
            nn.Linear(self.in_features, 100),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(100, 1),
            nn.Sigmoid(),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the backbone, then the head, and return the head's sigmoid output."""
        features = self.backbone(x)
        return self.head(features)

## other functions

In [8]:
def balance_df(df, seed=None):
    """Undersample the negative class to the size of the positive class.

    Args:
        df: DataFrame with a binary ``cancer`` column.
        seed: Random state for the negative sample. Defaults to the
            notebook-level ``config.seed`` when omitted.

    Returns:
        (new_df, zero_other): ``new_df`` holds every positive row followed by
        the sampled negatives; ``zero_other`` holds the negatives that were
        not sampled, in their original order. Both are re-indexed from 0 and
        keep the previous index in an ``index`` column.
    """
    if seed is None:
        seed = config.seed

    positives = df[df['cancer'] == 1]
    negatives = df[df['cancer'] == 0]

    # Never request more negatives than exist; DataFrame.sample would raise.
    n_keep = min(len(positives), len(negatives))
    sampled_negatives = negatives.sample(n_keep, random_state=seed)

    # drop() preserves row order, unlike iterating over a set difference.
    remaining_negatives = negatives.drop(sampled_negatives.index)

    new_df = pd.concat([positives, sampled_negatives]).reset_index()
    zero_other = remaining_negatives.reset_index()
    return new_df, zero_other

def convert(lists):
    """Flatten a sequence of per-batch 2-D arrays/tensors into one Python list.

    Every element of `lists` must offer ``.tolist()`` returning a list of
    rows; the first entry of each row is collected, in order.
    """
    return [row[0] for batch in lists for row in batch.tolist()]

def pfbeta(labels, predictions, beta = 1):
    """Probabilistic F-beta score.

    Predictions are clipped to [0, 1] and used as soft counts: predictions on
    positive labels accumulate into the true-positive mass, predictions on
    negative labels into the false-positive mass.

    Args:
        labels: Iterable of truthy/falsy ground-truth labels.
        predictions: Iterable of predicted probabilities, aligned with `labels`.
        beta: Weight of recall relative to precision.

    Returns:
        The score as a float. Returns 0.0 when it is undefined or zero (no
        positive labels, no predicted mass, or zero precision/recall).
    """
    y_true_count = 0
    ctp = 0.0
    cfp = 0.0

    # Iterate by position so a pandas Series with a non-default index works too.
    for label, raw_prediction in zip(labels, predictions):
        prediction = min(max(raw_prediction, 0), 1)
        if label:
            y_true_count += 1
            ctp += prediction
        else:
            cfp += prediction

    # Guard both denominators; previously these cases raised ZeroDivisionError.
    if y_true_count == 0 or (ctp + cfp) == 0:
        return 0.0

    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp)
    c_recall = ctp / y_true_count
    if c_precision > 0 and c_recall > 0:
        return (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
    # Previously fell through and returned None.
    return 0.0

## Data

In [9]:
def _dicom_path(row):
    """Build the DICOM path of one train.csv row from its patient and image ids."""
    return f'/kaggle/input/rsna-breast-cancer-detection/train_images/{row.patient_id}/{row.image_id}.dcm'


train = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')
train['filename'] = train.apply(_dicom_path, axis=1)

# Balance the classes, then split 80/10/10 into train / val / test, stratified on the label.
# NOTE(review): rows are split per image, so images of one patient_id can land
# in different splits -- confirm whether a patient-grouped split is wanted.
new_df, zero_other = balance_df(train.copy())
train, holdout = train_test_split(
    new_df, test_size=0.2, stratify=new_df.cancer, random_state=config.seed
)
val, test = train_test_split(
    holdout, test_size=0.5, stratify=holdout.cancer, random_state=config.seed
)
# val = pd.concat([val, zero_other.loc[0:100]])
# test = pd.concat([test, zero_other.loc[100:200]])

dataset = TrainDataset(train)
dataset_val = TrainDataset(val)
dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True, drop_last=False)
dataloader_val = DataLoader(dataset_val, batch_size=config.batch_size, drop_last=False)

In [10]:
# Report (rows, columns) of the train, validation and test splits, in that order.
for split_frame in (train, val, test):
    print(split_frame.shape)

(1852, 16)
(232, 16)
(232, 16)


## Train

In [11]:
def _prepare_batch(batch, device):
    """Return (inputs, labels) of a batch dict with a size-1 axis added at dim 1, on `device`."""
    inputs = batch['image'].unsqueeze(dim=1).to(device)
    labels = batch['target'].unsqueeze(dim=1).to(device)
    return inputs, labels


def train_net(n_epochs, train_loader,val_loader, net, optimizer, loss_fn, model_name, device='cpu'):
    """Train `net`, validating after every epoch, with early stopping.

    EarlyStopping saves the state_dict with the lowest validation loss to
    ``{model_name}_{yymmdd}.pth`` (relative path, dated at call time) and
    ends training once the validation loss stops improving.

    Args:
        n_epochs: Maximum number of epochs.
        train_loader: Iterable of dicts with ``'image'`` and ``'target'`` tensors.
        val_loader: Same structure as `train_loader`, used for validation.
        net: Model to train; moved to `device`.
        optimizer: Optimizer already bound to ``net.parameters()``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        model_name: Used for logging and for the checkpoint file name.
        device: Device the model and batches are moved to.

    Returns:
        (losses, losses_val): per-epoch mean training and validation losses.

    Raises:
        ValueError: If `train_loader` or `val_loader` yields no batches
            (previously this surfaced as a NameError on the loop index).
    """
    save_path = f'{model_name}_{datetime.datetime.now().strftime("%y%m%d")}.pth'
    earlystopping = EarlyStopping(verbose=True, path=save_path)
    print(f'device=={device}')
    print(f'model_name=={model_name}')
    losses = []
    losses_val = []
    net.to(device)
    torch.cuda.manual_seed(config.seed)
    
    for epoch in range(n_epochs):
        # ---- training pass ----
        running_loss = 0
        n_train_batches = 0
        net.train()
        print(f'epoch {epoch} start')
        for index, data in enumerate(train_loader):
            inputs, labels = _prepare_batch(data, device)
            
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_fn(outputs, labels)
            print(f'index: {index}, loss: {loss}')
            print(f'outputs: {outputs.tolist()[:3]}, labels: {labels.tolist()[:3]}')
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_train_batches += 1
            
            del inputs, labels
            torch.cuda.empty_cache()
        
        if n_train_batches == 0:
            raise ValueError('train_loader yielded no batches')
        tr_loss = running_loss / n_train_batches
        losses.append(tr_loss)
        
        # ---- validation pass (no gradients) ----
        val_loss = 0
        n_val_batches = 0
        net.eval()
        for index, data in enumerate(val_loader):
            with torch.no_grad():
                inputs, labels = _prepare_batch(data, device)
                output = net(inputs)
                loss = loss_fn(output, labels)
                val_loss += loss.item()
            n_val_batches += 1
            
            print(f'val:{index}: loss:{loss.item()}')
            del inputs, labels
            torch.cuda.empty_cache()
        
        if n_val_batches == 0:
            raise ValueError('val_loader yielded no batches')
        v_loss = val_loss / n_val_batches
        losses_val.append(v_loss)
        
        print(f'epoch, {epoch}')
        print(f'train_loss, {tr_loss}')
        print(f'valid_loss, {v_loss}')
        
        # Checkpoints on improvement; sets early_stop after `patience` bad epochs.
        earlystopping(v_loss, net)
        if earlystopping.early_stop:
            print('Early Stop!!!!!')
            break
    return losses, losses_val

In [12]:
# Build the network: fresh pretrained weights, or resume from a saved checkpoint.
print(config.train_new)
if config.train_new:
    net = Model_from_timm(config.model_name)
else:
    net = Model_from_timm(config.model_name, pretrained=False)
    # map_location='cpu' lets a checkpoint saved from a GPU session load on a
    # CPU-only session; train_net moves the model to `device` afterwards.
    net.load_state_dict(torch.load(config.model_path, map_location='cpu'))

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# BCELoss expects probabilities; the model head ends in a Sigmoid.
criterion = nn.BCELoss()
optimizer = optim.AdamW(net.parameters())
# optimizer = optim.SGD(net.parameters(), lr=0.0001, momentum=0.9, weight_decay=0.005)

True


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth


In [13]:
# Run training; returns the per-epoch mean train and validation losses.
losses_tr, losses_val = train_net(
    n_epochs=config.epochs,
    train_loader=dataloader,
    val_loader=dataloader_val,
    net=net,
    optimizer=optimizer,
    loss_fn=criterion,
    model_name=config.model_name,
    device=device,
)

device==cuda
model_name==efficientnet_b0
epoch 0 start
index: 0, loss: 0.6837092638015747
outputs: [[0.5114609003067017], [0.4742077887058258], [0.49877336621284485]], labels: [[0.0], [1.0], [0.0]]
index: 1, loss: 0.6689447164535522
outputs: [[0.6048725247383118], [0.6234895586967468], [0.6030918955802917]], labels: [[0.0], [1.0], [1.0]]
index: 2, loss: 0.6902515888214111
outputs: [[0.5285473465919495], [0.8020411133766174], [0.7785217761993408]], labels: [[0.0], [0.0], [1.0]]
index: 3, loss: 1.0333466529846191
outputs: [[0.5937134623527527], [0.7979913353919983], [0.7067204713821411]], labels: [[0.0], [0.0], [1.0]]
index: 4, loss: 0.6082509756088257
outputs: [[0.5587603449821472], [0.6198078393936157], [0.5559120178222656]], labels: [[0.0], [1.0], [1.0]]
index: 5, loss: 0.6674495935440063
outputs: [[0.49771618843078613], [0.5027093291282654], [0.7159760594367981]], labels: [[0.0], [0.0], [0.0]]
index: 6, loss: 0.6905795335769653
outputs: [[0.4498511254787445], [0.6675860285758972], [0

## Test

In [15]:
# NOTE(review): the checkpoint name embeds the date on which training ran
# (train_net writes '{model_name}_{yymmdd}.pth'); update it after retraining.
TEST_MODEL_PATH = '/kaggle/working/efficientnet_b0_230223.pth'

test_dataset = TrainDataset(test)
test_dataloader = DataLoader(test_dataset, batch_size=32)
model = Model_from_timm(config.model_name, pretrained=False)
# map_location='cpu' lets a checkpoint saved from a GPU session load on a
# CPU-only session; test_net moves the model to the target device afterwards.
model.load_state_dict(torch.load(TEST_MODEL_PATH, map_location='cpu'))

def test_net(dataloader, model, device, loss_fn):
    model = model.eval()
    preds_lis = []
    labels_lis = []
    test_loss = []
    model.to(device)
    for i, data in enumerate(dataloader):
        loss = 0
        with torch.no_grad():
            inputs, labels = data['image'], data['target']
            inputs = inputs.to(device)
            labels = labels.to(device)
            inputs = inputs.unsqueeze(dim=1)
            labels = labels.unsqueeze(dim=1)
#             inputs.to(device)
            
            output = model(inputs)
            loss_ = loss_fn(output, labels)
            loss += loss_
            preds_lis.append(output)
            labels_lis.append(labels)
        test_loss.append(loss/(i+1))
        print(loss/(i+1))
    return preds_lis, labels_lis, test_loss

# Evaluate on the held-out split, flatten the per-batch tensors and report the probabilistic F1.
pred_batches, label_batches, test_loss = test_net(test_dataloader, model, device, criterion)

preds = convert(pred_batches)
labels = convert(label_batches)

pf1_score = pfbeta(labels, preds)
print(f'pf1:{pf1_score}')

tensor(0.6944, device='cuda:0')
tensor(0.3176, device='cuda:0')
tensor(0.2170, device='cuda:0')
tensor(0.1702, device='cuda:0')
tensor(0.1432, device='cuda:0')
tensor(0.1137, device='cuda:0')
tensor(0.0897, device='cuda:0')
tensor(0.0763, device='cuda:0')
pf1:0.5195249059616933


In [16]:
# Table of test-set predictions next to their ground-truth labels.
df = pd.DataFrame({'pred': preds, 'label': labels})
df

Unnamed: 0,pred,label
0,0.638842,0.0
1,0.471904,1.0
2,0.661989,0.0
3,0.635484,1.0
4,0.454187,0.0
...,...,...
227,0.631643,1.0
228,0.191031,0.0
229,0.255945,0.0
230,0.473078,0.0


In [36]:
# Binarize the predicted probabilities at 0.3 (1 when strictly above, else 0).
df['new_pred'] = (df['pred'] > 0.3).astype('int64')

In [38]:
# Probabilistic F1 of the raw probabilities in df['pred'].
# NOTE(review): the thresholded df['new_pred'] column created in the previous
# cell is not used here -- confirm which column was meant to be scored.
pfbeta(df['label'], df['pred'])


0.5000541935629237