# TRAIN notebook
competition : RSNA Screening Mammography Breast Cancer Detection  
url : https://www.kaggle.com/competitions/rsna-breast-cancer-detection  

## import

In [3]:
!python -m pip install --no-index --find-links=/kaggle/input/dicom-whls pydicom pylibjpeg
!python -m pip install --no-index --find-links=/kaggle/input/rsna-datasets/ENV python_gdcm

import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import datetime
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import cv2
import PIL
import pydicom
import gdcm
import pylibjpeg
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Looking in links: /kaggle/input/dicom-whls
[0mLooking in links: /kaggle/input/rsna-datasets/ENV
[0m

In [None]:
# Display the model names timm returns when filtered with pretrained=True,
# as a reference when choosing MODEL_NAME.
timm.list_models(pretrained=True)

## Config

In [4]:
class Config:
    """Plain container for the experiment settings used across the notebook.

    Attributes:
        model_name: timm architecture name.
        pretrained: whether pretrained weights are requested.
        epochs: number of training epochs.
        batch_size: mini-batch size for the data loader.
        size: image side length in pixels.
        seed: random seed shared by sampling, splitting and training.
    """

    def __init__(
        self,
        model_name: str = 'resnet26d',
        pretrained: bool = True,
        epochs: int = 10,
        batch_size: int = 32,
        size: int = 512,
        seed: int = 0,
    ):
        # Architecture settings.
        self.model_name, self.pretrained = model_name, pretrained
        # Optimisation schedule.
        self.epochs, self.batch_size = epochs, batch_size
        # Input resolution and RNG seed.
        self.size, self.seed = size, seed


# Architecture used for this run; every other setting keeps its default.
MODEL_NAME = 'vgg11'
config = Config(model_name=MODEL_NAME)

## Image Processing

In [5]:
def transform_image(paths, side='left', size=512, threshold=0.05):
    """Load one DICOM image and return a normalised, cropped, square array.

    Processing steps:
      1. Read the pixel data and min-max scale it to the range [0, 1].
      2. Invert the intensities when the file is stored as MONOCHROME1.
      3. Trim a 5-pixel border on every edge.
      4. Mirror the image horizontally if most of the foreground is not in
         the half requested by ``side``.
      5. Crop to the bounding box of the largest connected foreground
         component and resize to ``size`` x ``size``.

    Args:
        paths: Path of a single DICOM file (passed to ``pydicom.dcmread``).
        side: 'left' or 'right'; the half of the image that should hold most
            of the foreground after processing.
        size: Side length, in pixels, of the returned square image.
        threshold: Intensity (on the [0, 1] scale) above which a pixel counts
            as foreground, both for side detection and for cropping.

    Returns:
        A 2-D float array of shape ``(size, size)``.
    """
    dicom_data = pydicom.dcmread(paths)
    # Work in float from the start so that subtracting the minimum cannot
    # overflow when the pixel data is stored as a signed integer type.
    data = np.asarray(dicom_data.pixel_array, dtype=np.float64)
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0; skip the division to avoid NaNs.
        data = data / peak
    if dicom_data.PhotometricInterpretation == "MONOCHROME1":
        data = 1.0 - data
    image = data[5:-5, 5:-5]

    # Count the foreground pixels in each half of the image (split along the
    # width) and compare the two halves to find where the bulk of it lies.
    foreground = image > threshold
    half_width = image.shape[1] // 2
    if foreground[:, :half_width].sum() > foreground[:, half_width:].sum():
        image_side = 'left'
    else:
        image_side = 'right'

    if image_side != side:
        image = cv2.flip(image, 1)

    # Recompute the mask on the (possibly mirrored) image and label it.
    mask = (image > threshold).astype(np.uint8)
    output = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)
    stats = output[2]  # per label: left, top, width, height, area_size

    # Row 0 of stats is skipped when choosing the component. With no other
    # component there is nothing to crop to, so the whole image is kept.
    if stats.shape[0] > 1:
        idx = stats[1:, 4].argmax() + 1
        x1, y1, w, h = stats[idx][:4]
        image = image[y1: y1 + h, x1: x1 + w]

    image = cv2.resize(image, (size, size))
    return image


## Pytorch Model Tools

In [6]:
class EarlyStopping:
    """Track a loss value across calls, checkpoint the best model and flag when to stop.

    Call the instance once per epoch with the monitored loss and the model.
    Whenever the loss is at least as good as the best one seen so far the
    model's ``state_dict`` is saved to ``path`` and the patience counter is
    reset; otherwise the counter is incremented and ``early_stop`` becomes
    True once it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: When True, print counter updates and checkpoint messages.
        path: File the best model's ``state_dict`` is written to.
    """

    def __init__(self, patience=3, verbose=False, path='checkpoint_model.pth'):
        self.patience = patience  # configured stop threshold for the counter
        self.verbose = verbose  # whether progress messages are printed
        self.counter = 0  # current number of consecutive non-improving calls
        self.best_score = None  # best score so far (negated loss)
        self.early_stop = False  # set to True when training should stop
        # Loss at the last checkpoint. np.inf is used because the np.Inf
        # alias no longer exists in NumPy 2.0.
        self.val_loss_min = np.inf
        self.path = path  # where the best model is stored

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch and checkpoint ``model`` if it improved."""
        # Negate so that a larger score means a better (smaller) loss.
        score = -val_loss

        if self.best_score is None:
            # First call: always checkpoint.
            self.best_score = score
            self.checkpoint(val_loss, model)
        elif score < self.best_score:
            # Strictly worse than the best: count towards the patience limit.
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Equal or better: save and reset the counter.
            self.best_score = score
            self.checkpoint(val_loss, model)
            self.counter = 0

    def checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'validation loss decreased({self.val_loss_min:.6f} ---> {val_loss:.6f}). saving model....')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class TrainDataset(Dataset):
    """Dataset that yields a processed image and its label for each row of ``df``.

    Args:
        df: DataFrame with a ``filename`` column (path handed to
            ``transform_image``) and a ``cancer`` column (numeric label).
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': float tensor, 'target': 0-dim float tensor}`` for row ``index``.

        ``index`` is positional (0-based), independent of the DataFrame's own
        index labels.
        """
        # Positional lookups read only the requested row instead of
        # converting the whole column on every access.
        path = self.df['filename'].iloc[index]
        label = self.df['cancer'].iloc[index]

        image = torch.Tensor(transform_image(path))
        target = torch.tensor(float(label), dtype=torch.float32)
        return {'image': image, 'target': target}

## Custom Pytorch Model

In [7]:
class Model_from_timm(nn.Module):
    """Single-output classifier: a timm backbone followed by a small MLP head.

    The backbone is created with ``in_chans=1`` and ``num_classes=0``; its
    ``num_features`` value sets the input width of the head. The head ends
    with a Sigmoid, so ``forward`` returns values in (0, 1).

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name: str, pretrained: bool = True):
        super().__init__()
        backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            in_chans=1,
            num_classes=0,
        )
        self.backbone = backbone
        self.in_features = backbone.num_features

        # Linear -> ReLU -> Dropout -> Linear -> Sigmoid, one output unit.
        hidden_units = 100
        head_layers = [
            nn.Linear(self.in_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the backbone on ``x`` and pass its features through the head."""
        return self.head(self.backbone(x))

## other functions

In [8]:
def balance_df(df):
    """Return a class-balanced copy of ``df`` by downsampling the larger class.

    Rows with ``cancer == 1`` and ``cancer == 0`` are separated and the more
    frequent class is randomly sampled (seeded with ``config.seed``) down to
    the size of the other one. The result lists the positive rows first,
    then the negative rows.

    Args:
        df: DataFrame with a ``cancer`` column holding 0/1 labels.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The previous index is kept
        as a column named ``index`` (``reset_index`` is called without
        ``drop``).
    """
    positives = df[df['cancer'] == 1]
    negatives = df[df['cancer'] == 0]

    if len(positives) <= len(negatives):
        # Usual case: shrink the negative class to match the positives.
        negatives = negatives.sample(len(positives), random_state=config.seed)
    else:
        # More positives than negatives: sampling the negatives would fail,
        # so the positives are shrunk instead.
        positives = positives.sample(len(negatives), random_state=config.seed)

    balanced = pd.concat([positives, negatives])
    return balanced.reset_index()

def convert(lists):
    """Flatten a sequence of 2-D batches into one flat Python list.

    Every element of ``lists`` must offer ``tolist()`` returning a list of
    rows; the first entry of each row is collected, in order.
    """
    return [row[0] for batch in lists for row in batch.tolist()]

def pfbeta(labels, predictions, beta = 1):
    """Compute the probabilistic F-beta score.

    Each prediction is clipped to [0, 1] and added to the true-positive mass
    when its label is truthy, or to the false-positive mass otherwise.
    Precision and recall are derived from those masses and combined into
    F-beta.

    Args:
        labels: Sequence of ground-truth labels (truthy means positive).
        predictions: Sequence of predicted probabilities, indexable in
            parallel with ``labels``.
        beta: Weight of recall relative to precision.

    Returns:
        The score as a number. 0.0 is returned when the score is undefined or
        zero (no positive labels, no predicted mass, or zero precision/recall).
    """
    y_true_count = 0
    ctp = 0
    cfp = 0

    for idx, label in enumerate(labels):
        prediction = min(max(predictions[idx], 0), 1)
        if label:
            y_true_count += 1
            ctp += prediction
        else:
            cfp += prediction

    # Without positives (recall undefined) or without any predicted mass
    # (precision undefined) the score is reported as 0 instead of dividing
    # by zero.
    if y_true_count == 0 or (ctp + cfp) == 0:
        return 0.0

    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp)
    c_recall = ctp / y_true_count
    if c_precision > 0 and c_recall > 0:
        return (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
    # Zero precision or recall: return a number, never None.
    return 0.0

## Data

In [9]:
# Load the competition metadata and derive the DICOM path of every image
# from its patient_id and image_id.
train = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')
train['filename'] = (
    '/kaggle/input/rsna-breast-cancer-detection/train_images/'
    + train['patient_id'].astype(str)
    + '/'
    + train['image_id'].astype(str)
    + '.dcm'
)

# Balance the classes, then hold out 20% of the rows for evaluation.
# From here on `train` is the training split, not the full table.
# NOTE(review): the split is made per image row, so images of the same
# patient can end up on both sides - confirm this is acceptable.
new_df = balance_df(train.copy())
train, test = train_test_split(
    new_df,
    test_size=0.2,
    stratify=new_df.cancer,
    random_state=config.seed,
)

dataset = TrainDataset(train)
dataloader = DataLoader(
    dataset,
    batch_size=config.batch_size,
    shuffle=True,
    drop_last=False,
)

## Train

In [10]:
def train_net(n_epochs, train_loader, net, optimizer, loss_fn, model_name, device='cpu'):
    """Train ``net`` and return the mean training loss of every epoch.

    After each epoch the mean loss is handed to an ``EarlyStopping`` instance,
    which saves the best weights to ``<model_name>_<yymmdd>.pth`` and may end
    training early.

    NOTE(review): the value monitored for early stopping is the training
    loss; no validation data is passed to this function.

    Args:
        n_epochs: Maximum number of epochs.
        train_loader: Iterable of batches; each batch is a mapping with an
            ``'image'`` tensor and a ``'target'`` tensor. A dimension of size
            1 is inserted at position 1 of both before they are used.
        net: Model to train; moved to ``device``.
        optimizer: Optimizer already bound to ``net``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        model_name: Used in log output and in the checkpoint file name.
        device: Device the model and the batches are moved to.

    Returns:
        List with one mean loss per completed epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    save_path = f'{model_name}_{datetime.datetime.now().strftime("%y%m%d")}.pth'
    earlystopping = EarlyStopping(verbose=True, path=save_path)
    print(f'device=={device}')
    print(f'model_name=={model_name}')
    losses = []
    net.to(device)
    # torch.manual_seed seeds the CPU generator as well as the CUDA ones, so
    # runs are repeatable on either device.
    torch.manual_seed(config.seed)

    for epoch in range(n_epochs):
        running_loss = 0.0
        n_batches = 0
        net.train()
        print(f'epoch {epoch} start')
        for index, data in enumerate(train_loader):
            inputs = data['image'].to(device).unsqueeze(dim=1)
            labels = data['target'].to(device).unsqueeze(dim=1)

            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_fn(outputs, labels)
            print(f'index: {index}, loss: {loss}')
            print(f'outputs: {outputs.tolist()[:3]}, labels: {labels.tolist()[:3]}')
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            raise ValueError('train_loader yielded no batches')

        # Average over the number of batches. The last batch index is one
        # less than that, and is zero when there is a single batch.
        epoch_loss = running_loss / n_batches
        losses.append(epoch_loss)
        print(f'epoch, {epoch} : {epoch_loss}')

        earlystopping(epoch_loss, net)
        if earlystopping.early_stop:
            print('Early Stop!!!!!')
            break
    return losses

In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, loss and optimizer for the training run.
net = Model_from_timm(config.model_name)
criterion = nn.BCELoss()  # the model's head already ends with a Sigmoid
optimizer = optim.AdamW(net.parameters())  # library-default hyper-parameters
# Alternative optimizer kept for reference:
# optimizer = optim.SGD(net.parameters(), lr=0.0001, momentum=0.9, weight_decay=0.005)

In [15]:
# Run the training loop; the per-epoch losses it returns are kept in `losses`.
losses = train_net(
    n_epochs=config.epochs,
    train_loader=dataloader,
    net=net,
    optimizer=optimizer,
    loss_fn=criterion,
    model_name=config.model_name,
    device=device,
)

device==cuda
model_name==vgg11
epoch 0 start
index: 0, loss: 0.7068426609039307
outputs: [[0.5438529253005981], [0.5222128033638], [0.49238890409469604]], labels: [[1.0], [1.0], [0.0]]
index: 1, loss: 3.3973629474639893
outputs: [[0.002164656762033701], [0.006180374417454004], [0.014860199764370918]], labels: [[1.0], [0.0], [0.0]]
index: 2, loss: 0.6974515914916992
outputs: [[0.5021376609802246], [0.5225943922996521], [0.5185756683349609]], labels: [[0.0], [0.0], [0.0]]
index: 3, loss: 0.763258695602417
outputs: [[0.3309766352176666], [0.2525508403778076], [0.4333162009716034]], labels: [[1.0], [1.0], [1.0]]
index: 4, loss: 0.7074992656707764
outputs: [[0.5832001566886902], [0.5465620160102844], [0.5402883291244507]], labels: [[0.0], [1.0], [1.0]]
index: 5, loss: 0.7048647403717041
outputs: [[0.48436132073402405], [0.4937305152416229], [0.45168793201446533]], labels: [[0.0], [1.0], [1.0]]
index: 6, loss: 0.7059527635574341
outputs: [[0.4917164444923401], [0.4981887638568878], [0.487502

KeyboardInterrupt: 

## Test

In [13]:
# Checkpoint file to evaluate.
TEST_MODEL_PATH = '/kaggle/working/vgg11_230214.pth'

# The held-out split goes through the same dataset class as training.
test_dataset = TrainDataset(test)
test_dataloader = DataLoader(test_dataset, batch_size=32)

# Rebuild the architecture without requesting pretrained weights, then
# restore the saved ones. map_location='cpu' lets a checkpoint saved during
# a GPU run load on a CPU-only session; test_net moves the model to the
# evaluation device afterwards.
model = Model_from_timm(config.model_name, pretrained=False)
model.load_state_dict(torch.load(TEST_MODEL_PATH, map_location='cpu'))

def test_net(dataloader, model, device):
    model = model.eval()
    preds_lis = []
    labels_lis = []
    model.to(device)
    for i, data in enumerate(dataloader):
        with torch.no_grad():
            inputs, labels = data['image'], data['target']
            inputs = inputs.to(device)
            labels = labels.to(device)
            inputs = inputs.unsqueeze(dim=1)
            labels = labels.unsqueeze(dim=1)
            inputs.to(device)
            
            output = model(inputs)
            preds_lis.append(output)
            labels_lis.append(labels)
    return preds_lis, labels_lis

# Run inference on the held-out split, flatten the per-batch tensors into
# plain Python lists and report the probabilistic F1 score.
preds, labels = test_net(test_dataloader, model, device)
preds, labels = convert(preds), convert(labels)

print(f'pf1:{pfbeta(labels, preds)}')

pf1:0.5014033879219387
