# Submission notebook
Competition: RSNA Screening Mammography Breast Cancer Detection  
URL: https://www.kaggle.com/competitions/rsna-breast-cancer-detection

## Imports

In [1]:
# Offline install: --no-index disables package-index lookups, so pip resolves
# the packages only from the local wheel directories given via --find-links.
!python -m pip install --no-index --find-links=/kaggle/input/dicom-whls pydicom pylibjpeg
!python -m pip install --no-index --find-links=/kaggle/input/rsna-datasets/ENV python_gdcm

Looking in links: /kaggle/input/dicom-whls
Processing /kaggle/input/dicom-whls/pylibjpeg-1.4.0-py3-none-any.whl
Installing collected packages: pylibjpeg
Successfully installed pylibjpeg-1.4.0
[0mLooking in links: /kaggle/input/rsna-datasets/ENV
Processing /kaggle/input/rsna-datasets/ENV/python_gdcm-3.0.21-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: python_gdcm
Successfully installed python_gdcm-3.0.21
[0m

In [2]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import cv2
import PIL
import pydicom
import gdcm
import pylibjpeg
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tqdm
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
def transform_image(paths, side='left', size=512, threshold=0.05):
    """Load one DICOM image and return a normalised, cropped, square array.

    Processing steps:
      1. read the pixel data and min-max scale it to [0, 1];
      2. invert MONOCHROME1 images so every image has the same polarity;
      3. trim a 5-pixel border;
      4. mirror the image when its brighter half is not on ``side``;
      5. crop to the bounding box of the largest connected foreground region;
      6. resize to ``size`` x ``size``.

    Args:
        paths: path of the DICOM file to read.
        side: 'left' or 'right' - the half of the image the foreground
            should end up on.
        size: edge length in pixels of the square output.
        threshold: intensity (on the [0, 1] scale) above which a pixel counts
            as foreground, both for side detection and for cropping.

    Returns:
        2-D float array of shape ``(size, size)``.
    """
    dicom_data = pydicom.dcmread(paths)
    # Work in float from the start so subtracting the minimum cannot wrap
    # around in a narrow integer dtype.
    data = np.asarray(dicom_data.pixel_array, dtype=np.float64)
    data = data - data.min()
    peak = data.max()
    if peak > 0:
        # A constant image has peak == 0; leave it at zeros instead of
        # dividing 0 by 0 and filling the array with NaN.
        data = data / peak
    if dicom_data.PhotometricInterpretation == "MONOCHROME1":
        data = 1.0 - data
    image = data[5:-5, 5:-5]

    # Decide which half of the image holds more foreground pixels.
    foreground = image > threshold
    half_width = image.shape[1] // 2
    if foreground[:, :half_width].sum() > foreground[:, half_width:].sum():
        image_side = 'left'
    else:
        image_side = 'right'

    if image_side != side:
        image = cv2.flip(image, 1)

    # Label connected foreground regions (8-connectivity) on the possibly
    # flipped image, using the same threshold as the side detection.
    mask = (image > threshold).astype(np.uint8)
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)
    # stats columns: left, top, width, height, area_size; row 0 is the background.
    if num_labels > 1:
        idx = stats[1:, 4].argmax() + 1
        x1, y1, w, h = stats[idx][:4]
        image = image[y1: y1 + h, x1: x1 + w]
    # With no foreground component there is nothing to crop to, so the whole
    # image is resized instead of failing on an empty argmax.

    image = cv2.resize(image, (size, size))
    return image

def submit_df(sample_df, value_df):
    """Fill the ``cancer`` column of ``sample_df`` from per-id predictions.

    Args:
        sample_df: DataFrame with a ``prediction_id`` column. It is modified
            in place (its ``cancer`` column is overwritten or created).
        value_df: mapping from prediction id to predicted value, e.g. a
            pandas Series indexed by prediction id.

    Returns:
        The same ``sample_df`` object, with ``cancer`` filled in.

    Raises:
        KeyError: if a prediction id in ``sample_df`` is absent from
            ``value_df``.
    """
    ids = sample_df['prediction_id']
    missing = [pid for pid in ids.unique() if pid not in value_df]
    if missing:
        raise KeyError(f"no prediction available for prediction_id(s): {missing}")
    # Assign by position so the result does not depend on sample_df having a
    # default 0..n-1 index (label lookups with a positional counter would
    # raise or append new rows on any other index).
    sample_df['cancer'] = [value_df[pid] for pid in ids]
    return sample_df

In [4]:
class Model_from_timm(nn.Module):
    """Single-channel timm backbone followed by a small sigmoid head.

    The backbone is created with ``in_chans=1`` and ``num_classes=0``; its
    ``num_features`` value sets the input width of the head. The head maps
    those features through a 100-unit hidden layer to one sigmoid output.
    """

    def __init__(self, model_name:str, pretrained:bool=False):
        """
        Args:
            model_name: architecture name passed to ``timm.create_model``.
            pretrained: forwarded to ``timm.create_model``.
        """
        super().__init__()
        feature_extractor = timm.create_model(
            model_name,
            pretrained=pretrained,
            in_chans=1,
            num_classes=0,
        )
        self.backbone = feature_extractor
        self.in_features = feature_extractor.num_features

        # Layer order is kept fixed so saved state_dict keys (head.0, head.3)
        # continue to match.
        head_layers = [
            nn.Linear(self.in_features, 100),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(100, 1),
            nn.Sigmoid(),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the backbone, then the head, and return the head's output."""
        features = self.backbone(x)
        return self.head(features)


class SubmitDataset(Dataset):
    """Dataset yielding one preprocessed image and its prediction id per row.

    Args:
        df: DataFrame with ``filename`` and ``prediction_id`` columns.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df
        # Materialise both columns once. Calling ``.tolist()`` inside
        # __getitem__ would rebuild the full list for every sample.
        # Note: these snapshots do not follow later changes to ``self.df``.
        self._filenames = df.filename.tolist()
        self._prediction_ids = df.prediction_id.tolist()

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': float32 tensor, 'prediction_id': id}`` for a row."""
        image = transform_image(self._filenames[index])
        image = torch.as_tensor(image, dtype=torch.float32)
        return {'image': image, 'prediction_id': self._prediction_ids[index]}

In [5]:
# Root folder of the competition data; every path below is built from it.
COMPETITION_DIR = '/kaggle/input/rsna-breast-cancer-detection'

test = pd.read_csv(COMPETITION_DIR + '/test.csv')
# Build the DICOM path of every test image with vectorised string
# concatenation (no per-row Python lambda):
#   <COMPETITION_DIR>/test_images/<patient_id>/<image_id>.dcm
test['filename'] = (
    COMPETITION_DIR + '/test_images/'
    + test['patient_id'].astype(str) + '/'
    + test['image_id'].astype(str) + '.dcm'
)
sample = pd.read_csv(COMPETITION_DIR + '/sample_submission.csv')

In [6]:
def sub_net(dataloader, model, device):
    """Run the model over every batch of ``dataloader`` without gradients.

    Args:
        dataloader: iterable of batches; each batch is a mapping with an
            ``'image'`` tensor and a ``'prediction_id'`` entry. A channel
            axis is inserted at dim 1 of the image tensor before the
            forward pass.
        model: ``nn.Module`` to evaluate; it is moved to ``device`` and put
            in eval mode.
        device: device on which the forward passes run.

    Returns:
        ``(preds_lis, id_lis)``: two lists with one entry per batch - the
        model output (moved to the CPU) and the batch's ``'prediction_id'``
        entry, in iteration order.
    """
    model = model.to(device).eval()
    preds_lis = []
    id_lis = []
    with torch.no_grad():
        for data in dataloader:
            inputs = data['image'].to(device).unsqueeze(dim=1)
            output = model(inputs)
            # Keep results on the CPU so device memory is not held by the
            # accumulated predictions for the whole loop.
            preds_lis.append(output.cpu())
            id_lis.append(data['prediction_id'])
    return preds_lis, id_lis

def convert(lists):
    """Flatten a list of per-batch results into one flat Python list.

    Each batch may be an object with ``.tolist()`` (e.g. a tensor) or a plain
    sequence. For every row of a batch, the first element is taken when the
    row is itself a list/tuple (e.g. a ``(batch, 1)`` tensor); otherwise the
    row is kept as is (e.g. a string id or a scalar).

    Args:
        lists: sequence of batches.

    Returns:
        Flat list with one entry per row, in batch order.
    """
    flat = []
    for batch in lists:
        rows = batch.tolist() if hasattr(batch, 'tolist') else list(batch)
        if not isinstance(rows, list):
            # A 0-d tensor/array converts to a bare scalar.
            rows = [rows]
        for row in rows:
            flat.append(row[0] if isinstance(row, (list, tuple)) else row)
    return flat


In [7]:
MODEL_NAME = 'vgg11'
MODEL_W_PATH = '/kaggle/input/rsna-datasets/vgg11_230214.pth'
BATCH_SIZE = 16

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Model_from_timm(MODEL_NAME)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_W_PATH, map_location=device))
submit_dataset = SubmitDataset(test)
submit_dataloader = DataLoader(submit_dataset, batch_size=BATCH_SIZE)
preds_lis, id_lis = sub_net(dataloader=submit_dataloader, model=model, device=device)
preds_lis = convert(preds_lis)
# Flatten the per-batch ids of EVERY batch. Keeping only the first batch would
# leave fewer ids than predictions as soon as there is more than one batch.
id_lis = [
    pred_id
    for batch_ids in id_lis
    for pred_id in (batch_ids.tolist() if torch.is_tensor(batch_ids) else batch_ids)
]
assert len(id_lis) == len(preds_lis), "each prediction needs exactly one prediction_id"
sub_df = pd.DataFrame(data={'prediction_id': id_lis, 'cancer': preds_lis})
# Several images share one prediction_id; average their scores per id.
mean_cancer_by_id = sub_df.groupby('prediction_id')['cancer'].mean()
submit = submit_df(sample, mean_cancer_by_id)

In [8]:
# Show the assembled submission table (prediction_id, cancer) for inspection.
display(submit)

Unnamed: 0,prediction_id,cancer
0,10008_L,0.503258
1,10008_R,0.502914


In [9]:
# Keep the original output name for anything that already reads it.
submit.to_csv('submit.csv', index=False)
# NOTE(review): Kaggle code competitions conventionally score a notebook output
# named submission.csv - confirm against the competition rules. The table is
# written under that name as well so the notebook output can be submitted.
submit.to_csv('submission.csv', index=False)