In [None]:
import os
import sys 
import json
import glob
import random
import collections
import time

import numpy as np
import pandas as pd
import pydicom
import cv2
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils import data as torch_data
from sklearn import model_selection as sk_model_selection
from torch.nn import functional as torch_functional
import torch.nn.functional as F

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Volume geometry shared by the 3D loaders below.
SIZE = 256
NUM_IMAGES = 64
mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

# Choose dataset / library locations: the Kaggle layout when its input folder
# exists, otherwise the paths of the original author's local machine.
_kaggle_data_directory = "../input/rsna-miccai-brain-tumor-radiogenomic-classification"
_running_on_kaggle = os.path.exists(_kaggle_data_directory)

data_directory = (
    _kaggle_data_directory
    if _running_on_kaggle
    else '/media/roland/data/kaggle/rsna-miccai-brain-tumor-radiogenomic-classification'
)
pytorch3dpath = (
    "../input/efficientnetpyttorch3d/EfficientNet-PyTorch-3D"
    if _running_on_kaggle
    else "EfficientNet-PyTorch-3D"
)

# EfficientNet3D is imported from a source folder, so that folder has to be on
# sys.path before the import below can succeed.
sys.path.append(pytorch3dpath)
from efficientnet_pytorch_3d import EfficientNet3D

In [None]:
def load_dicom_image(path, img_size=SIZE):
    """Read one DICOM slice and return it as a min-max normalised square image.

    Args:
        path: Path of the ``.dcm`` file to read.
        img_size: Edge length, in pixels, of the returned square image.

    Returns:
        A 2-D array of shape ``(img_size, img_size)``. A slice whose pixels all
        share one value is returned as all zeros; any other slice is shifted
        and scaled to the ``[0, 1]`` range and then resized with OpenCV.
    """
    # `pydicom.read_file` is a deprecated alias of `dcmread` and may be missing
    # from newer pydicom releases; `dcmread` is the supported reader.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array

    lowest = np.min(data)
    if lowest == np.max(data):
        # A flat slice carries no contrast and would divide by zero below.
        return np.zeros((img_size, img_size))

    # The slice is not constant here, so the shifted maximum cannot be zero and
    # the division needs no further guard.
    data = data - lowest
    data = data / np.max(data)

    return cv2.resize(data, (img_size, img_size))

def load_dicom_images_3d(scan_id, num_imgs=NUM_IMAGES, img_size=SIZE, mri_type="FLAIR", split="train"):
    """Load the central slices of one MRI series as a single-channel volume.

    Files are taken from ``{data_directory}/{split}/{scan_id}/{mri_type}/*.dcm``.
    Up to ``num_imgs`` files around the middle of the sorted file list are
    loaded; the depth axis is zero-padded at the end when fewer are available.

    Args:
        scan_id: Name of the patient folder (the smoke test below uses "00000").
        num_imgs: Depth of the returned volume.
        img_size: Height and width of every slice.
        mri_type: Name of the series sub-folder.
        split: Name of the dataset sub-folder ("train" or "test").

    Returns:
        Array of shape ``(1, img_size, img_size, num_imgs)``. A series without
        any ``.dcm`` file yields an all-zero volume instead of raising.
    """
    # NOTE(review): this is a plain string sort, so "Image-10" sorts before
    # "Image-2". Left untouched because the checkpoints loaded later were
    # presumably trained with exactly this ordering - confirm before changing.
    files = sorted(glob.glob(f"{data_directory}/{split}/{scan_id}/{mri_type}/*.dcm"))

    middle = len(files) // 2
    half_window = num_imgs // 2
    first = max(0, middle - half_window)
    last = min(len(files), middle + half_window)
    selected = files[first:last]

    if selected:
        # Forward img_size so a non-default size reaches the slice loader and
        # stays consistent with the zero padding below.
        # `.T` turns the stacked (n, H, W) array into (W, H, n).
        img3d = np.stack([load_dicom_image(f, img_size=img_size) for f in selected]).T
    else:
        # np.stack refuses an empty list; start from an empty depth axis instead.
        img3d = np.zeros((img_size, img_size, 0))

    n_missing = num_imgs - img3d.shape[-1]
    if n_missing > 0:
        n_zero = np.zeros((img_size, img_size, n_missing))
        img3d = np.concatenate((img3d, n_zero), axis=-1)

    # Add the leading channel axis.
    return np.expand_dims(img3d, 0)

# Smoke test: show the shape of one loaded volume.
load_dicom_images_3d("00000").shape

In [None]:
def set_seed(seed):
    """Seed the random number generators this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch. When CUDA is available, every CUDA device is seeded as
    well and cuDNN is switched to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


set_seed(3407)

In [None]:
# Load the per-patient label table; the rest of the notebook reads its
# "BraTS21ID" and "MGMT_value" columns.
train_df = pd.read_csv(f"{data_directory}/train_labels.csv")
display(train_df)

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class ratio equal in both parts and the fixed random_state makes the split
# reproducible.
df_train, df_valid = sk_model_selection.train_test_split(
    train_df, 
    test_size=0.2, 
    random_state=177, 
    stratify=train_df["MGMT_value"],
)


In [None]:
# Show the last rows of the training split as a quick sanity check.
df_train.tail()

In [None]:
class Dataset(torch_data.Dataset):
    """Map-style dataset yielding one 3D MRI volume per entry.

    Args:
        paths: Sequence of patient ids; each is zero-padded to five characters
            to form the folder name.
        targets: Optional sequence of labels aligned with ``paths``. When
            omitted the dataset yields ids instead of labels.
        mri_type: Sequence aligned with ``paths`` naming the series to load for
            each entry.
        label_smoothing: Amount by which each label is pulled away from 0 / 1.
        split: Dataset sub-folder used for unlabelled entries.
    """

    def __init__(self, paths, targets=None, mri_type=None, label_smoothing=0.01, split="train"):
        self.paths, self.targets = paths, targets
        self.mri_type = mri_type
        self.label_smoothing = label_smoothing
        self.split = split

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        scan_id = self.paths[index]
        labelled = self.targets is not None

        # Labelled entries are always read from the "train" folder; only
        # unlabelled entries honour `self.split`.
        volume = load_dicom_images_3d(
            str(scan_id).zfill(5),
            mri_type=self.mri_type[index],
            split="train" if labelled else self.split,
        )

        sample = {"X": torch.tensor(volume).float()}
        if labelled:
            # |target - smoothing| maps 0 -> smoothing and 1 -> 1 - smoothing.
            sample["y"] = torch.tensor(abs(self.targets[index]-self.label_smoothing), dtype=torch.float)
        else:
            sample["id"] = scan_id
        return sample

class Model(nn.Module):
    """EfficientNet3D-b0 network whose final layer emits a single value per sample."""

    def __init__(self):
        super().__init__()
        backbone = EfficientNet3D.from_name(
            "efficientnet-b0", override_params={'num_classes': 2}, in_channels=1
        )
        # Swap the stock classifier for a one-output linear layer of the same
        # input width.
        backbone._fc = nn.Linear(in_features=backbone._fc.in_features, out_features=1, bias=True)
        self.net = backbone

    def forward(self, x):
        return self.net(x)

# class Model(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.net1 = EfficientNet3D.from_name("efficientnet-b0", override_params={'num_classes': 2}, in_channels=1)
#         self.net2 = EfficientNet3D.from_name("efficientnet-b1", override_params={'num_classes': 2}, in_channels=1)
#         n_features = self.net1._fc.in_features
#         self.net1._fc = nn.Linear(in_features=n_features, out_features=1, bias=True)
#         self.net2._fc = nn.Linear(in_features=n_features, out_features=1, bias=True)
#         self.classifier = nn.Linear(2, 1)
    
#     def forward(self, x):
#         out1 = self.net1(x)
#         out2 = self.net2(x)
#         out = torch.cat((out1, out2), dim=1)
#         out = self.classifier(out)
#         return out
    
    

class Trainer:
    """Training loop with checkpointing and early stopping on validation loss.

    A checkpoint is written every time the validation loss reaches a new
    minimum; training stops once the loss has failed to improve for
    ``patience`` consecutive epochs.
    """

    def __init__(
        self, 
        model, 
        device, 
        optimizer, 
        criterion
    ):
        """Store the training components.

        Args:
            model: Module to train.
            device: Device the batches are moved to.
            optimizer: Optimizer stepped once per training batch.
            criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        """
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.criterion = criterion

        # Lowest validation loss seen so far (lower is better).
        self.best_valid_score = np.inf
        # Number of consecutive epochs without improvement.
        self.n_patience = 0
        # Path of the most recently written checkpoint.
        self.lastmodel = None

    def fit(self, epochs, train_loader, valid_loader, save_path, patience):
        """Train for at most ``epochs`` epochs.

        Args:
            epochs: Maximum number of epochs.
            train_loader: Iterable of training batches.
            valid_loader: Iterable of validation batches.
            save_path: Prefix of the checkpoint file names.
            patience: Number of consecutive non-improving epochs after which
                training stops.
        """
        for n_epoch in range(1, epochs + 1):
            self.info_message("EPOCH: {}", n_epoch)

            train_loss, train_time = self.train_epoch(train_loader)
            valid_loss, valid_auc, valid_time = self.valid_epoch(valid_loader)

            # The trailing spaces overwrite what is left of the "\r" progress line.
            self.info_message(
                "[Epoch Train: {}] loss: {:.4f}, time: {:.2f} s            ",
                n_epoch, train_loss, train_time
            )

            self.info_message(
                "[Epoch Valid: {}] loss: {:.4f}, auc: {:.4f}, time: {:.2f} s",
                n_epoch, valid_loss, valid_auc, valid_time
            )

            if valid_loss < self.best_valid_score:
                # Record the new best BEFORE saving so the checkpoint's
                # "best_valid_score" describes the weights stored next to it.
                previous_best = self.best_valid_score
                self.best_valid_score = valid_loss
                self.save_model(n_epoch, save_path, valid_loss, valid_auc)
                # Model selection is driven by the loss, so the log says so.
                self.info_message(
                    "loss improved from {:.4f} to {:.4f}. Saved model to '{}'",
                    previous_best, valid_loss, self.lastmodel
                )
                self.n_patience = 0
            else:
                self.n_patience += 1

            if self.n_patience >= patience:
                self.info_message("\nValid loss didn't improve last {} epochs.", patience)
                break

    def train_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            Tuple ``(mean batch loss, elapsed whole seconds)``.
        """
        self.model.train()
        t = time.time()
        sum_loss = 0

        for step, batch in enumerate(train_loader, 1):
            X = batch["X"].to(self.device)
            targets = batch["y"].to(self.device)
            self.optimizer.zero_grad()
            # Drop dimension 1 of the model output so it matches the targets.
            outputs = self.model(X).squeeze(1)

            loss = self.criterion(outputs, targets)
            loss.backward()

            sum_loss += loss.detach().item()

            self.optimizer.step()

            message = 'Train Step {}/{}, train_loss: {:.4f}'
            self.info_message(message, step, len(train_loader), sum_loss/step, end="\r")

        return sum_loss/len(train_loader), int(time.time() - t)

    def valid_epoch(self, valid_loader):
        """Evaluate the model on ``valid_loader`` without updating weights.

        Returns:
            Tuple ``(mean batch loss, ROC AUC, elapsed whole seconds)``.
        """
        self.model.eval()
        t = time.time()
        sum_loss = 0
        y_all = []
        outputs_all = []

        for step, batch in enumerate(valid_loader, 1):
            with torch.no_grad():
                X = batch["X"].to(self.device)
                targets = batch["y"].to(self.device)

                outputs = self.model(X).squeeze(1)
                loss = self.criterion(outputs, targets)

                sum_loss += loss.detach().item()
                y_all.extend(batch["y"].tolist())
                outputs_all.extend(outputs.tolist())

            message = 'Valid Step {}/{}, valid_loss: {:.4f}'
            self.info_message(message, step, len(valid_loader), sum_loss/step, end="\r")

        # Targets may be non-binary floats (e.g. smoothed labels); threshold
        # them back to {0, 1} before computing the AUC.
        y_all = [1 if x > 0.5 else 0 for x in y_all]
        auc = roc_auc_score(y_all, outputs_all)

        return sum_loss/len(valid_loader), auc, int(time.time() - t)

    def save_model(self, n_epoch, save_path, loss, auc):
        """Write model and optimizer state to a file named after epoch, loss and AUC."""
        self.lastmodel = f"{save_path}-e{n_epoch}-loss{loss:.3f}-auc{auc:.3f}.pth"
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "best_valid_score": self.best_valid_score,
                "n_epoch": n_epoch,
            },
            self.lastmodel,
        )

    @staticmethod
    def info_message(message, *args, end="\n"):
        """Format ``message`` with ``args`` and print it."""
        print(message.format(*args), end=end)
        
def predict(modelfile, df, mri_type, split):
    """Score every row of ``df`` with one saved checkpoint.

    Args:
        modelfile: Path of a checkpoint containing a "model_state_dict" entry.
        df: Frame whose index holds the patient ids to score.
        mri_type: Series name loaded for every patient.
        split: Dataset sub-folder the volumes are read from.

    Returns:
        DataFrame indexed by "BraTS21ID" with one "MGMT_value" column holding
        the sigmoid of the model output.
    """
    # Build the per-row series name locally rather than writing an "MRI_Type"
    # column into the caller's frame.
    mri_type_per_row = np.full(len(df), mri_type, dtype=object)
    data_retriever = Dataset(
        df.index.values, 
        mri_type=mri_type_per_row,
        split=split
    )

    data_loader = torch_data.DataLoader(
        data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
    )

    model = Model()
    # map_location lets a checkpoint saved on a GPU be restored on whichever
    # device this session selected, including the CPU fallback.
    checkpoint = torch.load(modelfile, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)
    model.eval()

    y_pred = []
    ids = []

    with torch.no_grad():
        for e, batch in enumerate(data_loader,1):
            print(f"{e}/{len(data_loader)}", end="\r")
            scores = torch.sigmoid(model(batch["X"].to(device)))
            # Flatten to 1-D so a final batch of a single sample goes through
            # the same path as a full batch.
            y_pred.extend(scores.reshape(-1).cpu().tolist())
            ids.extend(batch["id"].numpy().tolist())

    preddf = pd.DataFrame({"BraTS21ID": ids, "MGMT_value": y_pred}) 
    preddf = preddf.set_index("BraTS21ID")
    return preddf

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_mri_type(df_train, df_valid, mri_type,checkpoint_path):
    """Build the data loaders for one modality and restore a saved model.

    Training itself is disabled (see the commented block near the end), so the
    function currently only loads ``checkpoint_path`` into a fresh ``Model``.

    Args:
        df_train: Training rows with "BraTS21ID" and "MGMT_value" columns.
        df_valid: Validation rows with the same columns.
        mri_type: Series name to use, or "all" to repeat every row once per
            entry of the module-level ``mri_types`` list.
        checkpoint_path: Checkpoint file containing a "model_state_dict" entry.

    Returns:
        The restored model, moved to the module-level ``device``.
    """
    # `assign` returns new frames, so the caller's frames are left untouched.
    if mri_type=="all":
        # A separate loop name keeps the `mri_type` argument intact.
        df_train = pd.concat([df_train.assign(MRI_Type=one_type) for one_type in mri_types])
        df_valid = pd.concat([df_valid.assign(MRI_Type=one_type) for one_type in mri_types])
    else:
        df_train = df_train.assign(MRI_Type=mri_type)
        df_valid = df_valid.assign(MRI_Type=mri_type)

    print(df_train.shape, df_valid.shape)
    display(df_train.head())

    train_data_retriever = Dataset(
        df_train["BraTS21ID"].values, 
        df_train["MGMT_value"].values, 
        df_train["MRI_Type"].values
    )

    valid_data_retriever = Dataset(
        df_valid["BraTS21ID"].values, 
        df_valid["MGMT_value"].values,
        df_valid["MRI_Type"].values
    )

    # The loaders are only consumed by the disabled training call below.
    train_loader = torch_data.DataLoader(
        train_data_retriever,
        batch_size=4,
        shuffle=True,
        num_workers=8,
    )

    valid_loader = torch_data.DataLoader(
        valid_data_retriever, 
        batch_size=4,
        shuffle=False,
        num_workers=8,
    )

    model = Model()

    # map_location lets a checkpoint saved on a GPU be restored on whichever
    # device this session selected, including the CPU fallback.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)

#     Training is disabled; re-enable this block to fine-tune the checkpoint.
#     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#     #optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
#     criterion = torch_functional.binary_cross_entropy_with_logits
#     trainer = Trainer(
#         model, 
#         device, 
#         optimizer, 
#         criterion
#     )
#     history = trainer.fit(
#         0, 
#         train_loader, 
#         valid_loader, 
#         f"{mri_type}", 
#         0,
#     )

    return model

# No checkpoints selected at this point; the call that would fill the list from
# train_mri_type is commented out below.
modelfiles = None

# if not modelfiles:
# modelfiles = [train_mri_type(df_train, df_valid, 'FLAIR','../input/efficientnet3d-with-one-mri-type/FLAIR-e2-loss0.696-auc0.605.pth'),
#               train_mri_type(df_train, df_valid, 'T1w','../input/efficientnet3d-with-one-mri-type/T1w-e2-loss0.718-auc0.579.pth'),
#               train_mri_type(df_train, df_valid, 'T1wCE','../input/efficientnet3d-with-one-mri-type/T1wCE-e6-loss0.683-auc0.633.pth'),
#               train_mri_type(df_train, df_valid, 'T2w','../input/efficientnet3d-with-one-mri-type/T2w-e8-loss0.658-auc0.677.pth'),
#              ]
#print(modelfiles)

In [None]:
# modelfiles = [
#     '../input/efficientnet3d684/T1w-e7-loss0.685-auc0.555.pth',
#     '../input/efficientnet3d684/T1wCE-e6-loss0.683-auc0.633.pth',
#     '../input/efficientnet3d684/T2w-e8-loss0.658-auc0.677.pth',
# ]
# Modalities used by the ensemble and the checkpoint that belongs to each one.
# Both lists are in the same order so they can be paired by position.
mri_types = ['T1w', 'T1wCE', 'T2w']

_CHECKPOINT_STEMS = {
    'T1w': 'T1w-e7-loss0.685-auc0.555',
    'T1wCE': 'T1wCE-e6-loss0.683-auc0.633',
    'T2w': 'T2w-e8-loss0.658-auc0.677',
}
modelfiles = [
    f'../input/efficientnet3d684/{_CHECKPOINT_STEMS[one_type]}.pth'
    for one_type in mri_types
]

In [None]:
# Index the validation rows by patient id so each model's predictions align by
# id. The guard keeps the cell re-runnable: a second run would otherwise fail
# because "BraTS21ID" is no longer a column.
if "BraTS21ID" in df_valid.columns:
    df_valid = df_valid.set_index("BraTS21ID")

# Average the per-modality predictions; start from a float accumulator.
df_valid["MGMT_pred"] = 0.0
for m, mtype in zip(modelfiles,  mri_types):
    pred = predict(m, df_valid, mtype, "train")
    df_valid["MGMT_pred"] += pred["MGMT_value"]
df_valid["MGMT_pred"] /= len(modelfiles)

auc = roc_auc_score(df_valid["MGMT_value"], df_valid["MGMT_pred"])
print(f"Validation ensemble AUC: {auc:.4f}")
sns.displot(df_valid["MGMT_pred"])

In [None]:
import os
import json
import glob
import random
import collections

import numpy as np
import pandas as pd
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers



# Series names accepted by get_all_image_paths.
TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]
# NOTE(review): not referenced anywhere in the visible part of the notebook.
WHITE_THRESHOLD = 10 # out of 255
# Patient ids removed from the training labels below.
EXCLUDE = [109, 123, 709]


# NOTE: rebinds `train_df`, replacing the frame loaded by the earlier PyTorch cells.
train_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
test_df = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv')
train_df = train_df[~train_df.BraTS21ID.isin(EXCLUDE)]
def load_dicom(path, size = 224):
    '''
    Read a DICOM slice and return it as a ``size`` x ``size`` ``uint8`` image.

    Pixel values are divided by the slice maximum (skipped when the maximum is
    zero), multiplied by 255, cast to ``uint8`` and finally resized.

    Not super sure if this kind of scaling is appropriate, but everyone seems to do it.
    '''
    # `pydicom.read_file` is a deprecated alias of `dcmread` and may be missing
    # from newer pydicom releases.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    peak = np.max(data)
    if peak != 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return cv2.resize(data, (size, size))

def get_all_image_paths(brats21id, image_type, folder='train'): 
    '''
    Return the paths of the slices used for one patient and one image type.

    The files of the series are ordered by the integer that follows the last
    "-" in the file name. Only the middle half of the series (from 25% to 75%
    of the slice count) is kept, taking every third file, or every file when
    the series has fewer than 10 slices.

    Returns a NumPy array of path strings.
    '''
    assert(image_type in TYPES)

    series_dir = os.path.join(
        "../input/rsna-miccai-brain-tumor-radiogenomic-classification/%s/" % folder,
        str(brats21id).zfill(5),
        image_type,
    )

    def slice_number(file_path):
        # Drop the 4-character extension, then read the number after the last "-".
        return int(file_path[:-4].split("-")[-1])

    paths = glob.glob(os.path.join(series_dir, "*"))
    paths.sort(key=slice_number)

    num_images = len(paths)
    interval = 1 if num_images < 10 else 3
    start, end = int(num_images * 0.25), int(num_images * 0.75)

    return np.array(paths[start:end:interval])

def get_all_images(brats21id, image_type, folder='train', size=225):
    """Load every selected slice of one series as a list of resized images.

    NOTE(review): the default ``size`` (225) differs from ``load_dicom``'s
    default (224); the callers in this notebook always pass a size explicitly.
    """
    images = []
    for path in get_all_image_paths(brats21id, image_type, folder):
        images.append(load_dicom(path, size))
    return images

# Edge length of the slices passed to the 2D model by the helpers below.
IMAGE_SIZE = 128

def get_all_data_for_train(image_type):
    """Collect every selected training slice of one image type.

    Iterates over the module-level ``train_df`` and returns three aligned
    arrays: the slice images, the patient label repeated per slice, and the
    patient id repeated per slice.
    """
    X, y, train_ids = [], [], []

    for i in tqdm(train_df.index):
        row = train_df.loc[i]
        patient_id = int(row['BraTS21ID'])
        images = get_all_images(patient_id, image_type, 'train', IMAGE_SIZE)

        X.extend(images)
        y.extend([row['MGMT_value']] * len(images))
        train_ids.extend([patient_id] * len(images))
        assert(len(X) == len(y))

    return np.array(X), np.array(y), np.array(train_ids)

def get_all_data_for_test(image_type):
    """Collect every selected test slice of one image type.

    Iterates over the module-level ``test_df`` and returns two aligned arrays:
    the slice images and the patient id repeated per slice.
    """
    X, test_ids = [], []

    for i in tqdm(test_df.index):
        patient_id = int(test_df.loc[i]['BraTS21ID'])
        images = get_all_images(patient_id, image_type, 'test', IMAGE_SIZE)
        X.extend(images)
        test_ids.extend([patient_id] * len(images))

    return np.array(X), np.array(test_ids)

# Slice-level inference with the saved 2D CNN on the T1wCE series.
X_test, testidt = get_all_data_for_test('T1wCE')
file_path = '../input/rsna-miccai-2dcnn-inference/best_model.h5'
model_best = tf.keras.models.load_model(filepath=file_path)
sample = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv')

y_pred = model_best.predict(X_test)

# Hard class per slice (index of the largest model output).
pred = np.argmax(y_pred, axis=1)

result = pd.DataFrame({'BraTS21ID': testidt, 'MGMT_value': pred})

# Patient score = mean of its slice classes. Align the scores to the submission
# ids BY ID rather than overwriting the id column positionally: the positional
# version silently mislabels rows when a patient contributed no slices or when
# the sample file is not sorted by id. Patients without any slice receive the
# neutral value 0.5.
result2 = (
    result.groupby('BraTS21ID')['MGMT_value'].mean()
    .reindex(pd.Index(sample['BraTS21ID'], name='BraTS21ID'))
    .fillna(0.5)
    .reset_index()
)

In [None]:
# Per-patient scores of the 2D CNN, kept under a separate name.
model2_pred = result2['MGMT_value']

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.linear_model import Perceptron
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import pydicom
import random
import matplotlib.pyplot as plt
import glob

# directory setting
INPUT = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'

train_lab = pd.read_csv(INPUT + '/' + 'train_labels.csv')
sample_sub = pd.read_csv(INPUT + '/' + 'sample_submission.csv')

print('Train labels')
# NOTE: a bare expression in the middle of a cell displays nothing.
train_lab

# Zero-pad the ids to five characters: add 100000 and keep the last five digits.
temp = train_lab['BraTS21ID'] + 100000
item_id = []
# NOTE: the loop variable `i` stays defined after the loop; later statements in
# this cell read it.
for i in range(len(train_lab)):
    item_id = item_id + [str(temp[i])[-5:]]
print('Number of samples in training data')
len(item_id)   # 585

from tqdm import tqdm

def AddDif1(df):
    """Return a copy of ``df`` extended with successive column differences.

    Adds ``d1`` .. ``d5`` where ``d1 = c11 - c12``, ``d2 = c12 - c13``, and so
    on up to ``d5 = c15 - c16``.

    Args:
        df: Frame containing the columns ``c11`` .. ``c16``.

    Returns:
        A new DataFrame; ``df`` itself is not modified. (The previous version
        bound ``df2 = df``, which is an alias rather than a copy, so it wrote
        the new columns into the caller's frame.)
    """
    return df.assign(
        d1=df['c11'] - df['c12'],
        d2=df['c12'] - df['c13'],
        d3=df['c13'] - df['c14'],
        d4=df['c14'] - df['c15'],
        d5=df['c15'] - df['c16'],
    )
    


# Load the precomputed training feature table. NOTE: this rebinds `train_lab`,
# which until now held the raw training labels.
train_lab = pd.read_csv('../input/train-data/Table_trainA.csv')
# Keep only the rows whose c2 value is non-zero.
train_lab2 = train_lab[train_lab['c2'] != 0]
# reset_index() without drop=True keeps the old index as an extra "index" column.
train_data = train_lab2.reset_index()
train_data = AddDif1(train_data)
train_data

# Rebuild `item_id`, this time with the zero-padded ids of the test patients
# (add 100000, keep the last five digits).
temp = sample_sub['BraTS21ID'] + 100000
item_id = []
# NOTE: `i` keeps its last value after this loop and is read by the header
# print that precedes the first feature loop.
for i in range(len(sample_sub)):
    item_id = item_id + [str(temp[i])[-5:]]
print('Number of samples in test data')
len(item_id)   # 87

# From here on `train_lab` is the SAME object as `sample_sub`, so the feature
# columns written to `train_lab` below describe the test patients.
train_lab = sample_sub

# ---- Feature pass 1 over the test FLAIR series ------------------------------
# For every id in `item_id`, accumulate whole-series intensity statistics and
# store them in columns c1..c10 of `train_lab`.
# NOTE: `i` in this header print is whatever value an earlier loop left behind.
print(i, 'number of images', 'intensity', 'volume', 'average', 'Gmin', 'Gmax', 'Gmax-average', 'CmaxName')
for i in tqdm(range(len(item_id[:]))):
    item_fol = os.listdir(INPUT + '/test/' + item_id[i] + '/FLAIR')
    item_fol2 = []
    for j in item_fol:
        # Strip the first 6 and the last 4 characters of the file name
        # ('Image-' and '.dcm', judging by the path rebuilt below) to obtain
        # the slice number. The +1000 offset is removed again further down.
        k = 1000 + int(j[6:len(j)-4])
        item_fol2 = item_fol2 + [k]
    # Sorting the integers gives numeric slice order.
    item_fols = sorted(item_fol2)
    volume = 0       # running count of non-zero pixels
    intensity = 0    # running sum of pixel values
    vac = 0          # running count of zero pixels
    Gmax = 0         # largest pixel value over the series
    Gmin = 0         # starts at 0, so it only changes if a slice minimum is negative
    Amax = 0         # largest non-zero pixel count of a single slice
    Imax = 0         # largest pixel sum of a single slice
    area_prev = 0
    sumN_prev = 0
    changeMax = 0
    maxName ='none'
    AmaxName ='none'
    ImaxName ='none'
    CmaxName ='00000'
    for j in item_fols:
        l = str(j-1000)
        path = INPUT + '/test/' + item_id[i] + '/FLAIR/Image-' + l + '.dcm'
        dicom = pydicom.read_file(path)
        data = dicom.pixel_array
        sumN = np.sum(data)
        # Change of the slice sum relative to the previous slice.
        sumN_plus = sumN - sumN_prev
        sumN_prev = sumN
        if sumN > Imax:
            Imax = sumN
            ImaxName = j
        maxN = np.max(data)
        if maxN > Gmax:
            Gmax = maxN
            maxName = j
        minN = np.min(data)
        if minN < Gmin:
            Gmin = minN
        zerocount = np.count_nonzero(data == 0)
        area = np.count_nonzero(data != 0)
        # Change of the non-zero pixel count relative to the previous slice.
        area_plus = area - area_prev
        area_prev = area
        if area > Amax:
            Amax = area
            AmaxName = j
        # NOTE(review): area_plus is 0 whenever two consecutive slices have the
        # same non-zero pixel count, which makes this a division by zero -
        # confirm how that case should be handled.
        change = -(sumN_plus/area_plus)
        if change > changeMax:
            changeMax = change
            CmaxName = j
        intensity = intensity + sumN
        volume = volume + area
        vac = vac + zerocount
    # Mean value over the non-zero pixels of the whole series.
    # NOTE(review): divides by zero for a series without any non-zero pixel.
    average = intensity/volume
    train_lab.loc[i,'c1'] = len(item_fol)
    train_lab.loc[i,'c2'] = int(intensity)
    train_lab.loc[i,'c3'] = volume
    train_lab.loc[i,'c4'] = vac
    train_lab.loc[i,'c5'] = volume+vac
    train_lab.loc[i,'c6'] = average
    train_lab.loc[i,'c7'] = Gmin
    train_lab.loc[i,'c8'] = Gmax
    train_lab.loc[i,'c9'] = Gmax-average
    # File name of the slice with the largest `change`; the -1000 undoes the offset.
    train_lab.loc[i,'c10'] = 'Image-' + str(int(CmaxName) -1000) + '.dcm'

train_lab.head(10)

# ---- Feature pass 2 over the test FLAIR series ------------------------------
# Re-reads every series and counts the pixels above per-patient thresholds
# derived from the pass-1 columns c6 (average) and c9 (Gmax - average). The
# counts are scaled by 1e7 / c3 and stored in columns c11..c21.
# NOTE: `i` in this header print is the value left behind by the previous loop.
print(i, 'number of images', 'intensity', 'volume', 'average', 'Gmin', 'Gmax', 'Gmax-average', 'CmaxName')
for i in tqdm(range(len(item_id[:]))):
    item_fol = os.listdir(INPUT + '/test/' + item_id[i] + '/FLAIR')
    item_fol2 = []
    for j in item_fol:
        # Same file-name parsing as in pass 1: slice number plus a 1000 offset.
        k = 1000 + int(j[6:len(j)-4])
        item_fol2 = item_fol2 + [k]
    item_fols = sorted(item_fol2)
    # Running pixel counts for each threshold.
    P50 = 0
    P60 = 0
    P70 = 0
    P80 = 0
    P90 = 0
    P95 = 0
    F2 = 0
    F3 = 0
    F4 = 0
    F5 = 0
    F6 = 0
    # valXX = average + XX% of (Gmax - average).
    val50 = train_lab['c9'][i] * 0.5 + train_lab['c6'][i]
    val60 = train_lab['c9'][i] * 0.6 + train_lab['c6'][i]
    val70 = train_lab['c9'][i] * 0.7 + train_lab['c6'][i]
    val80 = train_lab['c9'][i] * 0.8 + train_lab['c6'][i]
    val90 = train_lab['c9'][i] * 0.9 + train_lab['c6'][i]
    val95 = train_lab['c9'][i] * 0.95 + train_lab['c6'][i]
    # FNval = N times the average.
    F2val = train_lab['c6'][i] * 2
    F3val = train_lab['c6'][i] * 3
    F4val = train_lab['c6'][i] * 4
    F5val = train_lab['c6'][i] * 5
    F6val = train_lab['c6'][i] * 6
    for j in item_fols:
        l = str(j-1000)
        path = INPUT + '/test/' + item_id[i] + '/FLAIR/Image-' + l + '.dcm'
        dicom = pydicom.read_file(path)
        data = dicom.pixel_array
        count50 = np.count_nonzero(data > val50)
        count60 = np.count_nonzero(data > val60)
        count70 = np.count_nonzero(data > val70)
        count80 = np.count_nonzero(data > val80)
        count90 = np.count_nonzero(data > val90)
        count95 = np.count_nonzero(data > val95)
        countF2 = np.count_nonzero(data > F2val)
        countF3 = np.count_nonzero(data > F3val)
        countF4 = np.count_nonzero(data > F4val)
        countF5 = np.count_nonzero(data > F5val)
        countF6 = np.count_nonzero(data > F6val)
        P50 = P50 + count50
        P60 = P60 + count60
        P70 = P70 + count70
        P80 = P80 + count80
        P90 = P90 + count90
        P95 = P95 + count95
        F2 = F2 + countF2
        F3 = F3 + countF3
        F4 = F4 + countF4
        F5 = F5 + countF5
        F6 = F6 + countF6
    # c3 is the non-zero pixel count of the series written by pass 1.
    c3val = train_lab['c3'][i]
    train_lab.loc[i,'c11'] = P50 * 1e7 / c3val
    train_lab.loc[i,'c12'] = P60 * 1e7 / c3val
    train_lab.loc[i,'c13'] = P70 * 1e7 / c3val
    train_lab.loc[i,'c14'] = P80 * 1e7 / c3val
    train_lab.loc[i,'c15'] = P90 * 1e7 / c3val
    train_lab.loc[i,'c16'] = P95 * 1e7 / c3val
    train_lab.loc[i,'c17'] = F2 *  1e7 / c3val
    train_lab.loc[i,'c18'] = F3 *  1e7 / c3val
    train_lab.loc[i,'c19'] = F4 * 1e7 / c3val
    train_lab.loc[i,'c20'] = F5 * 1e7 / c3val
    train_lab.loc[i,'c21'] = F6 * 1e7 / c3val

train_lab.head(10)

# `train_lab` was bound to the `sample_sub` frame before the feature loops, so
# this assignment re-binds the same object; it does not copy anything.
sample_sub = train_lab     # <=== Caution!

# Persist the test feature table; the CSV is written before the d1..d5
# difference columns are added.
sample_sub.to_csv('Table_testA.csv')
test_data = sample_sub
test_data = AddDif1(test_data)
# Fresh copy of the submission template that receives the final predictions.
output = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ---- Random forest on the tabular FLAIR features ----------------------------
y = train_data["MGMT_value"]

# Intensity statistics from pass 1 plus the d1..d5 differences; the raw
# c11..c21 columns are deliberately left out (kept below as commented options).
features = ["c2", "c3", "c4", "c5", "c6", "c8", "c9", 
#              "c11", "c12", "c13", "c14", "c15", "c16",
            "d1", "d2", "d3", "d4", "d5"]
#              "c17", "c18", "c19", "c20", "c21"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=500, max_depth=7, random_state=42)
model.fit(X, y)

# Scores on the rows the forest was fitted on; a later part of this cell uses
# them for the ROC plot, so that curve is a training-set curve.
y_score1 = model.predict_proba(X)[:,1]
# Positive-class probability for the test patients.
predictions = model.predict_proba(X_test)[:,1]

from sklearn.model_selection import KFold
import lightgbm as lgb
import gc

# ---- LightGBM with 5-fold cross-validation on the same features -------------
# NOTE(review): 'objective' is commented out, so LightGBM falls back to its
# default objective - confirm that this is intended for a binary target.
params = {
          'num_leaves': 5,
#           'min_child_weight': 0.7,
#           'feature_fraction': 0.01,
#            'bagging_fraction': 0.8,
#           'min_data_in_leaf': 5,
#           'objective': 'binary',
          'max_depth': 6,
          'learning_rate': 0.15,
          "boosting_type": "gbdt",
          "bagging_seed": 3407,
          "metric": 'auc',
          "verbosity": -1,
#           'reg_alpha': 0,
#           'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }


NFOLDS = 5
# KFold without shuffling: the folds are contiguous row blocks.
folds = KFold(n_splits=NFOLDS)

y = train_data["MGMT_value"]
features = ["c2", "c3", "c4", "c5", "c6", "c8", "c9", 
#              "c11", "c12", "c13", "c14", "c15", "c16",
            "d1", "d2", "d3", "d4", "d5"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

columns = X.columns
splits = folds.split(X, y)
# Test predictions averaged over the folds.
y_preds = np.zeros(X_test.shape[0])
# Out-of-fold predictions for the training rows.
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead - confirm against the installed version.
    clf = lgb.train(params, dtrain, 1000, valid_sets = [dtrain, dvalid], verbose_eval=20, early_stopping_rounds=100)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    # Each fold contributes 1/NFOLDS of the mean AUC and of the test prediction.
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

import seaborn as sns

# Average the per-fold importances, save the table and plot the top 50 features.
fold_columns = [f'fold_{n}' for n in range(1, folds.n_splits + 1)]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)
feature_importances.to_csv('feature_importances.csv')

top_features = feature_importances.sort_values(by='average', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average'.format(folds.n_splits));

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# ---- ROC curve and threshold metrics for the random-forest scores -----------
# `y_score1` holds the forest's scores on its own training rows, so everything
# computed here describes the fit on the training data.
y_true1 = y

roc1 = roc_curve(y_true1, y_score1)
fpr1, tpr1, thresholds1 = roc1

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Round the scores to hard 0 / 1 predictions.
y_pred = np.round(y_score1, decimals = 0)
tn1, fp1, fn1, tp1 = confusion_matrix(y_true1, y_pred).ravel()
ac1 = accuracy_score(y_true1, y_pred)
pr1 = precision_score(y_true1, y_pred)
rc1 = recall_score(y_true1, y_pred)
f11 = f1_score(y_true1, y_pred)
# Specificity: true negatives over all actual negatives.
sp1 = tn1/(fp1+tn1)
# Phi coefficient computed from the four confusion-matrix cells.
phi1 = (tp1*tn1-fp1*fn1)/np.sqrt((tp1+fn1)*(tp1+fp1)*(tn1+fn1)*(tn1+fp1))

plt.figure(figsize=(6,6))
# Dashed diagonal: the reference line of a random classifier.
plt.plot((0,1), (0,1), color="black", linestyle="--")
plt.plot(fpr1, tpr1, linewidth=3)
plt.tick_params(direction='in')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

ROC1=roc_auc_score(y_true1, y_score1)
print(ROC1)

# NOTE(review): `predictions` and `output` are not assigned in this section (the
# two candidate assignments of `predictions` below are commented out). Both must
# already exist from an earlier cell, otherwise the blend raises NameError - confirm.
# predictions = model.predict_proba(X_test)[:,1]
# predictions = gd.best_estimator_.predict_proba(X_test)[:,1]
# Equal-weight blend of the fold-averaged LightGBM test predictions (y_preds)
# with `predictions`.
output['MGMT_value'] = y_preds * 0.5 +predictions*0.5
# Bare expression: displayed as the cell's output.
output

In [None]:
from fastai.vision.all import *
import numpy as np
import sys
# Raise numpy's summarisation threshold to sys.maxsize so that printed arrays
# are shown in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)

import pandas as pd
import os
import random
# Training labels: the file's own header row is replaced by the column names
# 'id' and 'value', and every column is read as object so ids keep their
# leading zeros.
df = pd.read_csv(
    '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv',
    header=0,
    names=['id','value'],
    dtype=object,
)
# Drop three case ids from the label table.
# NOTE(review): presumably the cases the competition hosts flagged as unusable - confirm.
df = df.loc[~df['id'].isin(["00109", "00123", "00709"])]

#https://stackoverflow.com/a/4836734/8245487
def natural_sort(l):
    """Return a new list with the strings of ``l`` in natural (human) order.

    Runs of ASCII digits are compared by numeric value and everything else is
    compared case-insensitively, so ``"Image-2.dcm"`` sorts before
    ``"Image-10.dcm"``.

    Args:
        l: iterable of strings.

    Returns:
        A new sorted list; ``l`` itself is not modified.
    """
    # Imported locally so the helper does not depend on `re` happening to be
    # re-exported by a star import elsewhere in the notebook.
    import re

    def alphanum_key(key):
        # With a single capturing group, re.split alternates
        # text, digits, text, ... so odd positions are always the digit runs.
        # Typing chunks by position (instead of str.isdigit) keeps non-ASCII
        # digit-like text such as "²" as text rather than failing in int().
        chunks = re.split(r'([0-9]+)', key)
        return [int(chunk) if pos % 2 else chunk.lower()
                for pos, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)

import os
import pydicom
import pandas as pd
from pydicom.pixel_data_handlers.util import apply_voi_lut
from tqdm import tqdm
import binascii
from PIL import Image

# Root folder of the competition data set.
INPUT = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'

# Output folders for the exported PNG slices. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('./train', exist_ok=True)
os.makedirs('./test', exist_ok=True)


def get_dicom_files(input_dir, dataset='train'):
    """Export the middle FLAIR slice of every case under ``input_dir/dataset`` as a PNG.

    Walks ``{input_dir}/{dataset}``; for each directory named ``FLAIR`` the
    ``.dcm`` files are put in natural order, the middle one is converted with
    ``process_dicom`` and written to ``./{dataset}/{case_id}.png``, where
    ``case_id`` is the name of the FLAIR directory's parent folder.

    Args:
        input_dir: root folder of the data set.
        dataset: sub-folder to walk; also the name of the output folder.
    """
    for subdir, dirs, files in os.walk(f"{input_dir}/{dataset}"):
        # Match the series folder itself rather than searching for "FLAIR"
        # anywhere in the full path (which would also match the input root).
        if os.path.basename(subdir) != "FLAIR":
            continue

        # Filter to DICOM files *before* choosing the middle one, so a stray
        # non-DICOM file cannot shift the pick or make the series be skipped.
        dicom_files = natural_sort([name for name in files if name.endswith(".dcm")])
        if not dicom_files:
            continue

        filename = dicom_files[len(dicom_files)//2] #take middle most image -- FLAIR DCM file per training item.
        filepath = os.path.join(subdir, filename)

        # Parent folder of the FLAIR directory is the case id; os.path keeps
        # this independent of the path separator.
        cur_id = os.path.basename(os.path.dirname(subdir))
        outpath = os.path.join(f'./{dataset}',f'{cur_id}.png')

        process_dicom(filepath, outpath)

def process_dicom(path, outpath):
    """Convert one DICOM file to an 8-bit greyscale image file.

    The pixel array is passed through pydicom's ``apply_voi_lut``, inverted
    when the photometric interpretation is ``MONOCHROME1``, min-max scaled to
    0..255 and saved to ``outpath`` (format chosen by PIL from the extension).

    Args:
        path: path of the DICOM file to read.
        outpath: path of the image file to write.
    """
    dicom = pydicom.read_file(path)
    data = apply_voi_lut(dicom.pixel_array, dicom)
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    # A constant image has peak 0 after the shift; skip the division so it is
    # written as all-black instead of dividing by zero and casting NaN to uint8.
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    # A 2-D uint8 array maps directly to a mode 'L' image; no per-pixel
    # Python copy is needed.
    Image.fromarray(data).save(outpath)

# Export one PNG per case (written to ./train/<id>.png and ./test/<id>.png by
# get_dicom_files) for both splits of the data set.
get_dicom_files(INPUT, 'train')
get_dicom_files(INPUT, 'test')


#     final = pd.DataFrame(final)
#     final.to_csv(f"{args['output']}/dicom_meta_{args['dataset']}.csv", index=False)

# Attach the exported PNG path to every label row in one vectorised pass.
# (The previous per-id loop re-scanned the whole frame with a boolean mask
# for each id, i.e. quadratic work for the same result.)
df['file'] = df['id'].map('./train/{}.png'.format)

# Columns are addressed by position: 1 = 'value' (label), 2 = 'file' (image path).
dls = ImageDataLoaders.from_df(df, item_tfms=Resize(224), bs=64, label_col =1, fn_col=2, path='')

import torch 
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    Input is a batch of 3-channel images of shape (N, 3, H, W); output is a
    (N, 10) tensor of raw, unnormalised scores.

    Args:
        pretrained: accepted but unused; there are no pretrained weights.
            NOTE(review): presumably present because the learner factory
            instantiates the architecture with a ``pretrained`` argument - confirm.
    """
    def __init__(self, pretrained=False):
        super().__init__()
        # Layer names, sizes and registration order are unchanged, so existing
        # state dicts and any code that slices the child modules still work.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # fc1 expects 16*5*5 features, which the conv stack only produces for
        # 32x32 inputs. Pooling to a fixed 5x5 grid is the identity in that
        # case and lets larger images (e.g. 224x224) pass through as well.
        # The functional form is used so that no extra child module is added.
        x = F.adaptive_avg_pool2d(x, (5, 5))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No ReLU on the output layer: clamping the scores at zero would make
        # negative scores impossible and zero their gradients.
        return self.fc3(x)
    

# Build a mixed-precision fastai learner from the dataloaders with Net as the
# architecture; checkpoints go to /tmp/model/.
learn = cnn_learner(dls, Net, metrics=[error_rate, accuracy], model_dir="/tmp/model/").to_fp16()
# The learning-rate finder's result is not captured; lr_max below is a fixed value.
learn.lr_find()
# NOTE(review): fastai documents ShortEpochCallback as fitting only a small
# fraction of each epoch before stopping - confirm this is intended for the
# final run and not a leftover from debugging.
learn.fit_one_cycle(10, lr_max=1e-2, cbs=ShortEpochCallback())

# Predict every test case from its exported PNG and collect the results as
# records; the frame is built once at the end. (Assigning row by row through a
# boolean mask re-scanned the frame for each id and left 'value' as an
# object-dtype column.)
test_records = []
for id_num in os.listdir("../input/rsna-miccai-brain-tumor-radiogenomic-classification/test/"):
    full_path = './test/{}.png'.format(id_num)
    prediction = learn.predict(full_path)
    print(prediction)
    # Third element of the prediction, entry 1.
    # NOTE(review): presumably the probability of the class at index 1 - confirm
    # against the dataloaders' vocab.
    probability = prediction[2][1].item()
    print(probability)
    test_records.append({'BraTS21ID': id_num, 'MGMT_value': probability})

# Same columns and row order as before: one row per entry of the test folder.
df_test = pd.DataFrame(test_records, columns=['BraTS21ID', 'MGMT_value'])

In [None]:
# Final submission: average the per-modality model predictions, then blend
# the average with model2_pred.
submission = pd.read_csv(f"{data_directory}/sample_submission.csv", index_col="BraTS21ID")
# NOTE(review): mri_types is rebound here to three modalities and paired
# positionally with modelfiles, which is defined outside this section -
# confirm that the i-th model file was trained on the i-th type listed here.
mri_types = ['T1w', 
             'T1wCE',
             'T2w',]

submission["MGMT_value"] = 0
n_models_used = 0
for m, mtype in zip(modelfiles, mri_types):
    
    pred = predict(m, submission, mtype, split="test")
    submission["MGMT_value"] += pred["MGMT_value"]
    n_models_used += 1
    # Log the model that produced these predictions (not the whole list).
    print(m, pred["MGMT_value"])

# Divide by the number of predictions actually summed: zip() stops at the
# shorter of modelfiles / mri_types, so len(modelfiles) can overcount.
submission["MGMT_value"] /= n_models_used
# submission["MGMT_value"] = (submission["MGMT_value"].values*0.8 + model2_pred.values*0.1 +df_test["MGMT_value"].values*0.1)*0.98 +output['MGMT_value'].values*0.02
# 60/40 blend of the averaged model predictions with model2_pred; .values
# combines the two by position, not by index.
submission["MGMT_value"] = submission["MGMT_value"].values*0.6 + model2_pred.values*0.4
submission["MGMT_value"].to_csv("submission.csv")

In [None]:
# Plot the distribution of the MGMT_value column that was written to submission.csv.
sns.displot(submission["MGMT_value"])