In [None]:
#!pip install fastbook torch==1.8.1 pyarrow pydicom kornia opencv-python scikit-image

In [None]:
!pip install --user torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 torchtext==0.10.0 pyarrow pydicom kornia opencv-python scikit-image

In [None]:
from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *

import pydicom

In [None]:
!ls ../input

In [None]:
path_csv = Path("../input/meduni-ich-labels-sample/gesamt_labels.csv")

In [None]:
df_labels_all = pd.read_csv(path_csv)

In [None]:
df_labels_all

In [None]:
# The labels of individual students can be selected here.
# To use ALL images, set the student id to a negative number (e.g. -1):
# presumably no student has a negative id, so nothing matches and the
# fallback branch below selects every available label.
student_ids = [-1]

mask = df_labels_all["Student_ID"].isin(student_ids)

df_labels = df_labels_all[mask]
n = len(df_labels)
if n:
    print(f"Studenten Nr {student_ids} => {n} Labels\n")
    # A bare expression inside an `if` block is not rendered by Jupyter,
    # so the selection has to be displayed explicitly.
    display(df_labels)
else:
    # No row matched: fall back to the complete label table.
    df_labels = df_labels_all
    print(f"Studenten Nr {student_ids} => keine Labels gefunden")
    print(f"daher nehmen wir alle verfügbaren Labels: {len(df_labels)} Labels\n")

In [None]:
# Choose here what the labels used by the neural network are called, e.g.
#   Positiv/Negativ
#   Blutung/keine Blutung

# digit -> label name
label_mapping = {0: "Keine Blutung", 1: "Gehirnblutung"}

# Reverse lookup: label name -> digit
label2digit = dict(zip(label_mapping.values(), label_mapping.keys()))

In [None]:
# Directory the DICOM files are read from; stop right away if the path does not exist.
path_data = Path("../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/stage_2_train/")
assert path_data.exists()

In [None]:
# Fastai überprüft normalerweise ob die Dateien wirklich korrekt sind
# das macht das Laden der DICOMs ziemlich langsam
# zudem verwenden wir hier ein professionell erstelltes Datenset
# daher beschleunigen wir das Laden der Daten indem wir hier die eingebauten Funktionen von fastai durch unsere eigenen ersetzen
# zudem bauen wir ein Limit ein falls wir nicht alle DICOMs laden wollen

def get_files(path, extensions=None, folders=None, followlinks=True, limit=None):
    """Return the entries directly inside `path`, optionally filtered by `extensions`.

    Simplified replacement for fastai's `get_files`: the directory entries are not
    checked for being regular files and sub-directories are not searched.

    Args:
        path: directory to list.
        extensions: extensions including the leading dot (e.g. ".dcm"), compared
            case-insensitively. If empty or `None`, every non-hidden entry is kept.
        folders: accepted only to keep the original signature; ignored.
        followlinks: accepted only to keep the original signature; ignored.
        limit: maximum number of matching files to return; `None` or 0 means no limit.

    Returns:
        An `L` of paths (`path` joined with each selected entry name).
    """
    path = Path(path)
    extensions = {e.lower() for e in setify(extensions)}

    # The context manager releases the directory handle as soon as the names are read.
    with os.scandir(path) as entries:
        names = [entry.name for entry in entries]

    res = _get_files(path, names, extensions)

    # Limit AFTER filtering, so hidden or non-matching entries do not eat into the quota.
    if limit:
        res = res[:limit]

    return L(res)


def _get_files(p, fs, extensions=None):
    "Join `p` with every name in `fs` that is not hidden and whose extension is in `extensions` (all non-hidden names if `extensions` is empty)."
    base = Path(p)

    def _wanted(name):
        # Names starting with a dot are always skipped.
        if name.startswith('.'):
            return False
        # No extension filter: keep everything else.
        if not extensions:
            return True
        # The extension is whatever follows the last dot, lower-cased.
        suffix = '.' + name.split(".")[-1].lower()
        return suffix in extensions

    return [base/name for name in fs if _wanted(name)]


def get_dicom_files(path, folders=None, limit=None):
    "Return what `get_files` finds in `path` for the extensions `.dcm` and `.dicom`; `folders` and `limit` are passed through."
    dicom_extensions = [".dcm", ".dicom"]
    return get_files(path, extensions=dicom_extensions, folders=folders, limit=limit)

In [None]:
%%time

# Listing all DICOMs still takes a while; `limit=None` means no limit is applied.
dicoms = get_dicom_files(path_data, limit=None)

In [None]:
dicoms

In [None]:
# Read the second listed file as an example.
sample = dicoms[1].dcmread()

In [None]:
sample.show()

In [None]:
class BrainWindow(PILBase):
    """Image type that opens a DICOM and stores it as an 8-bit image of one fixed window.

    The window is hard-coded to level 40 / width 80 (see `create`).
    """
    _open_args = {}
    _tensor_cls = TensorDicom
    _show_args = TensorDicom._show_args

    @classmethod
    def create(cls, fn:(Path,str,bytes), mode=None) -> None:
        """Load the DICOM given by `fn` and return it as a windowed `BrainWindow`.

        Args:
            fn: path of a DICOM file (`Path` or `str`) or the raw bytes of one.
            mode: not used by this implementation.

        Returns:
            A `BrainWindow` built from the windowed pixel data.

        Raises:
            TypeError: if `fn` is none of the supported types.

        NOTE(review): the signature, including the `-> None` return annotation, is
        left untouched on purpose; fastai appears to inspect these annotations
        (its own `create` methods look the same) - confirm before changing them.
        """
        if isinstance(fn,bytes):
            im = pydicom.dcmread(pydicom.filebase.DicomBytesIO(fn))
        elif isinstance(fn,(Path,str)):
            im = Path(fn).dcmread()
        else:
            # Without this branch the code below failed with an UnboundLocalError on `im`.
            raise TypeError(f"fn must be a Path, str or bytes, got {type(fn).__name__}")

        # Presumably `windowed` yields values in [0, 1] (TODO confirm); scale to 0..255 and store as uint8.
        scaled = np.array(im.windowed(l=40, w=80).numpy()) * 255
        scaled = scaled.astype(np.uint8)

        return cls(Image.fromarray(scaled))

In [None]:
# Describes how one item (a row of [file name, label]) is turned into an image/category pair.
ich_datablock = DataBlock(
    blocks=(ImageBlock(cls=BrainWindow), CategoryBlock),
    # Element 0 of an item is the file name; it is looked up inside `path_data` with a ".dcm" suffix.
    get_x=lambda df: (path_data/f"{df[0]}").with_suffix(".dcm"),
    # Element 1 of an item is the label digit; it is translated to its name via `label_mapping`.
    get_y=lambda df:label_mapping[df[1]],
    batch_tfms=[*aug_transforms(size=224), Normalize.from_stats(*imagenet_stats)],
    # 20 % of the items form the validation set; the seed is fixed.
    splitter=RandomSplitter(valid_pct=0.2, seed=42)
)

In [None]:
# The column order ("Datei", "Label") must match the positions used by `get_x` / `get_y` above.
dataloader = ich_datablock.dataloaders(df_labels[["Datei", "Label"]].values, num_workers=1) #, bs=64)

In [None]:
# Count the items of the underlying datasets. `len(loader) * bs` is the number of
# batches times the batch size, which is off whenever the number of items is not an
# exact multiple of the batch size.
size_train = len(dataloader.train_ds)
size_valid = len(dataloader.valid_ds)

print(f"Trainingsdatenset enthält {size_train} dicoms")
print(f"Validierungsdatenset enthält {size_valid} dicoms")

In [None]:
# Show a batch of the training data.
dataloader.train.show_batch()

In [None]:
# Learner built on `resnet34`, with accuracy as its metric.
learner = cnn_learner(dataloader, resnet34, metrics=accuracy)

In [None]:
# Set the number of training epochs here
# (best to try 1 epoch first to get an idea of the run time).
learner.fine_tune(5)

In [None]:
interp = Interpretation.from_learner(learner)

In [None]:
# fastai's plot_top_losses function is currently broken, hence our own version here
def plot_top_losses_fix(interp, k, largest=True, **kwargs):
        """Plot the `k` items selected by `interp.top_losses(k, largest)`.

        Args:
            interp: interpretation object; its `top_losses`, `inputs`, `targs`,
                `decoded`, `preds` and `dl` members are used.
            k: number of items to plot.
            largest: forwarded to `interp.top_losses`.
            **kwargs: forwarded to `plot_top_losses`.

        Side effect: `interp.inputs` is replaced by a 1-tuple if it was not a tuple.
        Nothing is plotted when `_pre_show_batch` returns `None` for the items.
        """
        losses, idx = interp.top_losses(k, largest)
        
        # Normalise the inputs to a tuple so that both branches below can iterate over them.
        if not isinstance(interp.inputs, tuple): 
            interp.inputs = (interp.inputs,)
            
        # Tensor inputs can be indexed directly with `idx` ...
        if isinstance(interp.inputs[0], Tensor):
            inps = tuple(o[idx] for o in interp.inputs)
            
        # ... anything else is picked item by item and turned into a batch by the dataloader.
        else:
            inps = interp.dl.create_batch(interp.dl.before_batch([tuple(o[i] for o in interp.inputs) for i in idx]))
            
        # Batch of (inputs, targets) for the selected items.
        b = inps + tuple(o[idx] for o in (interp.targs if is_listy(interp.targs) else (interp.targs,)))
        
        x,y,its = interp.dl._pre_show_batch(b, max_n=k)
        
        # Batch of (inputs, decoded predictions) for the same items.
        b_out = inps + tuple(o[idx] for o in (interp.decoded if is_listy(interp.decoded) else (interp.decoded,)))
        
        x1,y1,outs = interp.dl._pre_show_batch(b_out, max_n=k)
        
        if its is not None:
            #plot_top_losses(x, y, its, outs.itemgot(slice(len(inps), None)), L(self.preds).itemgot(idx), losses,  **kwargs)
            plot_top_losses(x, y, its, outs.itemgot(slice(len(inps), None)), interp.preds[idx], losses,  **kwargs)
        #TODO: figure out if this is needed
        #its None means that a batch knows how to show itself as a whole, so we pass x, x1
        #else: show_results(x, x1, its, ctxs=ctxs, max_n=max_n, **kwargs)


In [None]:
# Show the images whose predictions were "most wrong"
plot_top_losses_fix(interp, 9, figsize=(15, 15))

In [None]:
# CSV with the test set; stop right away if the file does not exist.
path_test_set = Path("../input/meduni-ich-labels-sample/test_fixed.csv")
assert path_test_set.exists()

In [None]:
# !cat {path_test_set} | sed -E "s/(dcm);([01])/\1:\2/" | sed "s/;//g" | sed 's/"//g' | sed -E ":a;N;$!ba;s/dcm\n/dcm:/g"

In [None]:
# The file is read without a header row and with ";" as separator;
# its two columns are named "Datei" and "Label".
testset_raw = pd.read_csv(path_test_set, sep=";", header=None)
testset_raw.columns = ["Datei", "Label"]

# Index the rows by "Datei" while keeping it available as a regular column.
df_testset = testset_raw.set_index("Datei", drop=False)
df_testset.head()

In [None]:
def get_test_dataset(n=200):
    "Return every entry of the `Datei` column of `df_testset` as a list; `n` is currently not used."
    # Alternatives that were tried before:
    #     return list(df_testset.sample(n)["Datei"])
    #     return list(df_labels["Datei"])[:200]
    return [name for name in df_testset["Datei"]]

In [None]:
# `anzahl_test_dicoms` was only assigned in a commented-out line, so passing it raised a
# NameError on a fresh kernel. `get_test_dataset` returns the complete test set
# regardless of its argument, so it is simply called without one.
test_set = get_test_dataset()
len(test_set)

In [None]:
# First five entries of the test set.
test_set[:5]

In [None]:
# Show the first test DICOM ...
(path_data/test_set[0]).dcmread().show()

In [None]:
# ... and run the trained learner on it.
learner.predict(path_data/test_set[0])

In [None]:
# df_ground_truth = pd.read_csv("../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/stage_2_train.csv")

In [None]:
# mask = df_ground_truth["ID"].str.endswith("any")
# df_ground_truth = df_ground_truth[mask]
# f"Labels gefunden für {len(df_ground_truth)} dicoms"

In [None]:
# df_ground_truth["fname"] = df_ground_truth["ID"].str.rsplit("_", 1, expand=True)[0]
# df_ground_truth.set_index("fname", inplace=True)
# df_ground_truth.head()

In [None]:
def compare(fname):
    "Return `(y, y_hat)` for `fname`: the `Label` stored in `df_testset` and the digit of the label predicted by `learner`."
    stem = Path(fname).stem
    dicom_path = (path_data/stem).with_suffix(".dcm")

    # The first element of the prediction is the label name; map it back to its digit.
    predicted_label = learner.predict(dicom_path)[0]
    y_hat = label2digit[predicted_label]

    # `df_testset` is indexed by the file name including the ".dcm" suffix.
    y = df_testset.loc[f"{stem}.dcm", "Label"]
    return (y, y_hat)

In [None]:
# Spot check for a single file; the result is the tuple (label from the test set, predicted digit).
compare('ID_b08fb0feb.dcm')

## Statistische Auswertung

In [None]:
# Counters for false/true positives (FP/TP) and false/true negatives (FN/TN).
result = dict.fromkeys(("FP", "TP", "FN", "TN"), 0)

# (label is positive, prediction is positive) -> counter to increase
outcome_of = {
    (True, True): "TP",
    (False, False): "TN",
    (True, False): "FN",
    (False, True): "FP",
}

for file in test_set:
    y, y_hat = compare(file)
    result[outcome_of[bool(y), bool(y_hat)]] += 1

In [None]:
# Show the raw counters collected above.
result

In [None]:
import seaborn as sns

# Rows: prediction of the network, columns: label from the test set.
counts = [result[key] for key in ("TP", "FP", "FN", "TN")]
confusion_matrix = [counts[:2], counts[2:]]

df_cm = pd.DataFrame(
    confusion_matrix,
    columns=["real pos", "real neg"],
    index=["NN pos", "NN neg"],
)

plt.figure(figsize=(5, 5))  # the figure size can be changed here
sns.set(font_scale=2)
sns.heatmap(df_cm, annot=True, fmt="d")