# First Spike

This is a [spike solution](https://proxy.c2.com/cgi/wiki?SpikeSolution) to verify that I can achieve a few technical objectives.
## Objectives

**Description** | **Status** | **Remarks**
----------------------------------|--------|--------------------------------------
Read and plot data|OK|
Verify that I can write a submission.csv|OK|See section 'Save Submission'
Understand how all the images for one patient fit together|WIP|
Understand whether there is a 1 to 1 correspondence between the 4 series for one study|OK|The lengths of the 4 series for study 00688 are 196, 214, 214, 376. After removing blank images they are 160, 140, 139, 279. So it looks as if there is _not_ a 1 to 1 correspondence.
Understand metadata|Closed|[Show Patient Trajectories](https://www.kaggle.com/weka511/show-patient-trajectories)


In [None]:
from matplotlib.pyplot import axes, cm, figure, imshow, savefig, subplots, suptitle, title
from numpy             import log
from pydicom           import dcmread
from os                import sep, walk
from os.path           import join, normpath
from pandas            import DataFrame, read_csv
from random            import choice, sample

## Hyperparameters

In [None]:
class Hyperparameter:
    """Namespace for the tunable settings of this notebook."""

    # Maximum records while testing
    N = 12

## Build list of datasets to process

### Data model

The training and test datasets each consist of a collection of _studies_ each for a single patient; a study is identified by a _label_ consisting of 5 digits.  Each study has a _label_, and the objective is to make a prediction for each label (or study)--[David Roberts](https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification/discussion/252972#1387906).
Each study contains 4 series--[Reuben Schmidt](https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification/discussion/252972#1388006)

- Fluid Attenuated Inversion Recovery (FLAIR)
- T1-weighted pre-contrast (T1w)
- T1-weighted post-contrast (T1wCE)
- T2-weighted (T2w)

>in T2 images water is bright, and in T1 images fat is bright. In FLAIR, cerebrospinal fluid is dark but the rest of the image looks like T2


In [None]:
# Study
#
# Holds the image file paths of one MRI Study, grouped by series

class Study:
    """File paths of the images of one MRI study, one list per series.

    Attributes:
        image_id: identifier of the study, as passed to the constructor.
        series:   list of four lists of paths, indexed by FLAIR, T1w,
                  T1wCE and T2w.
    """

    # Position of each series within self.series
    FLAIR, T1w, T1wCE, T2w = range(4)

    # Translates a series name into its position within self.series
    series_id = dict(FLAIR=FLAIR, T1w=T1w, T1wCE=T1wCE, T2w=T2w)

    def __init__(self,image_id):
        """Create a study with four empty series."""
        self.image_id = image_id
        self.series   = [[] for _ in range(4)]

    def add(self,series_id,path):
        """Append the file at `path` to the series numbered `series_id`."""
        self.series[series_id].append(path)

    def compress(self):
        """Drop, from every series, the files whose pixel data sums to zero or less.

        Each file is read with dcmread in order to inspect its pixels.

        See https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification/discussion/252968
        """
        for index, paths in enumerate(self.series):
            kept = []
            for path in paths:
                if dcmread(path).pixel_array.sum()>0:
                    kept.append(path)
            self.series[index] = kept

    def length(self):
        """Return the number of files in each series, as a list of four ints."""
        return list(map(len, self.series))

        
# parse_path
#
# Classify a file as test or training, and extract its study and series,
# by looking at the folders in its path

def parse_path(path):
    """Split `path` into folders and work out where the file belongs.

    Parameters:
        path: path of a file.

    Returns:
        A tuple (path, train, test, image_id, series_id) where
          path      is the argument, unchanged
          train     is True if some component of the path is 'train'
          test      is True if some component of the path is 'test'
          image_id  is the first all-numeric component, or None if there is none
          series_id is the Study index of the last component that names a
                    series, or None if there is none
    """
    folders   = normpath(path).split(sep)
    image_id  = None
    series_id = None
    for folder in folders:
        if folder in Study.series_id:
            series_id = Study.series_id[folder]
        # Only the first numeric component counts as the study identifier
        if image_id is None and folder.isnumeric():
            image_id = folder
    return path,'train' in folders,'test' in folders,image_id,series_id

# get_seq
#
# Sort key for image files: the sequence number embedded in the file name

def get_seq(filename):
    """Return the trailing number of a name such as Image-13.dcm.

    The text before the first '.' is taken, and whatever follows its last '-'
    is converted to an int. If that conversion fails, the file name itself is
    returned unchanged.
    """
    stem   = filename.partition('.')[0]  # Image-13.dcm -> Image-13
    suffix = stem.rpartition('-')[2]     # Image-13     -> 13
    try:
        sequence = int(suffix)
    except ValueError:
        return filename
    return sequence
    
train_labels    = None # Targets for training
training_agenda = {}   # Studies that need to be processed for training, keyed by study id
testing_agenda  = {}   # Studies that need to be processed for testing, keyed by study id

# Walk the input tree, visiting the files of each folder in the order given by
# get_seq, and file every image under its study and series.
for dirname, _, filenames in walk('/kaggle/input'):
    for filename in sorted(filenames,key=get_seq):
        full_path = join(dirname, filename)
        if filename == 'train_labels.csv':
            train_labels = read_csv(full_path,dtype={'BraTS21ID':str})
        path,train,test,image_id,series_id = parse_path(full_path)
        # A file that lacks a study folder or a series folder cannot be filed:
        # adding it would either fail inside Study.add or create an agenda
        # entry keyed by None, so skip it.
        if image_id is None or series_id is None:
            continue
        if train:
            if image_id not in training_agenda:
                training_agenda[image_id] = Study(image_id)
            training_agenda[image_id].add(series_id,path)
        if test:
            if image_id not in testing_agenda:
                testing_agenda[image_id] = Study(image_id)
            testing_agenda[image_id].add(series_id,path)
  
    

## Plot stuff

1. Verify that I can read and plot an image
2. Establish criteria for identifying that an image is blank

In [None]:
# plot_study
#
# Display all images in one series of a study

def plot_study(study, series_name='FLAIR',ncols = 8,threshold=10):
    """Plot every image of one series of `study` in a grid.

    Parameters:
        study:       the Study whose images are to be shown.
        series_name: name of the series to show; must be a key of Study.series_id.
        ncols:       number of images per row of the grid.
        threshold:   not used; retained so that existing calls keep working.

    Nothing is plotted if the series contains no files. Grid cells beyond the
    last image are left blank.
    """
    series_id = Study.series_id[series_name]
    series    = study.series[series_id]
    if not series:
        return    # an empty grid cannot be created, and there is nothing to show

    nrows = (len(series) + ncols - 1)//ncols
    # squeeze=False keeps axs two dimensional even when there is only one row
    fig,axs = subplots(nrows=nrows,ncols=ncols,figsize=(20,20*nrows/ncols),squeeze=False)
    for k,ax in enumerate(axs.flat):    # row by row, so k == row * ncols + column
        if k<len(series):
            dcim = dcmread(series[k])
            ax.imshow(dcim.pixel_array, cmap=cm.gray)
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)
            ax.axis('tight')
            last_ax = ax
        else:
            ax.set_visible(False)       # cell past the end of the series

    fig.suptitle(f'Study {study.image_id}, series {series_name}')
    fig.tight_layout()
    fig.subplots_adjust(top=0.95) # https://stackoverflow.com/questions/39331143/huge-space-between-title-and-plot-matplotlib
    # Label the last image with the patient identifier read from its file
    last_ax.set_title(dcim.PatientID)
  
# Show every series of the first study in the training agenda.
# Blank images are left in; call study.compress() before plotting to drop them.
study_id = list(training_agenda)[0]
study    = training_agenda[study_id]

for series_name in Study.series_id:
    plot_study(study, series_name=series_name)



## Save submission

This is just a stub to verify that I can write a submission in the correct format. I'm going to assign MGMT values using the probability of 0 or 1 established from the training dataset.

In [None]:
# One row per test study: the value is 0 when a random draw from
# range(287+307) falls below 287, otherwise 1.
data = [[image_id, int(choice(range(287+307)) >= 287)]
        for image_id in testing_agenda]

xsubmission = DataFrame(data, columns=['BraTS21ID', 'MGMT_value'])
xsubmission.to_csv('submission.csv', index=False)