In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os                                   # Iterate over dataset directories
import numpy as np                          # Linear algebra
import pandas as pd                         # Data processing (read labels CSV)
import cv2 as cv                            # Opencv for image files
import pydicom                              # Read dcm files
from sklearn.cluster import MiniBatchKMeans # Create bag of visual words
from sklearn.svm import SVC                 # Classifier
import pickle                               # Serialize and save features extracted from dataset images

# Helper function

In [None]:
# Function to convert dcm pixel array to 8-bit grayscale image
def dcmToGray(dcm):
    """Convert the pixel data of a DICOM dataset to an 8-bit grayscale image.

    Pixel values are scaled linearly so that the brightest pixel maps to 255.
    Negative values are clamped to 0: casting a negative float to uint8 wraps
    around or is platform-dependent, which would turn the darkest pixels into
    arbitrary gray levels.

    Parameters
    ----------
    dcm : object
        Anything exposing a numeric ``pixel_array`` attribute (e.g. the
        dataset returned by ``pydicom.dcmread``).

    Returns
    -------
    numpy.ndarray
        uint8 array with the same shape as ``dcm.pixel_array``. An image whose
        maximum is <= 0 has no positive signal and is returned all black.
    """
    image = dcm.pixel_array
    peak = np.amax(image)

    # Nothing to scale by: avoids dividing by zero and, for an all-negative
    # image, dividing by a negative maximum (which would invert the scale).
    if peak <= 0:
        return np.zeros(image.shape, dtype=np.uint8)

    # Same arithmetic as before for non-negative images; the division yields a
    # fresh float array, so clamping it in place never touches dcm.pixel_array.
    scaled = image/peak*255
    scaled[scaled < 0] = 0
    gray = np.uint8(scaled)
    return gray

# Important constants

In [None]:
# Paths to training and test data
train_path =  "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train"
test_path  = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/test"

# Subdirectories inside each directory of dataset
subdirs = ["/FLAIR", "/T1w", "/T1wCE", "/T2w"]

# Sizes of training and test set (number of sample directories directly
# below each path)
train_size = len(next(os.walk(train_path))[1])
test_size  = len(next(os.walk(test_path))[1])

# Feature detector (keeps at most 64 keypoints per image)
detector = cv.ORB_create(64)

# Size of an image descriptor (e.g. 128 for SIFT, 32 for ORB).
# Queried from the detector so that swapping the detector above cannot leave
# a stale width behind; the descriptor matrices are stacked with this width.
descriptor_size = detector.descriptorSize()

# Size of visual vocabulary
vocab_size = 2000

# Creating vocabulary of visual words

In [None]:
# Populating array of visual features with the descriptors computed by the defined detector

# Each element of this list is an array of all the descriptor arrays
# computed by the detector for every image of each sample.
features_per_sample = []

# Enumerate the sample directories that actually exist, in ascending numeric
# order (the same order in which consecutive ids were probed before).
# Probing ids 0, 1, 2, ... until `train_size` directories were found never
# terminated when train_path held a directory that is not a numeric id.
sample_names = sorted(
    (name for name in os.listdir(train_path)
     if name.isdecimal() and os.path.isdir(train_path + '/' + name)),
    key=int)

for sample_name in sample_names:
    # Current directory
    curr_dir = train_path + '/' + sample_name

    # Descriptor arrays of every image of the current sample. They are
    # collected in a list and stacked once at the end, because stacking after
    # every image re-copies everything gathered so far. The leading empty
    # float array keeps the result two-dimensional (and float) even when no
    # image of the sample yields a descriptor.
    curr_blocks = [np.array([]).reshape(0,descriptor_size)]

    # Process the images from each subdirectory in the current dir
    for subdir in subdirs:
        curr_subdir = curr_dir+subdir
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order of the descriptors reproducible.
        for filename in sorted(os.listdir(curr_subdir)):
            dcm  = pydicom.dcmread(curr_subdir+'/'+filename)
            gray = dcmToGray(dcm)
            keypoints, descriptors = detector.detectAndCompute(gray,None)
            # The detector returns None when it finds no keypoints
            if descriptors is not None:
                curr_blocks.append(descriptors)

    features_per_sample.append(np.vstack(curr_blocks))

In [None]:
# Group all features to run clustering in order to get bag of visual words.
# All per-sample arrays are stacked in one call: stacking inside a loop copies
# the whole, ever-growing matrix once per sample. The leading empty array
# keeps the result a (0, descriptor_size) float matrix when there are no samples.
all_features = np.vstack(
    [np.array([]).reshape(0,descriptor_size)] + list(features_per_sample)
)

In [None]:
# Clustering all the features obtained with the detector
# The centroids will be the visual words of the vocabulary
kmeans = MiniBatchKMeans(n_clusters   = vocab_size,
                         batch_size   = vocab_size//10,
                         verbose      = False,
                         init         = 'k-means++',
                         n_init       = 3,
                         # NOTE(review): a single pass over the data looks like
                         # a runtime trade-off - confirm before raising it.
                         max_iter     = 1,
                         # Fixed seed: initialisation and mini-batch sampling
                         # are random, so without it the vocabulary (and every
                         # histogram built on it) changes on each run.
                         random_state = 0)

# fit() returns the fitted estimator itself, so `vocab` and `kmeans` refer to
# the same object.
vocab = kmeans.fit(all_features)

# Create the visual words histogram for each sample

In [None]:
# Training set
# One normalised histogram of visual words per sample: entry k is the fraction
# of the sample's descriptors assigned to visual word k.
histograms = []
for sample_features in features_per_sample:

    n_features = sample_features.shape[0]

    if n_features == 0:
        # No descriptor was found in any image of this sample. predict()
        # rejects an empty matrix, so use an all-zero histogram instead.
        sample_hist = np.zeros(vocab_size)
    else:
        visual_word_indexes = vocab.predict(sample_features)
        # Count the occurrences of every word in one vectorised call;
        # minlength keeps the histogram at vocab_size entries even when the
        # highest words are unused.
        sample_hist = np.bincount(visual_word_indexes, minlength=vocab_size)/n_features

    histograms.append(sample_hist)

X_train = np.array(histograms)

In [None]:
# Test set
histograms = []
test_sample_ids = []

# Enumerate the sample directories that actually exist, in ascending numeric
# order (the same order in which consecutive ids were probed before).
# Probing ids 0, 1, 2, ... until `test_size` directories were found never
# terminated when test_path held a directory that is not a numeric id.
test_names = sorted(
    (name for name in os.listdir(test_path)
     if name.isdecimal() and os.path.isdir(test_path + '/' + name)),
    key=int)

for sample_name in test_names:
    # Current directory
    curr_dir = test_path + '/' + sample_name

    # Ids are reported zero-padded to five digits
    test_sample_ids.append('{0:05d}'.format(int(sample_name)))

    # Descriptor arrays of every image of the current sample, stacked once at
    # the end (stacking after every image re-copies everything gathered so
    # far). The leading empty float array keeps the result two-dimensional
    # even when no image yields a descriptor.
    curr_blocks = [np.array([]).reshape(0,descriptor_size)]

    # Process the images from each subdirectory in the current dir
    for subdir in subdirs:
        curr_subdir = curr_dir+subdir
        # Sorted so that the row order of the descriptors is reproducible
        for filename in sorted(os.listdir(curr_subdir)):
            dcm  = pydicom.dcmread(curr_subdir+'/'+filename)
            gray = dcmToGray(dcm)
            keypoints, descriptors = detector.detectAndCompute(gray,None)
            # The detector returns None when it finds no keypoints
            if descriptors is not None:
                curr_blocks.append(descriptors)

    curr_features = np.vstack(curr_blocks)
    n_features    = curr_features.shape[0]

    if n_features == 0:
        # No descriptor at all: predict() rejects an empty matrix, so the
        # sample gets an all-zero histogram.
        sample_hist = np.zeros(vocab_size)
    else:
        visual_word_indexes = vocab.predict(curr_features)
        # Fraction of the sample's descriptors assigned to each visual word
        sample_hist = np.bincount(visual_word_indexes, minlength=vocab_size)/n_features

    histograms.append(sample_hist)

X_test = np.array(histograms)
test_sample_ids = np.array(test_sample_ids)

# Classifier

In [None]:
# Reading labels
labels_table = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")

# Look the label (second column) up by sample id (first column) instead of
# relying on the CSV rows being in the same order as the sample directories.
label_by_id = dict(zip(labels_table.iloc[:,0].astype(int), labels_table.iloc[:,1]))

# Ids of the training samples in ascending numeric order, i.e. the order in
# which the feature extraction walks the sample directories.
train_ids = sorted(int(name) for name in next(os.walk(train_path))[1] if name.isdecimal())

# Fail loudly rather than train on shifted labels
unlabelled = [sample_id for sample_id in train_ids if sample_id not in label_by_id]
if unlabelled:
    raise ValueError("No label found for training samples: {}".format(unlabelled))

labels = np.array([label_by_id[sample_id] for sample_id in train_ids])

# First 80% of the samples for training, the rest for validation
split_index  = int(0.8*train_size)
train_labels = labels[0:split_index]
valid_labels = labels[split_index:train_size]

In [None]:
# Reading training, validation and test data
# First 80% of the samples for training, the rest for validation
split_index = int(0.8*train_size)

# X_train is overwritten with its own first part, so running this cell a
# second time used to slice the already shortened matrix and leave X_valid
# empty. Split only while X_train still holds one row per extracted sample.
if X_train.shape[0] == len(features_per_sample):
    X_valid = X_train[split_index:train_size,:]
    X_train = X_train[0:split_index,:]

In [None]:
# Fitting classifier
# probability=True enables predict_proba; the probability calibration relies
# on an internal, randomised cross-validation, so the seed is fixed to make
# the predicted probabilities reproducible between runs.
svc = SVC(probability=True, random_state=0)
svc.fit(X_train, train_labels)

In [None]:
# Validating
# Score of the fitted classifier on the held-out validation split
score = svc.score(X=X_valid, y=valid_labels)
print(score)

In [None]:
# Predictions
# Class probabilities for every test sample: one row per sample, one column
# per class known to the classifier
pred = svc.predict_proba(X=X_test)
print(pred)

In [None]:
# Write submission file
# One row per test sample: its id and the second column of the predicted
# class probabilities
submission = pd.DataFrame({
    "BraTS21ID":  test_sample_ids,
    "MGMT_value": pred[:,1],
})
submission.to_csv("./submission.csv", index=False)