In [None]:
import os
import gc
import cv2
import cuml
import glob
import numpy as np
import pandas as pd
from numba import cuda
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# List the contents of the competition input directory.
!ls ../input/ranzcr-clip-catheter-line-classification

# Load train and test as DataFrames

In [None]:
# Read the training table and the sample submission; the sample submission is
# used as the `test` frame throughout the notebook.
train = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
test = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/sample_submission.csv')

# Report (rows, columns) of each frame, then preview the first ten training rows.
for frame in (train, test):
    print(frame.shape)
train.head(10)

# Check distribution of labels in train

In [None]:
# Per-column mean of the training table (for 0/1 label columns this is the
# fraction of positive rows).
# numeric_only=True restricts the reduction to numeric columns: `train` also
# holds string identifiers such as StudyInstanceUID, and pandas >= 2.0 raises a
# TypeError when asked to average them.
train.mean(numeric_only=True)

# Check first image in train

In [None]:
# Read the image belonging to the first row of `train` and display it.
first_uid = train.StudyInstanceUID.values[0]
img = cv2.imread('../input/ranzcr-clip-catheter-line-classification/train/' + first_uid + '.jpg')
plt.imshow(img)

In [None]:
import ast

# Annotation table: one row per annotated trace, with the point list stored as
# text in the `data` column (it is parsed in the mask-building cell).
annotations_csv = '../input/ranzcr-clip-catheter-line-classification/train_annotations.csv'
annot = pd.read_csv(annotations_csv)
print(annot.shape)
annot.head()

# Compute the average catheter position across all annotations, to be used as a mask.

In [None]:
# Accumulate every annotated trace: each one is drawn on a blank canvas the size
# of its source image, shrunk to 512x512, and summed into RES. Dividing by the
# number of annotations afterwards gives the average trace intensity per pixel.
RES = np.zeros( (512,512) )
for row in tqdm(range(annot.shape[0])):
    img = cv2.imread('../input/ranzcr-clip-catheter-line-classification/train/'+annot.StudyInstanceUID.values[row]+'.jpg')
    # The image is only read to obtain a canvas of the right shape; wipe it to black.
    img[:] = 0
    # The column holds a Python-literal list of points. literal_eval parses it
    # without executing arbitrary code, which eval() on file contents would allow.
    data = ast.literal_eval(annot.data.values[row])
    # Draw a 20-pixel-wide white segment between each pair of consecutive points.
    # NOTE(review): each point is passed to cv2.line as (p[1], p[0]); cv2 expects
    # (x, y), so confirm whether the annotation pairs are stored as [y, x] or [x, y].
    for start, end in zip(data[:-1], data[1:]):
        img = cv2.line(img, (start[1],start[0]), (end[1],end[0]), (255,255,255), 20 )
    img = cv2.resize(img,(512,512))
    RES += img[:,:,0]

RES /= annot.shape[0]

In [None]:
# Show the averaged trace map, saturating every value at or above 1 so that
# faint regions remain visible.
plt.imshow(RES.clip(0, 1))

In [None]:
# Binarise the averaged trace map (1 where the average exceeds 0.5, 0 elsewhere)
# and repeat it along a third axis so it can be multiplied element-wise with a
# 3-channel uint8 image.
mask = np.dstack( [(RES > 0.5).astype(np.uint8)] * 3 )

# The accumulator is no longer needed once the mask exists.
del RES
gc.collect()
plt.imshow(mask)

# Let's extract features from the images using transfer learning from pretrained ImageNet models.

In [None]:
import keras
from keras.applications.mobilenet import preprocess_input

# Display the names exposed by keras.applications.
dir(keras.applications)

In [None]:
# List the contents of the pretrained-weights input directory.
!ls ../input/keras-pretrained-models/

In [None]:
import keras
# Take preprocess_input from the module of the architecture that is actually
# instantiated below. The Xception and MobileNet helpers both scale pixels to
# [-1, 1], so the extracted features are unchanged; the import now simply
# matches the model.
from keras.applications.xception import preprocess_input

# Instantiate the full Xception classifier without weights; the top is kept so
# that the weight file below (saved with the classification head) loads as-is.
# NOTE(review): with include_top=True and no input_shape the network is built
# for its default input size, while the extraction loops feed 512x512 images.
# Confirm the installed Keras version accepts that.
base = keras.applications.Xception( weights=None,  include_top=True)

# Load pretrained imagenet weights
base.load_weights('../input/keras-pretrained-models/Xception_Top_ImageNet.h5')
base.trainable = False

# Feature extractor: same input as `base`, but stopping at the 'avg_pool' layer
# instead of the classification output.
model = keras.Model(inputs=base.input, outputs=base.get_layer('avg_pool').output)

# An inefficient but easy-to-understand `for` loop to extract features from the train images

In [None]:
train_path = '../input/ranzcr-clip-catheter-line-classification/train/'

# One 2048-wide float32 feature row per training image, in the row order of `train`.
train_uids = train.StudyInstanceUID.values
emb_train = np.zeros( (len(train_uids), 2048), dtype=np.float32 )
for row, uid in enumerate(tqdm(train_uids)):
    picture = cv2.resize(cv2.imread(train_path + uid + '.jpg'), (512,512))
    # Zero out every pixel outside the averaged catheter region.
    picture = picture * mask
    # Scale the pixels for the network and add the leading batch axis.
    batch = np.expand_dims(preprocess_input(picture), axis=0)
    emb_train[row] = model.predict(batch)[0]

gc.collect()

# Extract features from test images

In [None]:
test_path = '../input/ranzcr-clip-catheter-line-classification/test/'

# One 2048-wide float32 feature row per test image, in the row order of `test`.
test_uids = test.StudyInstanceUID.values
emb_test = np.zeros( (len(test_uids), 2048), dtype=np.float32 )
for row, uid in enumerate(tqdm(test_uids)):
    picture = cv2.resize(cv2.imread(test_path + uid + '.jpg'), (512,512))
    # Zero out every pixel outside the averaged catheter region.
    picture = picture * mask
    # Scale the pixels for the network and add the leading batch axis.
    batch = np.expand_dims(preprocess_input(picture), axis=0)
    emb_test[row] = model.predict(batch)[0]

gc.collect()

# Delete model and release memory

In [None]:
# `model` was built from `base`'s input and its 'avg_pool' layer, so `base`
# still references the whole network. Both names have to be dropped before the
# garbage collector can reclaim it.
del model, base
gc.collect()
# Reset Keras' global state, then collect whatever that released.
keras.backend.clear_session()
gc.collect()

# I found this trick to release all the GPU memory allocated by Keras.

In [None]:
# Select GPU 0 through numba, close the CUDA context, then select GPU 0 again
# so that a context is available to the cells that follow.
# NOTE(review): the statement order matters; this is used here as a way to hand
# back GPU memory that Keras allocated. Confirm it behaves that way with the
# installed numba/TensorFlow versions.
cuda.select_device(0)
cuda.close()
cuda.select_device(0)

# Check label names

In [None]:
# Label columns are taken positionally: everything between the first and the
# last column of `train`.
# NOTE(review): this relies on the CSV column order (one leading and one
# trailing non-label column). Verify it against the printed names.
targets = train.columns[1:-1]
print(targets)

# Split train and valid set: 95%/5%

In [None]:
# Deterministic hold-out: rows whose position modulo 20 equals 7 (one row in
# twenty) form the validation set; all other rows are used for training.
fold_id = np.arange(emb_train.shape[0]) % 20
valid_index = np.flatnonzero(fold_id == 7)
train_index = np.flatnonzero(fold_id != 7)
len(train_index), len(valid_index)

# Fit each label and predict test using the embeddings features

In [None]:
# Slice the embeddings once. Indexing with an index array copies the data, so
# doing it inside the loop rebuilt the same large arrays for every label.
X_train = emb_train[train_index]
X_valid = emb_train[valid_index]

ytarget = train[targets].values[valid_index]
ypred = np.zeros( (len(valid_index), len(targets)) )
label_auc = []

for n, target in tqdm(enumerate(targets), total=len(targets)):

    # One independent binary random forest per label, trained on the GPU.
    rf = cuml.ensemble.RandomForestClassifier(n_estimators=250, max_features=500, n_bins=16, output_type='numpy')

    rf.fit( X_train, train[target].values[train_index] )

    # Column 1 of predict_proba is kept as the score (the positive class,
    # assuming 0/1 labels). Validation scores go to `ypred`; test scores are
    # written straight into the submission frame.
    ypred[:,n] = rf.predict_proba(X_valid)[:,1]
    test[target] = rf.predict_proba(emb_test)[:,1]

    label_auc.append( roc_auc_score( ytarget[:,n], ypred[:,n] ) )
    print(n, label_auc[-1], target )

    # Free the forest before fitting the next one.
    del rf
    gc.collect()

del X_train, X_valid
gc.collect()

# 'Final AUC' pools every (sample, label) pair into a single ROC curve; the
# second figure is the unweighted mean of the per-label AUCs printed above.
print( 'Final AUC:', roc_auc_score( ytarget.flatten(), ypred.flatten() ) )
print( 'Mean per-label AUC:', np.mean(label_auc) )

In [None]:
# Preview the first rows of `test`, which now carries the predicted label columns.
test.head()

# Check test predictions distribution

In [None]:
# Mean predicted score per label column.
# numeric_only=True skips the string StudyInstanceUID column, which pandas >= 2.0
# would otherwise try to average and raise a TypeError on.
test.mean(numeric_only=True)

# Submit

In [None]:
# Write `test` to submission.csv, leaving out the DataFrame index.
test.to_csv('submission.csv', index=False)