In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%cd gdrive/My Drive/test_colab/ml_project2/src
!pwd

[Errno 2] No such file or directory: 'gdrive/My Drive/test_colab/ml_project2/src'
/content/gdrive/My Drive/test_colab/ml_project2/src
/content/gdrive/My Drive/test_colab/ml_project2/src


In [None]:
# List the contents of ../pneumoscope.
!ls ../pneumoscope

h5_files  npz_files


In [None]:
# Sync the working copy with the remote.
# WARNING: `git reset --hard` throws away uncommitted changes to tracked files before the pull.
!git reset --hard
!git pull

Checking out files: 100% (2592/2592), done.
HEAD is now at 076018f clinical databases
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 8 (delta 6), reused 8 (delta 6), pack-reused 0[K
Unpacking objects: 100% (8/8), done.
From https://github.com/salimbeni1/ml_project2
   076018f..7b67663  main       -> origin/main
Updating 076018f..7b67663
Fast-forward
 src/sound_processing.py | 4 [32m++[m[31m--[m
 1 file changed, 2 insertions(+), 2 deletions(-)


In [None]:
# Long listing (with sizes) of the npz archives in every sub-directory of ../pneumoscope/npz_files
!ls -l ../pneumoscope/npz_files/*/

../pneumoscope/npz_files/GVA/:
total 3079312
-rw------- 1 root root  553904757 Nov 27 16:23 GVA_Ca_Co_test.npz
-rw------- 1 root root 1369343627 Nov 27 16:16 GVA_Ca_train_b1.npz
-rw------- 1 root root 1229965595 Nov 27 16:13 GVA_Co_train_b1.npz

../pneumoscope/npz_files/POA/:
total 21056243
-rw------- 1 root root  546407225 Nov 28 07:42 POA_Ca_Co_test.npz
-rw------- 1 root root 4121620181 Nov 27 15:29 POA_Ca_train_b1.npz
-rw------- 1 root root 4301549167 Nov 27 15:17 POA_Ca_train_b2.npz
-rw------- 1 root root 4519848135 Nov 27 15:41 POA_Ca_train_b3.npz
-rw------- 1 root root 4150727343 Nov 27 15:52 POA_Ca_train_b4.npz
-rw------- 1 root root 3088366560 Nov 27 16:07 POA_Co_train_b1.npz
-rw------- 1 root root  833071644 Nov 27 15:57 POA_Co_train_b2.npz


In [None]:
# keras imports
import keras
from keras.models import Sequential
from keras.layers import Dense, Convolution2D, Flatten, MaxPooling2D, ZeroPadding2D, Dropout, BatchNormalization, Activation
from keras import optimizers
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, CSVLogger, RemoteMonitor, TensorBoard, ReduceLROnPlateau
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
from tqdm.keras import TqdmCallback

In [None]:
# ours
import sound_processing as sp
import utils as ut
import model_tools as mt

# standards
import numpy as np
from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt
import glob

In [None]:
def get_GVA():
  """Load the GVA train and test sets from their npz archives.

  The ``Ca`` and ``Co`` training archives are read (in that order) with
  ``ut.load_from_npz`` and concatenated field by field; the test archive is
  read afterwards and returned as loaded. Of the six values each archive
  yields, only those at positions 0, 2, 3 and 5 are used.

  Returns:
    ``(features, positions, controls, patientnbs), (featuresT, positionsT, controlsT, patientnbsT)``
    i.e. the merged training fields followed by the test fields.
  """
  # Train archives ----------------------------
  ca_feat, _, ca_pos, ca_ctrl, _, ca_pnb = ut.load_from_npz( '../pneumoscope/npz_files/GVA/GVA_Ca_train_b1.npz' )
  co_feat, _, co_pos, co_ctrl, _, co_pnb = ut.load_from_npz( '../pneumoscope/npz_files/GVA/GVA_Co_train_b1.npz' )

  # Merge the two archives: Ca entries first, Co entries after them.
  train_set = tuple(
    np.append(ca_field, co_field)
    for ca_field, co_field in zip((ca_feat, ca_pos, ca_ctrl, ca_pnb),
                                  (co_feat, co_pos, co_ctrl, co_pnb))
  )

  # Test archive ------------------------------
  test_feat, _, test_pos, test_ctrl, _, test_pnb = ut.load_from_npz( '../pneumoscope/npz_files/GVA/GVA_Ca_Co_test.npz'  )
  test_set = (test_feat, test_pos, test_ctrl, test_pnb)

  return train_set, test_set

In [None]:
def get_POA(shrink = True):
  """Load the POA train and test sets, optionally sub-sampling long recordings.

  Reads the ``POA_Ca_train_b1`` and ``POA_Co_train_b1`` archives and the
  ``POA_Ca_Co_test`` archive through ``ut.load_from_npz``; the two training
  archives are concatenated field by field (Ca entries first).

  Args:
    shrink: when true, every entry of the test and train feature arrays whose
      first dimension is larger than 20 is replaced, in place, by 15 of its
      rows picked at random with ``np.random.randint``.

  Returns:
    ``(features, positions, controls, patientnbs), (featuresT, positionsT, controlsT, patientnbsT)``
  """

  # Train -------------------------------------
  feat_POA_Ca, _, posi_POA_Ca, ctrl_POA_Ca, _, nbpa_POA_Ca = ut.load_from_npz( '../pneumoscope/npz_files/POA/POA_Ca_train_b1.npz' )
  feat_POA_Co, _, posi_POA_Co, ctrl_POA_Co, _, nbpa_POA_Co = ut.load_from_npz( '../pneumoscope/npz_files/POA/POA_Co_train_b1.npz' )
  # Merge train batches -----------------------
  features   = np.append(feat_POA_Ca, feat_POA_Co)
  positions  = np.append(posi_POA_Ca, posi_POA_Co)
  controls   = np.append(ctrl_POA_Ca, ctrl_POA_Co)
  patientnbs = np.append(nbpa_POA_Ca, nbpa_POA_Co)

  # Test ---------------------------------------
  featuresT, _, positionsT, controlsT, _, patientnbsT = ut.load_from_npz( '../pneumoscope/npz_files/POA/POA_Ca_Co_test.npz'  )

  # Shrink too big features --------------------
  if (shrink):
    # Test set first, then train set, so that the sequence of random draws
    # (and therefore the result under a fixed seed) stays the same.
    for recordings in (featuresT, features):
      for i in range(recordings.shape[0]):
        size = recordings[i].shape[0]
        if (size > 20):
          # NOTE(review): randint samples WITH replacement and in random order,
          # so a row can be picked more than once -- confirm this is intended.
          idx = np.random.randint(size, size=15)
          recordings[i] = recordings[i][idx,:]

  # return (train),(test)
  return (features, positions, controls, patientnbs), (featuresT, positionsT, controlsT, patientnbsT)

In [None]:
# Load the GVA data into the notebook-level train (features, ...) and test (featuresT, ...) variables.
(features, positions, controls, patientnbs), (featuresT, positionsT, controlsT, patientnbsT) = get_GVA()

Importing from: ../pneumoscope/npz_files/GVA/GVA_Ca_train_b1.npz
Importing from: ../pneumoscope/npz_files/GVA/GVA_Co_train_b1.npz
Importing from: ../pneumoscope/npz_files/GVA/GVA_Ca_Co_test.npz


In [None]:
# Load the POA data instead. This binds the same eight names as the get_GVA() cell,
# so whichever of the two cells ran last decides what the following cells work on.
(features, positions, controls, patientnbs), (featuresT, positionsT, controlsT, patientnbsT) = get_POA()

In [None]:
from sklearn.model_selection import KFold


# Shapes of the currently loaded train and test feature arrays.
print('Train set : ',features.shape)
print('Test set  : ',featuresT.shape)


def train_all_pos_models(index = 1, position_names = ('P8',), num_folds = 10):
  """Cross-validate one CNN per position and collect the per-fold scores.

  Works on the notebook-level ``features``, ``controls``, ``positions`` and
  ``patientnbs`` arrays. For each requested position, the matching entries are
  converted to samples with ``ut.prepare_samples`` and a fresh
  ``mt.get_model`` network is trained and validated on every K-fold split.

  Args:
    index: run identifier; only echoed in the progress message.
    position_names: position labels to process (default: only ``'P8'``).
    num_folds: number of K-fold splits (default: 10).

  Returns:
    A list with one entry per position. Each entry is a list holding one
    ``[val_loss, val_accuracy, val_f1]`` triple per fold, read at the epoch
    with the lowest validation loss.
  """
  stats = []
  for position in position_names:

    print('position',position,'index',index)
    # train set ----------------------------
    indx = positions == position
    # the patient numbers and positions returned by prepare_samples are not needed here
    x_train_, y_train_, _, _ = ut.prepare_samples(features[indx],controls[indx],positions[indx],patientnbs[indx])

    print(x_train_.shape , y_train_.shape)

    # shape --------------------------------
    shape = x_train_.shape[1:]

    # Define the K-fold Cross Validator
    # NOTE(review): folds are drawn over samples, not over patients, so samples of
    # one patient may end up in both the training and the validation part of a fold.
    # prepare_samples also returns patient numbers; a patient-grouped split would
    # avoid this -- confirm whether that is wanted.
    kfold = KFold(n_splits=num_folds, shuffle=False)

    # K-fold Cross Validation model evaluation
    fold_acc = []

    for fold_no, (train_index, val_index) in enumerate(kfold.split(x_train_), start=1):

      # A new model is built for every fold: drop the state left by the previous one.
      keras.backend.clear_session()

      x_train = x_train_[train_index]
      y_train = y_train_[train_index]
      x_val = x_train_[val_index]
      y_val = y_train_[val_index]

      # Prepare train batches ----------------
      train_batch_size = 32
      # The generator yields batches endlessly, so the number of steps per epoch
      # must be given; keep it at 1 or more even when a fold has fewer samples than a batch.
      train_steps = max(1, x_train.shape[0]//train_batch_size)
      train_generator = ImageDataGenerator(width_shift_range=0)
      train_generator.fit(x_train) # apply the augmentation settings to the train data
      train_datagen = train_generator.flow(x_train, y_train, batch_size=train_batch_size) # creates the mini-batch flow

      # Model and optimizer -----------------
      density = 25
      model = mt.get_model(shape , dense_unit = density)
      sgd = optimizers.SGD(learning_rate=0.00004, decay=1e-6, momentum=0.9, nesterov=True)
      model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy',ut.f1])

      # Fit --------------------------------
      # Model.fit accepts generators directly (fit_generator is deprecated).
      history = model.fit(train_datagen,
                          steps_per_epoch=train_steps,
                          epochs=200,
                          validation_data=(x_val, y_val),
                          verbose = 0,
                          callbacks= [TqdmCallback(verbose=0),
                                      keras.callbacks.EarlyStopping(patience=7),
                                      ]
                          )

      # Scores of the epoch with the smallest validation loss
      ind = np.argmin( history.history['val_loss'] )

      fold_acc.append( [history.history['val_loss'][ind] , history.history['val_accuracy'][ind] , history.history['val_f1'][ind] ] )

      print(f'Score for fold {fold_no}: { fold_acc[-1] } ')

    stats.append(fold_acc)
  return stats


# Run the cross-validation; all_stats holds what train_all_pos_models returns (per-position lists of fold scores).
all_stats = train_all_pos_models(1)

Train set :  (503,)
Test set  :  (70,)
position P8 index 1
shuffleling
(815, 150, 216, 1) (815, 2)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Instructions for updating:
Please use Model.fit, which supports generators.

Score for fold 1: [0.45085087418556213, 0.7560975551605225, 0.7673609852790833] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 2: [0.30072101950645447, 0.8902438879013062, 0.8819443583488464] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 3: [0.36158716678619385, 0.792682945728302, 0.7986109852790833] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 4: [0.41646385192871094, 0.8048780560493469, 0.8171296119689941] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 5: [0.31202009320259094, 0.8780487775802612, 0.8796296119689941] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 6: [0.4115146994590759, 0.8641975522041321, 0.8578431010246277] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 7: [0.39658671617507935, 0.8148148059844971, 0.7794117331504822] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 8: [0.4644911587238312, 0.8271604776382446, 0.8174018859863281] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 9: [0.46040311455726624, 0.7777777910232544, 0.7665440440177917] 


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Score for fold 10: [0.35349592566490173, 0.8888888955116272, 0.8694851994514465] 


In [None]:
def get_POA_train(shrink = True , caco = 'Ca', batch = 'b1'):
  """Load one POA training archive, optionally sub-sampling long recordings.

  Args:
    shrink: when true, every entry of the feature array whose first dimension
      is larger than 20 is replaced, in place, by 15 of its rows picked at
      random with ``np.random.randint``.
    caco: archive family used in the file name (e.g. ``'Ca'`` or ``'Co'``).
    batch: batch suffix used in the file name (e.g. ``'b1'``).

  Returns:
    The single tuple ``(features, positions, controls, patientnbs)``.
  """
  archive = '../pneumoscope/npz_files/POA/POA_'+caco+'_train_'+batch+'.npz'
  features, _, positions, controls, _, patientnbs = ut.load_from_npz( archive )
  loaded = (features, positions, controls, patientnbs)

  if not shrink:
    return loaded

  # Sub-sample the entries that are too long; short ones are left untouched.
  for rec_no in range(features.shape[0]):
    n_rows = features[rec_no].shape[0]
    if n_rows <= 20:
      continue
    keep = np.random.randint(n_rows, size=15)
    features[rec_no] = features[rec_no][keep,:]

  return loaded


def get_GVA_train(shrink = True , caco = 'Ca', batch = 'b1'):
  """Load one GVA training archive, optionally sub-sampling long recordings.

  Args:
    shrink: when true, every entry of the feature array whose first dimension
      is larger than 20 is replaced, in place, by 15 of its rows picked at
      random with ``np.random.randint``.
    caco: archive family used in the file name (e.g. ``'Ca'`` or ``'Co'``).
    batch: batch suffix used in the file name; only ``'b1'`` is accepted.

  Returns:
    The single tuple ``(features, positions, controls, patientnbs)``.

  Raises:
    ValueError: if ``batch`` is not ``'b1'``.
  """

  if(batch != 'b1'):
    # Raising a bare string is itself a TypeError in Python 3; raise a real exception.
    raise ValueError('not enough batches')

  # Train -------------------------------------
  features, _, positions, controls, _, patientnbs = ut.load_from_npz( '../pneumoscope/npz_files/GVA/GVA_'+caco+'_train_'+batch+'.npz' )

  # Shrink too big features --------------------
  if (shrink):
    for i in range(features.shape[0]):
      size = features[i].shape[0]
      if (size > 20):
        idx = np.random.randint(size, size=15)
        features[i] = features[i][idx,:]

  return (features, positions, controls, patientnbs)


In [None]:
from keras.models import Sequential, Model, Input

def get_dropout(input_tensor, p=0.5, mc=False):
  """Apply a ``Dropout(p)`` layer to ``input_tensor`` and return the result.

  When ``mc`` is true the layer is called with ``training=True``; otherwise it
  is called without a ``training`` argument.
  """
  dropout_layer = Dropout(p)
  call_kwargs = {'training': True} if mc else {}
  return dropout_layer(input_tensor, **call_kwargs)

def get_model( shape_input , dense_unit = 25 , mc_dropout = False):
  """Build the two-class CNN as a functional Keras model (not compiled).

  Architecture: two blocks of Conv(3x3, relu, same padding) -> BatchNorm ->
  MaxPool -> Dropout(0.25) with 20 and 50 filters, then Flatten ->
  Dense(dense_unit, relu) -> BatchNorm -> Dropout(0.25) -> Dense(2, softmax).

  Args:
    shape_input: shape of one input sample, passed to ``Input``.
    dense_unit: number of units of the hidden dense layer.
    mc_dropout: forwarded to ``get_dropout`` as ``mc`` for every dropout layer.

  Returns:
    The ``Model`` mapping the input tensor to the softmax output.
  """
  inp = Input(shape_input)

  # The input shape is fixed by `Input` above, so the layers do not take an
  # `input_shape` argument (for the second convolution it would not even be the right shape).
  x = Convolution2D(20, (3, 3), activation='relu', padding='same')(inp)
  x = BatchNormalization()(x)
  x = MaxPooling2D(2, 2)(x)
  x = get_dropout(x, p=0.25, mc=mc_dropout)

  x = Convolution2D(50, (3, 3), activation='relu', padding='same')(x)
  x = BatchNormalization()(x)
  x = MaxPooling2D(2, 2)(x)
  x = get_dropout(x, p=0.25, mc=mc_dropout)

  x = Flatten()(x)
  x = Dense(dense_unit, activation='relu')(x)
  x = BatchNormalization()(x)
  x = get_dropout(x, p=0.25, mc=mc_dropout)

  out = Dense(2, activation='softmax')(x)

  model = Model(inputs=inp, outputs=out)
  return model

In [None]:
def mean_models_fusion(x_test_, y_test_, p_test_, pos_test_ , position_weights = [1,1,1,1,1,1,1,1] , mc_dropout = False,
                       weights_path_template = '../report_2_{pos}_d_25_POA.h5'):
  """Fuse the per-position model predictions into one prediction per patient.

  For every patient found in ``p_test_`` and every distinct label row of that
  patient, the samples of each position are scored with the weights stored for
  that position. The mean prediction of a position is counted
  ``position_weights[k]`` times, and the patient prediction is the mean of all
  counted position predictions.

  Args:
    x_test_: samples; ``x_test_.shape[1:]`` is used as the model input shape.
    y_test_: 2-D label array with one row per sample.
    p_test_: patient identifier of each sample.
    pos_test_: position label of each sample. The second character is read as
      a digit ``d`` and selects ``position_weights[d - 1]``.
    position_weights: integer repeat count per position (read only, never modified).
    mc_dropout: forwarded to ``get_model``.
    weights_path_template: format string for the weights file of a position;
      ``{pos}`` is replaced by the position label.

  Returns:
    ``(y_pred_patient, y_expc_patient)``: two arrays with one row per
    (patient, label) pair -- the fused prediction and the expected label.
    When no position contributes (all weights involved are 0, or there is
    nothing to predict) the prediction row is filled with NaN.
  """
  # Model -----------------------------------------
  # The model is only used for prediction with loaded weights, so it is not compiled.
  model = get_model(x_test_.shape[1:] , dense_unit = 25 ,  mc_dropout = mc_dropout )

  y_pred_patient = []
  y_expc_patient = []

  for patient in np.unique(p_test_):
    is_patient = p_test_ == patient
    x_test_patient = x_test_[is_patient]
    y_test_patient = y_test_[is_patient]
    pos_test_patient = pos_test_[is_patient]

    patient_labels = np.unique( y_test_patient, axis=0)
    print( 'y ->' , patient_labels )
    for caco in patient_labels:
      pred_patient = []
      for pos in np.unique(pos_test_patient):
        to_pred = x_test_patient[(pos_test_patient == pos) & (y_test_patient[:,0] == caco[0])]
        if(to_pred.shape[0] == 0):
          continue

        # NOTE(review): only the second character of the label is parsed, so labels
        # are assumed to look like 'P1'..'P8'; a digit outside the weight list raises
        # IndexError and a multi-digit position would be misread -- confirm the label format.
        weight = position_weights[int(pos[1])-1]
        if weight <= 0:
          # this position does not count: no need to load its weights or predict
          continue

        model.load_weights(weights_path_template.format(pos=pos))
        position_mean = np.mean(model.predict( to_pred ),axis=0)
        pred_patient.extend([position_mean] * weight)

      # patient prediction (merge positions)
      if pred_patient:
        merged_position = np.mean(pred_patient,axis=0)
      else:
        # Nothing to average: keep a NaN row of the label width so that the
        # returned array stays rectangular instead of mixing scalars and vectors.
        merged_position = np.array([np.nan] * len(caco))
      y_pred_patient.append(merged_position)
      # one expected label row per prediction row
      y_expc_patient.append(caco)
      print('patient',patient ,' predicted:', merged_position , 'expected:' , caco)

  y_pred_patient = np.array(y_pred_patient) # one row per (patient, label) pair
  y_expc_patient = np.array(y_expc_patient) # same number of rows as y_pred_patient

  return y_pred_patient , y_expc_patient

In [None]:
# List the parent directory (the weight files loaded by mean_models_fusion are read from ../).
!ls ../

In [None]:
# Load training batch b4 of the POA 'Ca' archives to use as evaluation data.
# The ...T names are rebound here, replacing whatever test set was loaded earlier.
(featuresT, positionsT, controlsT, patientnbsT) = get_POA_train(caco = 'Ca' , batch = 'b4')

Importing from: ../pneumoscope/npz_files/POA/POA_Ca_train_b4.npz


In [None]:
# Convert the loaded evaluation data with ut.prepare_samples; the four results are kept as
# inputs (x_test), labels (y_test), patient numbers (p_test) and positions (pos_test).
x_test, y_test, p_test, pos_test = ut.prepare_samples(featuresT,controlsT,positionsT,patientnbsT)

shuffleling


In [None]:
# Fusion with the same weight (1) on all eight positions.
y_pred , _ = mean_models_fusion(x_test, y_test, p_test, pos_test ,position_weights = [1,1,1,1,1,1,1,1] ,  mc_dropout = False)
print('total',y_pred.shape)
# shape of the sub-array of predictions whose second column is at least 0.5
print('predicted cases',y_pred[y_pred[:,1]>=0.5].shape)

y -> [[0. 1.]]
patient 100  predicted: [0.26824605 0.73175395] expected: [0. 1.]
y -> [[0. 1.]]
patient 101  predicted: [0.24224682 0.7577532 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 147  predicted: [0.44373423 0.5562658 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 148  predicted: [0.43709266 0.5629074 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 149  predicted: [0.26629367 0.7337063 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 150  predicted: [0.21169433 0.7883057 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 151  predicted: [0.252782 0.747218] expected: [0. 1.]
y -> [[0. 1.]]
patient 152  predicted: [0.1105678  0.88943225] expected: [0. 1.]
y -> [[0. 1.]]
patient 153  predicted: [0.35678124 0.64321876] expected: [0. 1.]
y -> [[0. 1.]]
patient 154  predicted: [0.2077356  0.79226446] expected: [0. 1.]
y -> [[0. 1.]]
patient 155  predicted: [0.14735492 0.8526451 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 157  predicted: [0.40397152 0.59602845] expected: [0. 1.]
y -> [[0. 1.]]
patient 158  pred

In [None]:
# Fusion using only the third weight slot (index 2); every other position has weight 0.
y_pred , _ = mean_models_fusion(x_test, y_test, p_test, pos_test ,position_weights = [0,0,1,0,0,0,0,0] ,  mc_dropout = False)
print('total',y_pred.shape)
# shape of the sub-array of predictions whose second column is at least 0.5
print('predicted cases',y_pred[y_pred[:,1]>=0.5].shape)


y -> [[0. 1.]]
patient 10  predicted: [0.11331779 0.8866823 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 11  predicted: [0.08248964 0.9175103 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 14  predicted: [0.07097512 0.9290249 ] expected: [0. 1.]
y -> [[0. 1.]]


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


patient 15  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 17  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 18  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 19  predicted: [0.20377761 0.7962223 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 20  predicted: [0.09925503 0.90074503] expected: [0. 1.]
y -> [[0. 1.]]
patient 21  predicted: [0.17818607 0.8218139 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 22  predicted: [0.20111977 0.7988802 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 23  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 24  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 25  predicted: [0.12166371 0.87833637] expected: [0. 1.]
y -> [[0. 1.]]
patient 26  predicted: [0.11792495 0.882075  ] expected: [0. 1.]
y -> [[0. 1.]]
patient 27  predicted: [0.301392   0.69860804] expected: [0. 1.]
y -> [[0. 1.]]
patient 28  predicted: nan expected: [0. 1.]
y -> [[0. 1.]]
patient 29  predicted: [0.21370396 0.7862961 ] expected: [0. 1.]
y -> [[0. 1.]]


IndexError: ignored

In [None]:
# Fusion using only the first three weight slots; the remaining positions have weight 0.
y_pred , _ = mean_models_fusion(x_test, y_test, p_test, pos_test ,position_weights = [1,1,1,0,0,0,0,0] ,  mc_dropout = False)
print('total',y_pred.shape)
# number of predictions whose second column is at least 0.5
print('predicted cases',np.sum(y_pred[:,1]>=0.5))

y -> [[0. 1.]]
patient 100  predicted: [0.3154582 0.6845419] expected: [0. 1.]
y -> [[0. 1.]]
patient 101  predicted: [0.40367913 0.5963208 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 147  predicted: [0.7379702  0.26202983] expected: [0. 1.]
y -> [[0. 1.]]
patient 148  predicted: [0.56269556 0.43730447] expected: [0. 1.]
y -> [[0. 1.]]
patient 149  predicted: [0.3627838  0.63721615] expected: [0. 1.]
y -> [[0. 1.]]
patient 150  predicted: [0.2693896 0.7306104] expected: [0. 1.]
y -> [[0. 1.]]
patient 151  predicted: [0.22259666 0.7774033 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 152  predicted: [0.09556106 0.904439  ] expected: [0. 1.]
y -> [[0. 1.]]
patient 153  predicted: [0.40221167 0.59778833] expected: [0. 1.]
y -> [[0. 1.]]
patient 154  predicted: [0.22248322 0.77751684] expected: [0. 1.]
y -> [[0. 1.]]
patient 155  predicted: [0.06439183 0.93560815] expected: [0. 1.]
y -> [[0. 1.]]
patient 157  predicted: [0.28217268 0.7178273 ] expected: [0. 1.]
y -> [[0. 1.]]
patient 158  pred