In [None]:
import os
import time
import math
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import random
import glob

# Record the TensorFlow version this notebook ran against.
print(tf.__version__)


In [None]:
# Quick-run switch: when True the batch size below is reduced.
DEBUG = True

# Spectrogram geometry. Comments elsewhere in this notebook describe a
# cadence file as 6 frames of 273 x 256 values.
IMG_SZ_256 = 256
IMG_SZ_273 = 273
IMG_SZ_546 = 2 * IMG_SZ_273     # two frames side by side
IMG_SZ_1638 = 6 * IMG_SZ_273    # all six frames side by side
IMG_DIM = (IMG_SZ_256, IMG_SZ_546)

# Input layouts (see get_img):
#   0 -> (6, IMG_SZ_273, IMG_SZ_256)       original frames          (no model support)
#   1 -> (IMG_SZ_256, IMG_SZ_1638)         [A0,B,A1,C,A2,D]         (no model support)
#   2 -> (IMG_SZ_256, IMG_SZ_546, 3)       [A0_B,A1_C,A2_D]
#   3 -> (IMG_SZ_256, IMG_SZ_273, 3, 2)    [[A0,A1,A2],ch1], [[B,C,D],ch2]
#IMG_TYPE = 0
#IMG_TYPE = 1
#IMG_TYPE = 2
IMG_TYPE = 3

CLASSES_NUM = 1
BATCH_SIZE = 4 if DEBUG else 32
EPOCHS = 10     # identical for DEBUG and full runs

In [None]:
# Dataset root plus the train / test sub-directories derived from it.
MAIN_DIR = '../input/seti-breakthrough-listen'
TRAIN_DIR = f'{MAIN_DIR}/train'
SUB_DIR = f'{MAIN_DIR}/test'

In [None]:
# Read the training labels and the submission template (in that order).
train_df, test_df = (
    pd.read_csv(os.path.join(MAIN_DIR, csv_name))
    for csv_name in ('train_labels.csv', 'sample_submission.csv')
)

In [None]:
# (rows, columns) of the label table and of the submission template.
train_df.shape,test_df.shape

In [None]:
def plot_img(Xt, yt=None, img_type=IMG_TYPE):
    """Plot cadence data in one of the four layouts produced by ``get_img``.

    Parameters
    ----------
    Xt : numpy.ndarray
        * img_type 0: the raw frames; ``Xt[j]`` is drawn in its own row.
        * img_type 1: one 2-D image holding the six frames side by side.
        * img_type 2: one ``(rows, cols, 3)`` image or a batch of them.
        * img_type 3: one ``(rows, cols, 3, 2)`` image or a batch of them.
    yt : array-like, optional
        Labels for layouts 2 and 3; ``yt[j]`` is used as the title of row
        ``j``. May be omitted.
    img_type : int
        Layout selector. Any other value plots nothing.

    Notes
    -----
    * At most four samples of a batch are drawn.
    * Layouts 2 and 3 are rendered as RGB, so (per ``get_img``) channel 0
      (A0 / B) is red, channel 1 (A1 / C) is green and channel 2 (A2 / D)
      is blue.
    * Axes are always created with ``squeeze=False`` so that a batch holding
      a single sample can be indexed the same way as a larger one.
    * An empty batch is skipped silently.

    Returns
    -------
    None
    """
    stars = ['A0', 'B', 'A1', 'C', 'A2', 'D']   # frame order inside a cadence
    a_strt = 5                                   # x of the first text label
    a_y = 100                                    # y of the first text label
    box = {'facecolor': 'white'}

    def _stagger(axis, names, x0, step):
        # Shift each label diagonally so the boxes do not hide each other.
        for k, text in enumerate(names):
            axis.text(x0 + k*step, a_y + k*step, text, bbox=box)

    def _time_freq_labels(axis):
        axis.set_xlabel('time ')
        axis.set_ylabel('freq (bins)')

    def _as_batch(single_ndim):
        # Promote a single image to a batch of one and report the shapes.
        print(f'test X: {Xt.shape}')
        batch = Xt
        if np.ndim(batch) == single_ndim:
            batch = np.expand_dims(batch, axis=0)
        elif yt is not None:
            print(f'test y: {np.shape(yt)}')
        labels = None if yt is None else np.atleast_1d(yt)
        return batch, labels, min(4, len(batch))

    def _title(axis, labels, j):
        # Label the row whenever a label is available for it.
        if labels is not None and j < len(labels):
            axis.set_title(f'Label= {labels[j,]}')

    if img_type == 0:
        n_rows = len(Xt)
        if n_rows == 0:
            return
        fig, ax = plt.subplots(nrows=n_rows, figsize=(16, 10), squeeze=False)
        for j in range(n_rows):
            panel = ax[j, 0]
            panel.imshow(Xt[j], aspect='auto')                # one raw frame
            if j < len(stars):
                panel.text(a_strt, a_y, stars[j], bbox=box)
            panel.set_ylabel('time ')
            panel.set_xlabel('freq (bins)')
        plt.show()

    elif img_type == 1:
        fig, ax = plt.subplots(figsize=(16, 5))
        ax.imshow(Xt, aspect='auto')                          # frames side by side
        for j, name in enumerate(stars):
            ax.text(a_strt + j*IMG_SZ_273, a_y, name, bbox=box)
        _time_freq_labels(ax)
        plt.show()

    elif img_type == 2:
        batch, labels, n_rows = _as_batch(single_ndim=3)
        if n_rows == 0:
            return
        fig, ax = plt.subplots(nrows=n_rows, figsize=(24, 5.5*n_rows), squeeze=False)
        # The right half of each image holds the B/C/D frames.
        b_strt = a_strt + int(batch.shape[2]/2)
        for j in range(n_rows):
            panel = ax[j, 0]
            panel.imshow(batch[j], aspect='auto')
            _stagger(panel, ('A2', 'A1', 'A0'), a_strt, 8)
            _stagger(panel, ('D', 'C', 'B'), b_strt, 8)
            _time_freq_labels(panel)
            _title(panel, labels, j)
        plt.show()

    elif img_type == 3:
        batch, labels, n_rows = _as_batch(single_ndim=4)
        if n_rows == 0:
            return
        fig, ax = plt.subplots(nrows=n_rows, ncols=2, figsize=(24, 5.5*n_rows), squeeze=False)
        for j in range(n_rows):
            a_panel, bcd_panel = ax[j]
            a_panel.imshow(batch[j, :, :, :, 0], aspect='auto')     # A0, A1, A2 as RGB
            bcd_panel.imshow(batch[j, :, :, :, 1], aspect='auto')   # B, C, D as RGB
            _stagger(a_panel, ('A2', 'A1', 'A0'), a_strt, 8)
            _stagger(bcd_panel, ('D', 'C', 'B'), a_strt, 6)
            _time_freq_labels(a_panel)
            _time_freq_labels(bcd_panel)
            _title(a_panel, labels, j)
        plt.show()

    return

In [None]:
def aug_img(arr, verbose=True):
    """Threshold-filter every frame of a cadence into a 0/1 mask.

    Each frame is min-max normalised. A pixel is kept (set to 1) only when
    it exceeds BOTH ``xtimes`` times the median of its column AND ``xtimes``
    times the median of its row. ``xtimes`` is drawn once per call from
    ``random.uniform(1.0, 3.0)``, so repeated calls give different masks
    unless the ``random`` module is seeded.

    Parameters
    ----------
    arr : array-like, shape (frames, rows, cols)
        Input frames. The output has the same shape, so inputs other than
        the 6 x 273 x 256 cadence are accepted too.
    verbose : bool, default True
        Print the drawn factor and per-frame diagnostics. Pass False to
        silence the output.

    Returns
    -------
    numpy.ndarray of float32, same shape as ``arr``, containing 0.0 / 1.0.
    """
    frames = np.asarray(arr)
    filtered_spectrogram = np.zeros(frames.shape, dtype = np.float32)

    #type_aug = random.choice(('none', 'norm', 'row_median', 'col_median', 'all'))
    xtimes = random.uniform(1.0, 3.)  # threshold factor applied to the row and column medians
    if verbose:
        print(xtimes)
    scale = 1

    for i in range(frames.shape[0]):
        image = frames[i,:,:]
        lowest = image.min()
        span = image.max() - lowest
        if span > 0:
            norm_s = (image - lowest) / span
        else:
            # Constant frame: nothing stands out, and 0/0 must be avoided.
            norm_s = np.zeros(image.shape, dtype = np.float32)

        col_medians = np.median(norm_s, axis=0)
        row_medians = np.median(norm_s, axis=1)

        if verbose:
            total_pwr = norm_s.size            # number of pixels (np.product is gone in NumPy 2.0)
            sig_pwr = np.sum(norm_s)           # sum of the normalised pixel values
            col_sum = np.sum(col_medians)      # sum of the column medians
            row_sum = np.sum(row_medians)      # sum of the row medians
            pwr_ratio = total_pwr/sig_pwr if sig_pwr > 0 else float('inf')
            print('total_pwr=',total_pwr,'pwr_ratio=',pwr_ratio,'sig_pwr=',sig_pwr,'col_mean=',col_sum, 'row_mean=',row_sum)

        above_col = norm_s > col_medians[np.newaxis, :]*xtimes
        above_row = norm_s > row_medians[:, np.newaxis]*xtimes
        filtered_spectrogram[i,:,:] = (above_col & above_row)*scale

    return filtered_spectrogram

In [None]:
def get_img(arr, img_type=0, aug=False):
    """Rearrange a cadence of frames into the layout selected by ``img_type``.

    Parameters
    ----------
    arr : numpy.ndarray, shape (c, h, w)
        The frames of one cadence (6 x 273 x 256 in this notebook).
    img_type : int
        0 -> ``arr`` itself (after optional filtering);
        1 -> 2-D image ``(w, c*h)`` with the frames side by side;
        2 -> ``(w, 2*h, 3)``: frame pairs [0,1], [2,3], [4,5] as channels;
        3 -> ``(w, h, 3, 2)``: frames 0/2/4 in ``[..., 0]`` and frames
             1/3/5 in ``[..., 1]``;
        anything else -> same as 1.
    aug : bool
        When truthy the frames are first passed through ``aug_img``.

    Returns
    -------
    numpy.ndarray in the requested layout (layouts 2 and 3 are float64).
    """
    _, h, w = arr.shape
    if aug:
        arr = aug_img(arr)

    if img_type == 0:
        return arr

    # Stack the frames along their first axis, then swap the axes: (w, c*h).
    image = np.vstack(arr).T

    if img_type == 2:
        pair_width = 2 * h
        paired = np.zeros(shape=(w, pair_width, 3))
        for ch in range(3):
            paired[:, :, ch] = image[:, ch * pair_width:(ch + 1) * pair_width]
        return paired

    if img_type == 3:
        volume = np.zeros(shape=(w, h, 3, 2))
        # group 0 collects frames 0, 2, 4 (A0, A1, A2);
        # group 1 collects frames 1, 3, 5 (B, C, D).
        for grp in range(2):
            for ch in range(3):
                frame = 2 * ch + grp
                volume[:, :, ch, grp] = image[:, frame * h:(frame + 1) * h]
        return volume

    # img_type 1 and any unrecognised value share the flat layout.
    return image

In [None]:
# Load one training cadence as float32 for the layout demos below.
# Swap in one of the commented lines to inspect another sample.
#Target = 0
#arr = np.load('../input/seti-breakthrough-listen/train/f/f081cea12a6c.npy').astype(np.float32) 
#arr = np.load('../input/seti-breakthrough-listen/train/b/b07ed42630f8.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/1/11a02b9f7c59.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/3/331b2635e487.npy').astype(np.float32)
#Target = 1
#arr = np.load('../input/seti-breakthrough-listen/train/0/0f6dda0952ea.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/5/54e340be921d.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/f/f419ae83312b.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/b/b29e945c82e4.npy').astype(np.float32)
#arr = np.load('../input/seti-breakthrough-listen/train/2/20b6bba77e00.npy').astype(np.float32)
arr = np.load('../input/seti-breakthrough-listen/train/8/87cca65ea82e.npy').astype(np.float32)


In [None]:
# Show the unprocessed frames, one subplot per frame.
plot_img(arr, img_type=0)    #original img, img_type0, (6x273x256)

In [None]:
# Threshold-filter the cadence (aug=True) and show the filtered frames.
arr0 = get_img(arr, img_type=0, aug=True)  #input = original img  = img_type0 (6x273x256), output = img_type0
plot_img(arr0, img_type=0)

In [None]:
# Same call repeated: aug_img draws a new random threshold factor on every
# call, so this mask can differ from the one in the previous cell.
arr0 = get_img(arr, img_type=0, aug=True)  #input = original img  = img_type0 (6x273x256), output = img_type0
plot_img(arr0, img_type=0)

In [None]:
# Layout 1: filtered frames laid side by side in one 2-D image.
arr1 = get_img(arr, img_type=1, aug=True)  #input = original img = img_type0 (6x273x256), output = img_type1,(256x1638)
plot_img(arr1, img_type=1)                 # 256x1638

In [None]:
# Layout 2: frame pairs stacked as the three colour channels of one image.
arr2 = get_img(arr, img_type=2, aug=True)  #input = original img(6x273x256),  output = img_type2, (256,546,3)
plot_img(arr2, img_type=2)                 #(256,546,3)

In [None]:
# Layout 3: A frames and B/C/D frames as two separate 3-channel volumes.
arr3 = get_img(arr, img_type=3, aug=True)  #input = original img(6x273x256),  output = img_type3 (256,273,3,2)
plot_img(arr3, img_type=3)

In [None]:
def return_filpath(name, folder=TRAIN_DIR):
    """Return the path of the ``.npy`` file that belongs to sample ``name``.

    The path is ``<folder>/<first character of name>/<name>.npy``.
    """
    shard = name[0]
    filename = f'{name}.npy'
    return os.path.join(folder, shard, filename)

In [None]:
# Attach the on-disk location of every sample to both tables.
train_df['image_path'] = train_df['id'].apply(return_filpath)
test_df['image_path'] = test_df['id'].apply(return_filpath, folder=SUB_DIR)

In [None]:
# Balance the two classes of train_df.
#   DEBUG    : down-sample both classes to the size of the smaller one and
#              keep only the first 31 test rows.
#   otherwise: replicate the target==1 rows by the measured imbalance ratio
#              (this replaces a hard-coded factor of 9).
# The ratio is measured on the current table, so re-running the cell on an
# already balanced table does not multiply the positives again.
label_counts = train_df['target'].value_counts()
min_label_counts = min(label_counts[0], label_counts[1])
un_balance_ratio = max(label_counts[0], label_counts[1]) // min_label_counts

df_0 = train_df[train_df['target']==0]
df_1 = train_df[train_df['target']==1]

if DEBUG:
    df_0     = df_0.sample(min_label_counts,random_state=42)
    df_1     = df_1.sample(min_label_counts,random_state=42)
    train_df = pd.concat([df_0,df_1])
    test_df = test_df[:31].reset_index(drop=True)

else:
    # Only the target==1 class is ever replicated; leave the table alone if
    # it is not the minority.
    n_copies = un_balance_ratio if label_counts[1] <= label_counts[0] else 1
    df_1_balance  = pd.concat([df_1] * n_copies)
    train_df = pd.concat([df_0,df_1_balance])

# drop=True keeps the old row numbers from leaking in as an 'index' column.
train_df = train_df.reset_index(drop=True)

In [None]:
#train_df = train_df.sample(100,random_state=42, replace=True).reset_index(drop=True)  #debug
#test_df = test_df[:31].reset_index(drop=True)    #debug

In [None]:
# Last rows of the (re-balanced) training table.
train_df.tail()

In [None]:
# Last rows of the test table.
test_df.tail()

In [None]:
# Table sizes after balancing / truncation.
np.shape(train_df), np.shape(test_df)

In [None]:
# Hold out 20% of the *unique* sample ids for validation.
# train_df may contain the same id several times (the positives are
# replicated when DEBUG is off). Splitting by id keeps every copy of a
# sample on one side, so validation never scores an image the model was
# trained on.
unique_ids = train_df['id'].drop_duplicates().sample(frac=1, random_state=42)
percent_split = int( 0.2 * len(unique_ids) )      # number of validation ids
valid_ids = unique_ids.iloc[:percent_split]       # empty (not "everything") when percent_split == 0

sample_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
in_valid = sample_df['id'].isin(valid_ids)

X_train = sample_df[~in_valid].reset_index(drop=True)
X_valid = sample_df[in_valid].reset_index(drop=True)

np.shape(X_train), np.shape(X_valid)

In [None]:
# Weight each class by (size of the largest class) / (size of this class).
lable_value_counts = train_df['target'].value_counts()
class_weights = dict(
    (label, max(lable_value_counts) / count)
    for label, count in lable_value_counts.items()
)

lable_value_counts, class_weights

In [None]:
# Number of *full* batches in each split (a trailing partial batch is not counted).
STEPS_PER_EPOCH = len(X_train) // BATCH_SIZE
VALIDATION_STEPS = len(X_valid) // BATCH_SIZE

BATCH_SIZE, STEPS_PER_EPOCH, VALIDATION_STEPS

In [None]:
class Preprocess_data(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads cadence ``.npy`` files one batch at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``image_path`` column and, for the labelled modes, a
        ``target`` column.
    batch_size : int
        Samples per batch; the last batch may be smaller.
    mode : {'fit', 'validate', 'predict'}
        The labelled modes return ``(X, y)``; ``'predict'`` returns ``X``.
        Any other value raises ``AttributeError`` on the first batch request.
    shuffle : bool
        Reshuffle ``df`` at construction and at the end of every epoch.
    aug : bool or None
        Forwarded to ``get_img``; truthy enables the threshold filter.
    n_classes : int
        Stored for reference; not used by the loader itself.

    The array layout follows the module-level ``IMG_TYPE`` at the time a
    batch is requested.
    """

    # Per-sample shape of the batch array for every IMG_TYPE layout.
    # Layout 0 holds the raw frames as returned by get_img; layout 1 gets a
    # trailing channel axis.
    _SAMPLE_SHAPES = {
        0: (6, IMG_SZ_273, IMG_SZ_256),
        1: (IMG_SZ_256, IMG_SZ_1638, 1),
        2: (IMG_SZ_256, IMG_SZ_546, 3),
        3: (IMG_SZ_256, IMG_SZ_273, 3, 2),
    }
    _LABELLED_MODES = ('fit', 'validate')

    def __init__(self,df,batch_size=16, mode='fit', shuffle = False, aug = None, n_classes=CLASSES_NUM):
        # Let the Keras base class set up its own state.
        super().__init__()
        self.df = df            # data frame with the id, target, image_path
        self.batch_size = batch_size
        self.mode = mode
        self.shuffle = shuffle
        self.aug = aug
        self.n_classes = n_classes

        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, counting a trailing partial batch."""
        return math.ceil(len(self.df) / self.batch_size)

    def on_epoch_end(self):
        """Reshuffle the rows when shuffling is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __getitem__(self, index):
        """Return batch ``index``: ``(X, y)`` or, in predict mode, ``X``.

        An index past the last batch yields arrays with zero rows.
        """
        # Fail before any file is read.
        if self.mode not in self._LABELLED_MODES and self.mode != 'predict':
            raise AttributeError('mode parameter error')
        if IMG_TYPE not in self._SAMPLE_SHAPES:
            raise ValueError(f'unsupported IMG_TYPE: {IMG_TYPE}')

        start = index*self.batch_size
        stop = min(start + self.batch_size, len(self.df))   # clip the last batch
        batch = self.df.iloc[start:stop]                     # positional slice

        X = np.zeros((len(batch), *self._SAMPLE_SHAPES[IMG_TYPE]), dtype = np.float32)
        for i, fn in enumerate(batch['image_path']):
            img = get_img(np.load(fn), IMG_TYPE, self.aug)
            if IMG_TYPE == 1:
                img = img[..., np.newaxis]                   # add the channel axis
            X[i] = img

        if self.mode == 'predict':
            return X
        return X, batch['target'].values

In [None]:
# Batch loaders: training data is reshuffled every epoch, validation data is
# served in a fixed order. The threshold filter (aug) is off for both.
train_gen = Preprocess_data(
                        df = X_train, 
                        batch_size = BATCH_SIZE,
                        mode = 'fit',
                        shuffle = True,
                        aug = False,  #True, 
                        n_classes = CLASSES_NUM)

val_gen = Preprocess_data(
                        df = X_valid,  
                        batch_size = BATCH_SIZE,
                        mode = 'validate',
                        shuffle = False,
                        aug = False,
                        n_classes = CLASSES_NUM)

In [None]:
# Last five rows of the training split.
X_train.tail(5)

In [None]:
# Size of the training split and its last BATCH_SIZE rows.
len(X_train), X_train[-BATCH_SIZE:]

In [None]:
# Size of the validation split and its last BATCH_SIZE rows.
len(X_valid), X_valid[-BATCH_SIZE:]

In [None]:
# Fetch the first training batch and display it together with its labels.
Xt, yt = train_gen[0]
plot_img(Xt, yt, img_type=IMG_TYPE)

In [None]:
# Index of the batch after the last full one. It holds the leftover rows and
# is empty when len(X_train) is an exact multiple of BATCH_SIZE.
trn_idx = len(X_train)//BATCH_SIZE
trn_idx

In [None]:
# Shapes of the trailing (possibly partial or empty) training batch.
Xt, yt = train_gen.__getitem__(trn_idx)    # rows from trn_idx*BATCH_SIZE onward
print(f'train X: {Xt.shape}')              # (n, 256, 273, 3, 2) with IMG_TYPE = 3
print(f'train y: {yt.shape}')              # (n,)

In [None]:
# Index of the batch after the last full validation batch (see trn_idx).
val_idx = len(X_valid)//BATCH_SIZE
val_idx

In [None]:
# Shapes of the trailing (possibly partial or empty) validation batch.
Xv, yv = val_gen.__getitem__(val_idx)      # rows from val_idx*BATCH_SIZE onward
print(f'valid X: {Xv.shape}')              # (n, 256, 273, 3, 2) with IMG_TYPE = 3
print(f'valid y: {yv.shape}')              # (n,)

In [None]:
def make_model(output_bias = None, metrics = None):
    """Build and compile the 2-D classifier for the layout-2 images.

    A VGG16 backbone with ImageNet weights is used as a frozen feature
    extractor, followed by global average pooling, a 32-unit ReLU layer and
    a sigmoid output with ``CLASSES_NUM`` units.

    Parameters
    ----------
    output_bias : float, optional
        Initial bias of the output layer. ``None`` keeps the Keras default.
    metrics : list, optional
        Metrics passed to ``compile``. ``None`` selects a single AUC metric
        that is explicitly named ``'auc'``, so the history keys are always
        ``'auc'`` / ``'val_auc'``.

    Returns
    -------
    tf.keras.Model, compiled with Adam (learning rate 0.001) and binary
    cross-entropy.
    """
    if metrics is None:
        metrics = [tf.keras.metrics.AUC(name='auc')]

    #base_model = tf.keras.applications.EfficientNetB7(input_shape=(*IMG_DIM, 3),include_top=False,weights='imagenet')
    #base_model = tf.keras.applications.DenseNet201(input_shape=[*IMG_DIM, 3], include_top=False, weights='imagenet')
    #base_model = tf.keras.applications.Xception(input_shape=[*IMG_DIM, 3], include_top=False, weights='imagenet')
    base_model = tf.keras.applications.VGG16(input_shape=(IMG_SZ_256, IMG_SZ_546, 3), include_top=False, weights='imagenet')

    # The whole backbone stays frozen. An earlier per-layer
    # `trainable = True` loop was overridden by this assignment and has been
    # removed; unfreeze layers AFTER this line to fine-tune.
    base_model.trainable = False

    head_kwargs = {}
    if output_bias is not None:
        head_kwargs['bias_initializer'] = tf.keras.initializers.Constant(output_bias)

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(CLASSES_NUM, activation='sigmoid', **head_kwargs),
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=metrics,
                  )

    return model

In [None]:
def make_model_3D():
    """Build and compile the small 3-D CNN for the layout-3 volumes.

    Input shape per sample is ``(IMG_SZ_256, IMG_SZ_273, 3, 2)``; the last
    axis is the channel axis (channels-last). All kernels and pooling
    windows have size 1 along the third axis, so the three frames in a
    volume are processed with shared weights and only mixed after
    ``Flatten``.

    The AUC metric is explicitly named ``'auc'`` so the training history is
    always keyed ``'auc'`` / ``'val_auc'``.

    Returns
    -------
    tf.keras.Model, compiled with Adam and binary cross-entropy.
    """
    model = tf.keras.Sequential([
            tf.keras.layers.Conv3D(16, (3,3,1), activation='relu', input_shape=(IMG_SZ_256, IMG_SZ_273, 3, 2)),
            tf.keras.layers.MaxPooling3D((6,6,1)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Conv3D(8, (3,3,1), activation='relu'),
            tf.keras.layers.MaxPooling3D((2,2,1)),
            #tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Flatten(),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(CLASSES_NUM, activation='sigmoid')
        ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(name='auc')],
                  )

    return model

In [None]:
# Layout 3 uses the 3-D CNN; every other layout falls back to the VGG16 model.
model = make_model_3D() if IMG_TYPE == 3 else make_model()

In [None]:
# Layer-by-layer overview of the selected model.
model.summary()

In [None]:


# Train on the generator batches and validate after every epoch.
# Step counts are left to Keras, which takes them from len() of each generator.
hist = model.fit(
        train_gen,
        validation_data = val_gen,
        class_weight = class_weights,
        epochs = EPOCHS,
        #steps_per_epoch=STEPS_PER_EPOCH,
        #validation_steps= VALIDATION_STEPS,  
        verbose = 1
        )

In [None]:
# Names of the curves recorded during training.
hist.history.keys()

In [None]:
#hist.history

In [None]:
# Learning curves: loss on the left, AUC on the right.
fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(hist.history['loss'], marker="o", c="red", label="Training loss")
loss_ax.plot(hist.history['val_loss'], marker="x", c="green", label="Validation val_loss")
loss_ax.set_title('Loss')
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('binary cross-entropy')
loss_ax.legend()
loss_ax.grid(False)

# Look the AUC key up instead of hard-coding 'auc': Keras may append a
# suffix to auto-generated metric names (e.g. 'auc_1') when a model is
# built more than once in the same session.
auc_key = next((key for key in hist.history if key.startswith('auc')), None)
if auc_key is not None:
    auc_ax.plot(hist.history[auc_key], marker="o", c="red", label=f"Training {auc_key}")
    auc_ax.plot(hist.history[f'val_{auc_key}'], marker="x", c="green", label=f"Validation val_{auc_key}")
    auc_ax.legend()
auc_ax.set_title('AUC')
auc_ax.set_xlabel('epoch')
auc_ax.set_ylabel('AUC')
auc_ax.grid(False)

fig.tight_layout()
plt.show()

 Predict results

In [None]:
# Unlabelled loader over the test table: one sample per batch, fixed order,
# so predictions come back in the same order as the rows of test_df.
sub_gen = Preprocess_data(
                        df = test_df,  
                        batch_size = 1,
                        mode = 'predict',
                        shuffle = False,
                        aug = False,
                        n_classes = CLASSES_NUM)

In [None]:
# Score every test sample served by sub_gen.
y_pred = model.predict(sub_gen,
                       #use_multiprocessing=True, workers=4,
                       verbose=1
                       ) 

Submission

In [None]:
# Fresh copy of the submission template (full length, untouched by DEBUG).
sub_df = pd.read_csv("../input/seti-breakthrough-listen/sample_submission.csv")

In [None]:
# In DEBUG mode test_df was cut to 31 rows, so y_pred would not line up with
# the full-length template; the template's own target values are kept then.
if not DEBUG:
    sub_df["target"] = y_pred

In [None]:
# Write the submission file (without the index column) and preview it.
sub_df.to_csv('submission.csv', index = False)

print('submission saved')
sub_df.head()    