In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#!pip install tensorflow==2.6.0
import tensorflow as tf


from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import StratifiedKFold
import tensorflow_addons as tfa
from sklearn.utils import class_weight
import os 
from tensorflow.keras import layers

from sklearn.metrics import confusion_matrix
from tensorflow.keras import layers
import glob

In [None]:
import random

In [None]:
#!pip install tensorflow-addons==0.13.0
import tensorflow_addons  as tfa

In [None]:
try:
    import efficientnet.keras as efn
except:
    
    !pip install -U efficientnet
    import efficientnet.keras as efn


In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU can be resolved,
# otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Still a best-effort fallback, but no longer swallows KeyboardInterrupt /
    # SystemExit, and the reason for falling back is shown instead of hidden.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Switch read by the TFRecord export cell (that cell additionally ANDs it with False).
CREATE_TF_RECORD = True

In [None]:
# Load the training labels and derive, for every sample, its local path and
# the matching path inside the dataset's GCS bucket.
COMPETITION_NAME = "seti-breakthrough-listen"
LOCAL_DS_PATH = "../input/" + COMPETITION_NAME
train_csv = LOCAL_DS_PATH + "/train_labels.csv"

train_df_master = pd.read_csv(train_csv)
# Files live in sub-folders named after the first character of the sample id.
train_df_master["path"] = train_df_master["id"].apply(
    lambda x: LOCAL_DS_PATH + "/train/" + str(x[0]) + "/" + x + ".npy"
)

# Name the dataset explicitly (as the submission cell does): this notebook
# attaches more than one dataset, so an argument-less lookup is ambiguous.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(COMPETITION_NAME)
train_df_master["tpu_path"] = train_df_master["path"].apply(
    lambda x: x.replace(LOCAL_DS_PATH, GCS_DS_PATH)
)
train_df_master.head()

In [None]:
# Global settings shared by the data pipeline and the model.
CFG = dict(
    IMG_LENGTH=256,
    IMG_WIDTH=256,
    CHANNELS=3,
    RANDOM_STATE=100,
    BATCH_SIZE=8 * 50 * strategy.num_replicas_in_sync,  # 400 samples per replica
    FOLDS=5,
    LEARNING_RATE=0.1,
)

In [None]:
# GCS location of the pre-built 256x256 TFRecord dataset.
gcs_path = KaggleDatasets().get_gcs_path("seti-tfrecord-256x256")
# `glob.glob` only sees the local filesystem, so it always returned [] for a
# gs:// URI; TensorFlow's gfile glob understands GCS paths.
tf_rec_file_list = tf.io.gfile.glob(gcs_path + "/*")

In [None]:

# Hold out 20% of the labelled samples, stratified on the target.
# (The TFRecord GCS lookup that was repeated verbatim here is already done by
# the previous cell, so the redundant remote call is dropped.)
train_df_master_2 = train_df_master
train_df, test_df = train_test_split(
    train_df_master_2,
    train_size=0.8,
    random_state=CFG["RANDOM_STATE"],
    shuffle=True,
    stratify=train_df_master_2["target"],
)

print("number of samples for train data set  = {} ".format(len(train_df)))
print("number of samples for test data set  = {} ".format(len(test_df)))


In [None]:
## Creating data generator which can work on Both TPU + GPU

def decode_numpy(  channel  ):
    """Build a ``tf.data`` map function turning ``(file_name, target)`` into ``(image, target)``.

    Parameters
    ----------
    channel : None or int in {0, 1, 2}
        Which of the three retained planes (rows 0, 2 and 4 of the raw
        ``(6, 273, 256)`` array) to use. ``None`` draws one at random for
        every sample.

    Returns
    -------
    callable
        ``decode(file_name, target)`` returning a ``(256, 256, 3)`` image in
        which the selected plane is repeated three times, and the target cast
        to ``float32``.

    Raises
    ------
    ValueError
        If ``channel`` is neither ``None`` nor 0, 1 or 2 (previously ``None``
        was returned silently and the failure only showed up inside ``map``).
    """
    if channel is not None and channel not in (0, 1, 2):
        raise ValueError("channel must be None, 0, 1 or 2, got {!r}".format(channel))

    def read_image(file_name):
        raw = tf.io.read_file(file_name)
        values = tf.io.decode_raw(raw, tf.float16)
        # Skip the first 64 float16 values (presumably the .npy header -
        # confirm), then view the rest in the data's original shape.
        cadence = tf.reshape(values[64:], (6, 273, 256))
        planes = tf.stack((cadence[0], cadence[2], cadence[4]), axis=2)
        planes = tf.image.resize(planes, (256, 256))

        if channel is None:
            # Must be a TF op: a NumPy draw would run once while the function
            # is traced and freeze the same channel for the whole dataset.
            index = tf.random.uniform([], minval=0, maxval=3, dtype=tf.int32)
        else:
            index = channel
        plane = planes[:, :, index]
        return tf.stack((plane, plane, plane), axis=2)

    def decode(file_name, target):
        return read_image(file_name), tf.cast(target, tf.float32)

    return decode

def data_augmentation( ):
    """Return a ``tf.data`` map function applying random flips and a random contrast change.

    The returned ``add_augmentation(image, target)`` flips the image
    left/right, then up/down, then adjusts contrast with a factor drawn from
    [0.2, 0.5]; every op is seeded with ``CFG["RANDOM_STATE"]``. The target is
    passed through unchanged.
    """

    def add_augmentation( image, target ):
        seed = CFG["RANDOM_STATE"]
        flipped = tf.image.random_flip_left_right(image, seed=seed)
        flipped = tf.image.random_flip_up_down(flipped, seed=seed)
        return tf.image.random_contrast(flipped, 0.2, 0.5, seed=seed), target

    return add_augmentation

def datagenerator_rev_02(df,test = False,channel = None ):
    """Build the batched ``tf.data`` pipeline for the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``tpu_path`` and ``target`` columns.
    test : bool
        ``False`` (training): shuffle, repeat forever and augment.
        ``True``: a single ordered, un-augmented pass.
    channel : None or int
        Forwarded to ``decode_numpy``.

    Returns
    -------
    tf.data.Dataset
        Batches of ``CFG["BATCH_SIZE"]`` ``(image, target)`` pairs, prefetched.
    """
    file_list = df["tpu_path"].to_list()
    target = df["target"].to_list()
    decode_tf = decode_numpy( channel )

    datagen = tf.data.Dataset.from_tensor_slices( (file_list, target) )
    if not test:
        # Shuffle the cheap (path, label) pairs over the full set and *before*
        # repeat(): epochs no longer bleed into each other, and the shuffle
        # buffer holds strings instead of 1024 decoded images.
        datagen = datagen.shuffle(max(len(file_list), 1)).repeat()
    datagen = datagen.map(decode_tf, num_parallel_calls=tf.data.AUTOTUNE)
    if not test:
        datagen = datagen.map(data_augmentation(), num_parallel_calls=tf.data.AUTOTUNE)
    datagen = datagen.batch(CFG["BATCH_SIZE"])
    return datagen.prefetch(tf.data.AUTOTUNE)

In [None]:
## Creating data generator which can work on Both TPU + GPU

def decode_numpy(    ):
    """Build a ``tf.data`` map function decoding ``(file_name, target, channel)``.

    Returns
    -------
    callable
        ``decode(file_name, target, channel)`` returning ``(image, target)``.
        ``channel`` is the logical index 0, 1 or 2 and selects row 0, 2 or 4
        of the raw ``(6, 273, 256)`` array. The image is that plane repeated
        three times and resized to ``(256, 256)``; the target is cast to
        ``float32``.
    """

    def read_image(file_name, channel ):
        raw = tf.io.read_file(file_name)
        # Skip the first 64 float16 values (presumably the .npy header -
        # confirm), then view the rest in the data's original shape.
        cadence = tf.reshape(tf.io.decode_raw(raw, tf.float16)[64:], (6, 273, 256))
        # Logical channel c lives in row 2*c. The former pair of consecutive
        # `if`s turned 1 into 2 and then that 2 into 4, so channels 1 and 2
        # both read row 4.
        plane = cadence[channel * 2]
        return tf.image.resize(tf.stack((plane, plane, plane), axis=2), (256, 256))

    def decode( file_name,target,channel ):
        return read_image(file_name, channel), tf.cast(target, tf.float32)

    return decode
    
    
def data_augmentation( ):
    """Return a ``tf.data`` map function applying random flips and a random contrast change.

    The returned ``add_augmentation(image, target)`` flips the image
    left/right, then up/down, then adjusts contrast with a factor drawn from
    [0.3, 0.8]; every op is seeded with ``CFG["RANDOM_STATE"]``. The target is
    passed through unchanged.
    """

    def add_augmentation( image, target ):
        seed = CFG["RANDOM_STATE"]
        mirrored = tf.image.random_flip_left_right(image, seed=seed)
        mirrored = tf.image.random_flip_up_down(mirrored, seed=seed)
        adjusted = tf.image.random_contrast(mirrored, 0.3, 0.8, seed=seed)
        return adjusted, target

    return add_augmentation

def datagenerator_rev_02(df,test = False, channel =None  ):
    """Build the batched ``tf.data`` pipeline for the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``tpu_path`` and ``target`` columns.
    test : bool
        ``False`` (training): shuffle, repeat forever and augment.
        ``True``: a single ordered, un-augmented pass.
    channel : None or int in {0, 1, 2}
        Channel decoded for every sample. ``None`` draws a random channel per
        sample; previously the default built a list of ``None`` that
        ``from_tensor_slices`` cannot convert to a tensor.

    Returns
    -------
    tf.data.Dataset
        Batches of ``CFG["BATCH_SIZE"]`` ``(image, target)`` pairs, prefetched.
    """
    file_list = df["tpu_path"].to_list()
    target = df["target"].to_list()
    decode_tf = decode_numpy(  )

    if channel is None:
        def decode_sample(file_name, label):
            # Drawn with a TF op so every element gets its own channel.
            random_channel = tf.random.uniform([], minval=0, maxval=3, dtype=tf.int32)
            return decode_tf(file_name, label, random_channel)
    else:
        def decode_sample(file_name, label):
            return decode_tf(file_name, label, channel)

    datagen = tf.data.Dataset.from_tensor_slices( (file_list, target) )
    if not test:
        # Shuffle the cheap (path, label) pairs over the full set and *before*
        # repeat(): epochs no longer bleed into each other, and the shuffle
        # buffer holds strings instead of 1024 decoded images.
        datagen = datagen.shuffle(max(len(file_list), 1)).repeat()
    datagen = datagen.map(decode_sample, num_parallel_calls=tf.data.AUTOTUNE)
    if not test:
        datagen = datagen.map(data_augmentation(), num_parallel_calls=tf.data.AUTOTUNE)
    datagen = datagen.batch(CFG["BATCH_SIZE"])
    return datagen.prefetch(tf.data.AUTOTUNE)

# File to TFRecord conversion


In [None]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def train_serialize_example(image, img_id, target ):
    """Serialize one training record to a ``tf.train.Example`` byte string.

    ``image`` and ``img_id`` are stored as bytes features under ``image`` and
    ``image_id``; ``target`` is stored as an int64 feature under ``target``.
    """
    features = tf.train.Features(
        feature={
            'image': _bytes_feature(image),
            'image_id': _bytes_feature(img_id),
            'target': _int64_feature(target),
        }
    )
    return tf.train.Example(features=features).SerializeToString()


In [None]:

if CREATE_TF_RECORD and False :
    
    !mkdir "256x256_channel_1_tf_record"
    each_file_contain = 100
    
    for i in range( 100, 200):
        print ( i )
        file_list = train_df_master["path"][i*each_file_contain: (i+1)*each_file_contain] 
        target_val = train_df_master["target"][i*each_file_contain: (i+1)*each_file_contain] 

        with tf.io.TFRecordWriter( "./256x256_channel_1_tf_record/256x256_tfrecord_" +str(i) + ".tfrec" ) as writer:
            for file_name, target in zip (file_list,target_val):
                np_data = np.load ( file_name )
                np_data= np.dstack(( np_data[0],np_data[2],np_data[4]))
                np_data =  tf.image.resize( np_data,( 256,256) ).numpy()
                np_data  = np_data.astype( np.float32 )
                file_id  = file_name.split("/")[-1].replace( ".npy","_chan_")
                for channel in ( 0, 1, 2 ):
                    example = train_serialize_example(np_data[:,:,channel].tobytes() , str.encode(file_id +"str(channel)"), target )
                    writer.write(example)
            writer.close()
            
    !zip -r  "./256x256_channel_1_tf_record_part1.zip" "./256x256_channel_1_tf_record"
    

# Reading TFRecords

In [None]:
## Code to decode TFRecords
def decode_image(image_data):
    """Decode a serialized image byte string into a flat ``float32`` tensor.

    No rescaling or reshaping happens here; the caller reshapes the result.
    """
    return tf.io.decode_raw(image_data, tf.float32)

def prepare_target(target):
    """Cast the label to ``float32`` and give it shape ``[1]``."""
    as_float = tf.cast(target, tf.float32)
    return tf.reshape(as_float, [1])

def read_labeled_tfrecord(example):
    """Parse one serialized example into an ``(image, target)`` pair.

    The stored image is a single 256x256 ``float32`` plane; it is repeated
    three times along a new last axis to form a ``(256, 256, 3)`` image. The
    target comes back as ``float32`` with shape ``[1]``.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),      # tf.string means bytestring
        "image_id": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),      # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    plane = tf.reshape(decode_image(parsed['image']), [256, 256])
    image = tf.stack((plane, plane, plane), axis=2)
    return image, prepare_target(parsed['target'])

def augmanet_data(image, target ):
    """Randomly flip ``image`` (left/right, then up/down) and change its contrast.

    The contrast factor is drawn from [0.3, 0.8]; every op is seeded with
    ``CFG["RANDOM_STATE"]``. ``target`` is returned unchanged.

    The unused ``mask`` local was removed: it was drawn with Python's
    ``random`` module and only fed cutout calls that are commented out.
    """
    seed = CFG["RANDOM_STATE"]
    image = tf.image.random_flip_left_right(image, seed=seed)
    image = tf.image.random_flip_up_down(image, seed=seed)
    image = tf.image.random_contrast(image, 0.3, 0.8, seed=seed)
    return image, target
    
                                        

def load_dataset(fileids, augment = False ,labeled=True, ordered=False):
    """Read TFRecord files into a dataset of ``(image, target)`` pairs.

    Parameters
    ----------
    fileids : list of str
        TFRecord file paths; they are read in parallel.
    augment, labeled :
        Accepted but not used by the current body (the augmentation map is
        disabled).
    ordered : bool
        When ``False`` the reader may emit records out of order for speed.
    """
    read_options = tf.data.Options()
    if not ordered:
        # Order does not matter for training; giving it up increases throughput.
        read_options.experimental_deterministic = False

    # Reads from several files at once and interleaves them automatically.
    records = tf.data.TFRecordDataset(fileids, num_parallel_reads=tf.data.AUTOTUNE)
    records = records.with_options(read_options)
    return records.map(read_labeled_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)


## Main function 
def get_training_dataset(file_ist,repeat = True, augment= True   ):
    """Build the batched, prefetched dataset used for fitting and validation.

    Records from ``file_ist`` are optionally repeated forever, shuffled with a
    20-element buffer seeded by ``CFG["RANDOM_STATE"]``, batched to
    ``CFG["BATCH_SIZE"]`` and prefetched. ``augment`` is forwarded to
    ``load_dataset``.
    """
    dataset = load_dataset(file_ist, augment, labeled=True, ordered=False)
    if repeat:
        # The training dataset must repeat for several epochs.
        dataset = dataset.repeat()
    return (
        dataset
        .shuffle(20, seed=CFG["RANDOM_STATE"])
        .batch(CFG["BATCH_SIZE"])
        .prefetch(tf.data.AUTOTUNE)  # fetch the next batch while training
    )

#data_gen = get_training_dataset( train_files[:2],repeat = True, augment= True  )

In [None]:
def short_effnet_model():
    """Build and compile the EfficientNet-B0 binary classifier.

    Architecture: input ``(IMG_LENGTH, IMG_WIDTH, CHANNELS)`` from ``CFG`` ->
    EfficientNet-B0 backbone ("noisy-student" weights, no top, all layers
    trainable) -> Flatten -> Dense(1, sigmoid).

    Changes against the earlier draft:

    * The loss is now built with ``from_logits=False``: the head already
      applies a sigmoid, so ``from_logits=True`` squashed the output twice.
    * The GaussianNoise / RandomCrop / RandomFlip / RandomZoom /
      RandomContrast layers were created, but their output was never fed into
      the backbone (the model output was traced from ``model_input``
      directly). They had no effect on the model and are omitted.

    Returns
    -------
    tf.keras.Model
        Compiled with Adam, binary cross-entropy and an AUC metric.
    """
    input_shape = (CFG["IMG_LENGTH"], CFG["IMG_WIDTH"], CFG["CHANNELS"])
    model_input = layers.Input(shape=input_shape, name="encoder_input_layer")

    # NOTE(review): `pooling` is normally None / 'avg' / 'max'; True appears to
    # select no pooling, which is why the feature map is flattened below -
    # confirm before changing it (it affects the shape of saved weights).
    efff_net = efn.EfficientNetB0(
        include_top=False,
        weights="noisy-student",
        input_shape=input_shape,
        input_tensor=model_input,
        classes=2,
        pooling=True,
        drop_connect_rate=0.7,
    )

    for layer in efff_net.layers:
        layer.trainable = True

    layer_00 = efff_net.layers[-1].output
    layer_01 = layers.Flatten()(layer_00)
    layer_02 = layers.Dense(1, activation="sigmoid")(layer_01)
    model_short = tf.keras.Model(inputs=model_input, outputs=layer_02)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00126000004 / 2)
    model_short.compile(
        optimizer=optimizer,
        # The model emits probabilities (sigmoid head), not logits.
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[tf.keras.metrics.AUC()],
    )

    return model_short

In [None]:
# Build the model and print its layer summary.
model_effnet = short_effnet_model()
model_effnet.summary()

In [None]:
# GCS folder holding the TFRecord shards; the training cell prefixes file
# names with `gcs_path`.
gcs_path = KaggleDatasets().get_gcs_path("seti-tfrecord-256x256") + "/256x256_channel_tf_record"
# `glob.glob` only sees the local filesystem, so it always returned [] for a
# gs:// URI; TensorFlow's gfile glob understands GCS paths.
tf_rec_file_list = tf.io.gfile.glob(gcs_path + "/*.tfrec")
tf_rec_file_list[:5]

In [None]:
CFG["RANDOM_STATE"] =3000
# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the seeded split selects the same files on every run.
local_tfrec_files = sorted(glob.glob("../input/seti-tfrecord-256x256/256x256_channel_tf_record/*.tfrec"))
train_files, val_files = train_test_split(
    local_tfrec_files,
    train_size=0.8,
    random_state=CFG["RANDOM_STATE"],
    shuffle=True,
)


In [None]:
# Class weights for the training TFRecord files: weight of the positive class
# = (#negatives / #positives) among the samples stored in `train_files`.
# NOTE: the name `class_weight` shadows the sklearn.utils import of the same
# name; it is kept because the training cell passes it to `fit`.
SAMPLES_PER_TFREC = 200  # each .tfrec file is assumed to cover 200 consecutive label rows

# Row r of the label table belongs to file r // SAMPLES_PER_TFREC. Computed
# from the table length instead of a hard-coded 300 x 200 Python list.
group_list = (np.arange(len(train_df_master)) // SAMPLES_PER_TFREC).tolist()
train_df_master["group_tfrec"] = group_list

# Number of positive samples per file index.
anamally_count_group = train_df_master[["group_tfrec", "target"]].groupby( by = "group_tfrec").sum().reset_index()

# File indices parsed from names like 256x256_tfrecord_<i>.tfrec.
# (Despite its name, this list describes the *training* files.)
test_file_group  = [ int( x.split("/")[-1].replace("256x256_tfrecord_","").replace(".tfrec","")) for x in train_files ]

anamally_sum = anamally_count_group["target"].loc[ test_file_group ].sum()

class_weight  = { 0:1, 1: ( ( ( len(train_files)* SAMPLES_PER_TFREC ) - anamally_sum )  / anamally_sum ) }

In [None]:
# Display the class-weight mapping computed in the previous cell.
class_weight

In [None]:

    
# Create the model under the distribution strategy, then fit it on the
# TFRecord shards stored in GCS.
with strategy.scope():
    model_effnet = short_effnet_model()

lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.45,
    patience=2,
    cooldown=2,
    min_delta=0.2,
    min_lr=0.000001,
    verbose=1,
)

# Every file is counted as 200 samples x 3 channel records.
train_file_count = len(train_files) * 200 * 3
val_file_count = len(val_files) * 200 * 3

CFG["BATCH_SIZE"] = 16 * 45 * strategy.num_replicas_in_sync

# Ceiling division: a final partial batch still counts as one step.
CFG["TRAIN_STEPS"] = -(-train_file_count // CFG["BATCH_SIZE"])
CFG["VAL_STEPS"] = -(-val_file_count // CFG["BATCH_SIZE"])

#model_effnet.load_weights("../input/seti-gpu-rev-01-model/Efficient_Net_Model_Rev_01.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    f'model{1}.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
)

# The split was made on local paths; re-root the file names onto the GCS folder.
train_gcs_files = [gcs_path + "/" + x.split("/")[-1] for x in train_files]
val_gcs_files = [gcs_path + "/" + x.split("/")[-1] for x in val_files]

train_dataset = get_training_dataset(train_gcs_files, repeat=True, augment=True)
val_dataset = get_training_dataset(val_gcs_files, repeat=True, augment=False)

model_history = model_effnet.fit(
    train_dataset,
    class_weight=class_weight,
    steps_per_epoch=CFG["TRAIN_STEPS"],
    epochs=12,
    validation_data=val_dataset,
    validation_steps=CFG["VAL_STEPS"],
    callbacks=[checkpoint, lr_reducer],
)


In [None]:
# Guards the inference / submission cells below (each one starts with `if TEST:`).
TEST = False

In [None]:

if TEST:
    ## Creating data generator which can work on Both TPU + GPU

    def decode_numpy(    ):
        """Build a ``tf.data`` map function decoding ``(file_name, target, channel)``.

        ``channel`` is the logical index 0, 1 or 2 and selects row 0, 2 or 4
        of the raw ``(6, 273, 256)`` array. The returned image is that plane
        repeated three times and resized to ``(256, 256)``; the target is cast
        to ``float32``.
        """

        def read_image(file_name, channel ):
            raw = tf.io.read_file(file_name)
            # Skip the first 64 float16 values (presumably the .npy header -
            # confirm), then view the rest in the data's original shape.
            cadence = tf.reshape(tf.io.decode_raw(raw, tf.float16)[64:], (6, 273, 256))
            # Logical channel c lives in row 2*c. The former pair of
            # consecutive `if`s turned 1 into 2 and then that 2 into 4, so
            # channels 1 and 2 both read row 4.
            plane = cadence[channel * 2]
            return tf.image.resize(tf.stack((plane, plane, plane), axis=2), (256, 256))

        def decode( file_name,target,channel ):
            return read_image(file_name, channel), tf.cast(target, tf.float32)

        return decode

    def datagenerator_rev_03(df,test = False,channel = 0  ):
        """Build the batched, un-augmented ``tf.data`` pipeline for the rows of ``df``.

        ``df`` must provide ``tpu_path`` and ``target``; ``channel`` (0, 1 or
        2) is decoded for every sample. With ``test=False`` the (path, label)
        pairs are shuffled and repeated forever; with ``test=True`` a single
        ordered pass is produced.
        """
        file_list = df["tpu_path"].to_list()
        target = df["target"].to_list()
        df_channel = [channel]*df.shape[0]

        decode_tf = decode_numpy(  )

        datagen = tf.data.Dataset.from_tensor_slices( (file_list,target, df_channel ))
        if not test:
            # Shuffle the cheap tuples over the full set before repeat() and
            # before decoding, so the buffer never holds decoded images.
            datagen = datagen.shuffle(max(len(file_list), 1)).repeat()
        datagen = datagen.map( decode_tf ,num_parallel_calls= tf.data.AUTOTUNE )
        datagen = datagen.batch(CFG["BATCH_SIZE"])
        datagen = datagen.prefetch(tf.data.AUTOTUNE )
        return  datagen

In [None]:
if TEST:
    # Build the submission frame: GCS path of every test file plus a
    # placeholder target column (the data generator expects one).
    submission_gcs_path = KaggleDatasets().get_gcs_path("seti-breakthrough-listen") +"/test/"
    submission_df = pd.read_csv("../input/seti-breakthrough-listen/sample_submission.csv")

    def _test_file_path(sample_id):
        # Files live in sub-folders named after the first character of the id.
        return submission_gcs_path + str(sample_id[0]) + "/" + sample_id + ".npy"

    submission_df["tpu_path"] = submission_df["id"].apply(_test_file_path)
    submission_df["target"] = [1] * len(submission_df)


In [None]:
if TEST:
    # Predict once per channel, then keep the highest score of the three.
    CFG["BATCH_SIZE"]= 20 * strategy.num_replicas_in_sync
    prediction_columns = []
    for channel, done_message in (
        (0, "completed channel-0"),
        (1, "completed channel -1"),
        (2, "completed channel -2"),
    ):
        submission_data_gen = datagenerator_rev_03( submission_df, True, channel )
        column = "prediction_Channel_" + str(channel)
        # The model has a single output unit, so predict() yields shape (n, 1);
        # flatten it so the column assignment never depends on 2-D handling.
        submission_df[column] = model_effnet.predict( submission_data_gen ).ravel()
        print (done_message)
        prediction_columns.append(column)
    submission_df["target"] = submission_df[prediction_columns].max(axis=1)

In [None]:
if TEST:
    # Final score per sample = highest of the three per-channel predictions.
    channel_scores = submission_df[["prediction_Channel_0","prediction_Channel_1","prediction_Channel_2"]]
    submission_df["target"] = channel_scores.apply(lambda row: np.max(row), axis=1)

In [None]:
if TEST:
    # Write the two submission columns, without the index.
    submission_columns = submission_df[["id", "target"]]
    submission_columns.to_csv("./sample_submission.csv", index=False)