In [None]:
!pip install -q efficientnet

import numpy as np
import numpy.linalg as lin
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import time
import os
import sklearn
import matplotlib.pyplot as plt
import albumentations as A
import efficientnet.tfkeras as efn
from PIL import Image
import random
from kaggle_datasets import KaggleDatasets

import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optimizers
import tensorflow.keras.metrics as tfmetrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense,Input,Dropout,GlobalAveragePooling2D,GlobalMaxPool2D,\
BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Seed handed to every random number generator the notebook seeds.
SEED = 31
# Target [height, width] of the images fed to the network.
SHAPE = [528, 528]
# Accelerator selector: the literal 0 keeps "TPU"; replace it with 1 to get "CPU".
DEVICE = "TPU" if not 0 else "CPU"
# GCS location of the TFRecord dataset, resolved through the Kaggle helper.
PATH = KaggleDatasets().get_gcs_path("seti-tfrec-dataset-256")

In [None]:
def seeding(SEED):
    """Seed the random number generators used by this notebook.

    Seeds NumPy, Python's ``random`` module and TensorFlow, and exports the
    ``PYTHONHASHSEED`` / ``TF_CUDNN_DETERMINISTIC`` environment variables.

    Args:
        SEED: integer seed value.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # TF_CUDNN_DETERMINISTIC is an on/off switch, not a seed: it has to be a
    # boolean-like value ('1'/'true'). Writing the seed itself (e.g. '31')
    # does not enable deterministic cuDNN kernels.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    tf.random.set_seed(SEED)
    print('seeding done!!!')


seeding(SEED)

## TPU

In [None]:
# Pick a tf.distribute strategy: try the TPU first and fall back to the
# default (CPU / single GPU) strategy whenever the TPU cannot be used.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # `except _:` is not a valid handler (`_` is not an exception
            # class), and a failed initialisation used to leave `strategy`
            # undefined. Report the error and fall back to the default strategy.
            print("failed to initialize TPU")
            print(err)
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
# Per-replica batch size; get_dataset batches with BATCH * REPLICAS.
BATCH = 16
print(f'REPLICAS: {REPLICAS}')

## Visualize

In [None]:
def show_sample(array,label=None):
    """Plot the first six slices of ``array`` as vertically stacked panels.

    Args:
        array: object with ``.astype`` that can be indexed 0..5 along its
            first axis; each slice is drawn with ``matshow``.
        label: optional value shown in the title of the top panel.
    """
    panels = array.astype(np.float32)
    fig = plt.figure(figsize=(14, 14))

    for position in range(1, 7):
        axis = fig.add_subplot(6, 1, position)
        # Only the first (top) panel carries the label title.
        if position == 1 and label is not None:
            axis.set_title(f"label : {label}")
        axis.matshow(panels[position - 1], aspect="auto")
    plt.show()

In [None]:
def new_im(array):
    """Stack the first six slices of ``array`` along axis 0, resize, and show.

    The slices are concatenated into one float32 image, resized to 256x256
    with albumentations' ``resize`` and displayed with ``matshow``.
    """
    tiles = tuple(array[idx] for idx in range(6))
    stacked = np.concatenate(tiles, axis=0).astype(np.float32)
    stacked = A.resize(stacked, 256, 256)

    fig = plt.figure(figsize=(7, 7))
    axis = fig.add_subplot(1, 1, 1)
    axis.matshow(stacked, aspect="auto")
    plt.show()


new_im(np.load("../input/seti-breakthrough-listen/test/7/7ff1a14a9d96.npy"))

In [None]:
# Display the first two positively labelled training samples.
labels = pd.read_csv("../input/seti-breakthrough-listen/train_labels.csv")
positive_ids = labels.loc[labels.target == 1, "id"].head(2)
for sample_id in positive_ids:
    # The sample files appear to be sharded into sub-directories named after
    # the first character of the id (cf. "test/7/7ff1a14a9d96.npy"), so a
    # hard-coded "train/0/" only works for ids that start with "0".
    sample_path = f"../input/seti-breakthrough-listen/train/{sample_id[0]}/{sample_id}.npy"
    show_sample(np.load(sample_path), 1)

## Augment

In [None]:
# Albumentations pipeline: blur, Gaussian noise, inversion, multiplicative
# noise and brightness jitter, each applied with its own probability `p`.
# NOTE(review): nothing in the visible part of the notebook uses this object.
transformation = A.Compose(
    [
        A.Blur(blur_limit=(3, 7), p=0.8),
        A.GaussNoise(var_limit=(160, 250), p=0.9),
        A.InvertImg(p=0.2),
        A.MultiplicativeNoise(
            multiplier=(0.2, 1.4),
            per_channel=True,
            elementwise=False,
            p=0.9,
        ),
        A.RandomBrightness(limit=(-0.2, 0.2), p=0.8),
    ]
)


def gauss_noise(x,cache):
    """Add noise assembled from pre-generated planes in ``cache`` to ``x``.

    Six indices are drawn from [0, 1024); each of the three output channels
    is the sum of two cached planes picked by a consecutive index pair. The
    result is returned without clipping.

    Args:
        x: tensor the noise is added to.
        cache: tensor indexed along axis 0 by the drawn indices; presumably
            it holds at least 1024 planes matching x's spatial shape —
            verify against the caller.
    """
    picks = tf.random.uniform([6], 0, 1024, dtype=tf.int32)
    channel_planes = [
        cache[picks[2 * ch]] + cache[picks[2 * ch + 1]]
        for ch in range(3)
    ]
    noise = tf.stack(channel_planes, axis=-1)
    x += noise
    return x
        
def multiply_ch(x,val):
    """Scale the first three channels of ``x`` by ``val[0..2]``; return float16.

    Args:
        x: rank-3 tensor indexed as ``x[:, :, channel]``.
        val: indexable holding one multiplier per channel.
    """
    scaled_channels = []
    for channel in range(3):
        scaled_channels.append(x[:, :, channel] * val[channel])
    restacked = tf.stack(scaled_channels, axis=-1)
    return tf.cast(restacked, tf.float16)

def spatial(img,n_holes,size,channel_wise=False):
    """Zero out ``n_holes`` random rectangular regions of ``img`` (cutout-style).

    For every hole a top-left pivot and a size are drawn at random, the hole
    is clipped to the image bounds, and the image is rebuilt by concatenating
    the untouched strips around a block of zeros.

    Args:
        img: image tensor indexed as ``img[row, col, channel]``. The zero
            block is created with 3 channels and dtype float16, so img is
            presumably a 3-channel float16 tensor — verify against caller.
        n_holes: number of holes to cut.
        size: ``(min, max)`` pair; hole sizes are drawn from
            ``tf.random.uniform(..., size[0], size[1])`` for both axes.
        channel_wise: not used anywhere in the body.

    Returns:
        The image with holes applied, reshaped to ``SHAPE + [3]``.
    """
    # size: (min,max) of hole dimensions
    #
    #
    
    dims = img.shape
    # The row extent of each hole is multiplied by the integer height/width
    # ratio of the image; the column extent is left unscaled.
    scaling = tf.constant([dims[0]//dims[1],1],dtype=tf.int32)
    for i in range(n_holes):
        # Top-left corner of the hole.
        # NOTE(review): the row bound subtracts size[0] and the column bound
        # size[1], i.e. the min and max of the size range — confirm intent.
        hole_pivot1 = tf.random.uniform([],0,dims[0]-size[0],dtype=tf.int32)
        hole_pivot2 = tf.random.uniform([],0,dims[1]-size[1],dtype=tf.int32)
        hole_pivot = tf.stack([hole_pivot1,hole_pivot2],axis=0)
        hole_size = tf.random.uniform([2],size[0],size[1],dtype=tf.int32)*scaling
        # Clip the bottom-right corner to the image and recompute the
        # effective hole size from the clipped corner.
        hole_end = tf.math.minimum(hole_pivot + hole_size,dims[:2])
        hole_size = hole_end - hole_pivot
        # Split the image into the strips surrounding the hole ...
        top = img[:hole_pivot[0],:,:]
        bottom = img[hole_end[0]:,:,:]
        mid_left = img[hole_pivot[0]:hole_end[0],:hole_pivot[1],:]
        mid_right = img[hole_pivot[0]:hole_end[0],hole_end[1]:,:]
        # ... and stitch them back together around a block of zeros.
        mid_mid = tf.zeros([hole_size[0],hole_size[1],3],dtype=tf.float16)
        mid = tf.concat([mid_left,mid_mid,mid_right],axis=1)
        # Reshaping to the module-level SHAPE restores a fixed shape after the
        # dynamic slicing; this ties the function to inputs of SHAPE + [3].
        img = tf.reshape(tf.concat([top,mid,bottom],axis=0),SHAPE+[3])
    return img

## Data

In [None]:
def num_of_samples(filenames):
    """Return the total sample count encoded in the given file names.

    Every name contributes ``int(name[-14:-10])``, i.e. the four characters
    that precede a 10-character suffix such as ".tfrecords".
    """
    return sum(int(name[-14:-10]) for name in filenames)

def get_dataset(filenames,
                target=True,
                transform=True,
                resize=True,
                num_holes_param=(6,8),
                hole_size_param=(16,24),
               ):
    """Build the batched, endlessly repeating tf.data pipeline for TFRecords.

    Args:
        filenames: TFRecord files to read.
        target: if True each element is ``(image, target)``; otherwise
            ``(image, im_name)``.
        transform: if True the random augmentations are applied.
        resize: if True images are resized to ``SHAPE`` with bicubic
            interpolation. The augmentation step reshapes to ``SHAPE + [3]``,
            so ``transform=True`` presumably requires ``resize=True``.
        num_holes_param: ``(min, max)`` bounds for the number of holes drawn
            per image.
        hole_size_param: ``(min, max)`` hole size forwarded to ``spatial``.

    Returns:
        A ``tf.data.Dataset`` batched by ``BATCH * REPLICAS``, repeated
        indefinitely and prefetched.
    """
    ds = tf.data.TFRecordDataset(filenames)
    feature_descr = {
        "image":tf.io.FixedLenFeature([], tf.string),
        "im_name":tf.io.FixedLenFeature([],tf.string),
        "target":tf.io.FixedLenFeature([],tf.int64,default_value=0)
    }
    def parser(example):
        """Parse one serialized example into (image bytes, target or name)."""
        example = tf.io.parse_single_example(example, feature_descr)
        if target:
            return example["image"],example["target"]
        else:
            return example["image"],example["im_name"]

    def preproc(img):
        """Decode the JPEG, optionally resize, boost contrast, scale to [0, 1]."""
        img = tf.reshape(tf.io.decode_jpeg(img),[273*6,256]+[1])
        if resize:
            img = tf.image.resize(
                    img, SHAPE, method='bicubic')
        img = tf.image.adjust_contrast(img,3)
        # Replicate the single channel three times for the RGB backbone.
        img = tf.concat([img for _ in range(3)],axis=-1)
        img = tf.cast(img,tf.float16)/255.0

        return img

    ds = ds.map(parser,num_parallel_calls=AUTO)
    ds = ds.map(lambda x,y:(preproc(x),y),num_parallel_calls=AUTO)

    if transform:
        # The noise cache is large (1024 planes of SHAPE in float16, roughly
        # 0.5 GB for SHAPE = [528, 528]), so it is only materialised when the
        # augmentation that may read it is actually part of the pipeline.
        normal_cache = tf.random.normal([1024]+SHAPE,0,0.01/np.sqrt(2),dtype=tf.float16)

        def augment(img):
            """Apply each augmentation with its configured probability."""
            # One probability per entry of `transforms` below; with these
            # values only the hole-cutting augmentation ever fires.
            probs = [1.0,0.0,0.0]

            n_holes = tf.random.uniform([],*num_holes_param,dtype=tf.int32)
            mult_channel = tf.random.uniform([3],0.8,1.2,dtype=tf.float16)

            transforms = [
                lambda x: spatial(x,n_holes,hole_size_param),
                lambda x: gauss_noise(x,normal_cache),
                lambda x: multiply_ch(x,mult_channel),
            ]

            for p,tr in zip(probs,transforms):
                if tf.random.uniform([])<p:
                    img = tr(img)

            return tf.reshape(img,SHAPE+[3])

        ds = ds.map(lambda x,y:(augment(x),y),num_parallel_calls=AUTO)
    ds = ds.batch(BATCH*REPLICAS).repeat().prefetch(AUTO)
    return ds

# Time how long it takes to pull three batches from the training pipeline,
# then plot up to six positively labelled images from each of those batches.
filenames = tf.io.gfile.glob(f"{PATH}/train/*.tfrecords")
dataset = get_dataset(filenames)
print("started")
start = time.time()
it = list(dataset.take(3))
print(time.time()-start)
for images, targets in it:
    shown = 0
    for image, target in zip(images, targets):
        if shown > 5:
            break
        if not (target == 1):
            continue
        plt.figure(figsize=(7, 7))
        plt.title(str(target.numpy()))
        # Cast to float32 before handing the image to imshow.
        plt.imshow(image.numpy()[:, :, :].astype(np.float32), aspect="auto")
        shown += 1


## Validation

## Model

In [None]:
def get_model():
    """Build and compile the binary classifier.

    An ImageNet-initialised EfficientNetB6 backbone (no top) is followed by
    global average pooling, batch normalisation, a 64-unit swish layer and a
    single sigmoid output. Compiled with Adam, binary cross-entropy and an
    AUC metric named "auc".
    """
    input_shape = tuple(SHAPE + [3])
    inputs = Input(input_shape)
    backbone = efn.EfficientNetB6(
        input_shape=input_shape,
        weights='imagenet',
        include_top=False,
    )
    features = backbone(inputs)
    pooled = GlobalAveragePooling2D()(features)
    normalised = BatchNormalization()(pooled)
    hidden = Dense(64, activation="swish")(normalised)
    outputs = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs, outputs)
    model.compile(
        optimizer=Adam(),
        loss=losses.BinaryCrossentropy(),
        metrics=[tfmetrics.AUC(name="auc")],
    )
    return model

def lr_func(epoch):
    """Return the learning rate for ``epoch`` from a fixed step schedule.

    Epochs 0-8 read their rate from the table; any other epoch gets 3E-6.
    """
    schedule = (1E-3, 1E-3, 5E-4, 3E-4, 1E-4, 5E-5, 3E-5, 1E-5, 5E-6)
    if epoch not in range(len(schedule)):
        return 3E-6
    return schedule[epoch]
    
def lr_fine(epoch):
    """Return the constant fine-tuning learning rate; ``epoch`` is ignored."""
    fine_tune_rate = 2E-6
    return fine_tune_rate
        
# Keras callbacks that apply lr_func / lr_fine at the start of every epoch.
lr_sch = tf.keras.callbacks.LearningRateScheduler(lr_func, verbose=False) 
lr_fine_sch = tf.keras.callbacks.LearningRateScheduler(lr_fine, verbose=False)

# Smoke test: build one model and run a forward pass on the images of the
# first cached batch in `it`.
# NOTE(review): this model is created outside `strategy.scope()`, and the
# 'b' / 'c' prints look like leftover progress markers — confirm they are
# still wanted.
m = get_model()
print('b')
i = it[0][0]
print('c')
m(i)

## Training

In [None]:
# 6-fold cross-validation over the TFRecord files. Every fold trains a fresh
# model in two stages: a scheduled-LR stage with the default augmentation,
# then a low constant-LR stage with fewer holes per image.
train_files = tf.io.gfile.glob(f"{PATH}/train/*.tfrecords")
test_files = tf.io.gfile.glob(f"{PATH}/test/*.tfrecords")

# `shuffle` must be passed by keyword: it is keyword-only in current
# scikit-learn releases, so KFold(6, True, ...) raises a TypeError there.
split = KFold(n_splits=6, shuffle=True, random_state=SEED)
for fold,(train_idx,val_idx) in enumerate(split.split(train_files)):
    fold_train_files = np.array(train_files)[train_idx]
    fold_val_files = np.array(train_files)[val_idx]

    train = get_dataset(fold_train_files)
    val = get_dataset(fold_val_files,transform=False)
    with strategy.scope():
        model = get_model()

    # One checkpoint callback is shared by both stages so that its record of
    # the best val_auc carries over. A fresh instance for stage 2 would start
    # from scratch and could overwrite better stage-1 weights.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(f"./weights_{fold}.h5",
                                                    monitor='val_auc',
                                                    verbose=0,
                                                    save_best_only=True,
                                                    save_weights_only=True,
                                                    mode='max',
                                                    save_freq='epoch')

    # The datasets repeat forever, so each epoch's length is set explicitly.
    steps = num_of_samples(fold_train_files) // (BATCH*REPLICAS)
    val_steps = num_of_samples(fold_val_files) // (BATCH*REPLICAS)
    print(f"steps: {steps}, val_steps: {val_steps}")

    # Stage 1: scheduled learning rate, default augmentation.
    model.fit(train,
              validation_data=val,
              epochs=6,
              steps_per_epoch=steps,
              validation_steps=val_steps,
              callbacks=[lr_sch,checkpoint],
              verbose=2)

    # Stage 2: constant fine-tuning rate with fewer holes per image. The
    # validation pipeline does not change, so `val` is reused.
    print("making augmentations lighter")
    train = get_dataset(fold_train_files,num_holes_param=(2,4))

    model.fit(train,
              epochs=4,
              steps_per_epoch=steps,
              validation_data=val,
              validation_steps=val_steps,
              callbacks=[lr_fine_sch,checkpoint],
              verbose=2)
    
  