In [None]:
import os, shutil
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from kaggle_datasets import KaggleDatasets
from IPython.display import display
from tqdm import tqdm
import skimage.measure

In [None]:
# Root directory of the input data; the CSV file names and the relative
# .npy paths used throughout the notebook are appended to this string.
BASE_PATH='/kaggle/input/seti-breakthrough-listen'

In [None]:
def get_dataset():
    """Load the label CSV and the sample-submission CSV as DataFrames.

    Two columns derived from 'id' are added to each frame:
      * 'dir'  - the first character of the id;
      * 'path' - '/train/<dir>/<id>.npy' or '/test/<dir>/<id>.npy', i.e. a
        path relative to BASE_PATH.

    Returns:
        tuple: (frame read from train_labels.csv, frame read from
        sample_submission.csv), each restricted to the columns
        'id', 'dir', 'path', 'target' in that order.
    """
    keep = ['id', 'dir', 'path', 'target']

    def first_char(sample_id):
        return sample_id[0]

    labelled = pd.read_csv(BASE_PATH + '/train_labels.csv')
    labelled['dir'] = labelled['id'].apply(first_char)
    labelled['path'] = labelled['id'].apply(lambda x: f'/train/{x[0]}/{x}.npy')

    submission = pd.read_csv(BASE_PATH + '/sample_submission.csv')
    submission['dir'] = submission['id'].apply(first_char)
    submission['path'] = submission['id'].apply(lambda x: f'/test/{x[0]}/{x}.npy')

    return labelled[keep], submission[keep]

In [None]:
# dfTrainTest: rows from train_labels.csv; dfSubmit: rows from sample_submission.csv.
dfTrainTest, dfSubmit = get_dataset()

In [None]:
# Show the first rows of the labelled frame (id, dir, path, target).
dfTrainTest.head()

In [None]:
# Show the first rows of the submission frame (id, dir, path, target).
dfSubmit.head()

In [None]:
# Number of rows in the labelled frame.
print(f'Dataset Count: {len(dfTrainTest)}\n')

# Absolute number of rows per distinct 'target' value.
print(f'Count of 0 and 1 in dataset: \n{dfTrainTest.target.value_counts()}')

# Same breakdown as fractions of the total (normalize=True).
print(f'\nPercentage of 0 and 1 in dataset: \n{dfTrainTest.target.value_counts(normalize=True)}')

In [None]:
# dfPositive=dfTrainTest[dfTrainTest.target==1]
# dfNegative=dfTrainTest[dfTrainTest.target==0]

# dfLIST=[]
# FOLDS_PATH='{}/{}/'
# FOLDS = len(dfNegative)//len(dfPositive) + int(len(dfNegative)%len(dfPositive)!=0)
# print('Folds:', FOLDS)

# for k in range(FOLDS):
#     folder_path = 'dataframes'
#     os.makedirs(folder_path, exist_ok=True)
#     dfNegative1=dfNegative.sample(n = len(dfPositive) if len(dfNegative)>len(dfPositive) else len(dfNegative))
#     dfNegative = dfNegative.drop(dfNegative1.index)
#     df=pd.concat((dfNegative1,dfPositive))
#     df.to_csv(folder_path+f'/df_{k:02}.csv')
#     dfLIST.append(df)

In [None]:
# Distinct label values present in the labelled frame.
print("Target Classes=",list(dfTrainTest.target.unique()))
print("Sample File Reading Np array.")
# Load the .npy file referenced by the row whose index label is 0.
nparray=np.load(BASE_PATH+dfTrainTest.loc[0,'path'])
# Despite the "File Size" wording, this prints the array's shape.
print("NPY File Size",nparray.shape)
# Shape after keeping every second entry along axis 0 and moving that axis
# last - the same indexing convert_np_img applies when stack=False.
nparray[::2].transpose(1, 2, 0).shape

## Preview Image

In [None]:
def convert_np_img(img, stack=True):
    """Turn a stack of 2-D planes into a channels-last float32 image.

    Args:
        img: 3-D array indexed as img[plane, row, col]. With ``stack=True``
            planes 0..5 are read.
        stack: If true, planes (0, 1), (2, 3) and (4, 5) are each joined
            side by side with ``np.hstack`` and the three resulting panels
            become the three channels. If false, every second plane
            (0, 2, 4, ...) is kept and the plane axis is moved to the end.

    Returns:
        np.ndarray of dtype float32 with the channel axis last.
    """
    if stack:
        # One channel per consecutive pair of planes, laid out left | right.
        panels = [
            np.hstack((img[first, :, :], img[first + 1, :, :]))
            for first in range(0, 6, 2)
        ]
        image = np.stack(panels, axis=2)
    else:
        image = img[::2].transpose(1, 2, 0)
    return image.astype(np.float32)
#         return np.stack((img[0,:,:],img[2,:,:], img[4,:,:]), axis=2).astype(np.float32)
#         return np.hstack((img[0,:,:],img[2,:,:], img[4,:,:])).astype(np.float32)

In [None]:
def preview_image(df, stack=True):
    """Plot one target==1 sample next to one target==0 sample.

    For each label the row at position 10 among the rows with that label is
    used; its 'path' is appended to BASE_PATH, loaded with ``np.load`` and
    converted with ``convert_np_img``.

    Args:
        df: DataFrame with 'target' and 'path' columns.
        stack: Forwarded to ``convert_np_img``.
    """
    def load_sample(label):
        # Index label of the row at position 10 among rows with this target.
        row_label = df[df['target'] == label]['path'].index[10]
        return np.load(BASE_PATH + df.loc[row_label, 'path'])

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

    raw_positive, raw_negative = load_sample(1), load_sample(0)
    target1_img = convert_np_img(raw_positive, stack)
    target0_img = convert_np_img(raw_negative, stack)

    # Left panel: target 1, right panel: target 0.
    left_ax, right_ax = axs
    left_ax.imshow(target1_img)
    left_ax.set_title(f'Target: 1 and Shape: {target1_img.shape}')
    right_ax.imshow(target0_img)
    right_ax.set_title(f'Target: 0 and Shape: {target0_img.shape}')
    plt.show()

### Preview Image

In [None]:
# Default stack=True: consecutive plane pairs joined side by side, one pair per channel.
print('Stacked Channel Image')
preview_image(dfTrainTest)

# stack=False: every second plane used directly as a channel.
print('\nSelective Channel Image')
preview_image(dfTrainTest, stack=False)

In [None]:
# def encode_img(path):
#     img = np.load(path)
#     img = img[::2,] # taking only 1, 3, 5 channels
#     img  = np.moveaxis(img, 0, -1)
#     img  = img.astype(np.float32)
#     img = cv2.resize(img, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
#     kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
#     img = cv2.filter2D(img, -1, kernel)
#     img = img[...,::-1]
#     img = cv2.imencode('.png', img)[1]
#     return img.tobytes()

# def encode_img(path):
#     img = np.load(path).astype(float)
#     img = np.vstack(img[::2])
#     img = skimage.measure.block_reduce(img, (3,1), np.max)
#     img = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, 0])[1]
#     return img.tobytes() 

def encode_img(path):
    """Load a .npy array, filter it and return it as PNG-encoded bytes.

    Steps: load and cast to Python ``float`` dtype, keep every second entry
    along axis 0, move that axis last, convolve with a 3x3 kernel (centre 9,
    all neighbours -1) via ``cv2.filter2D``, then PNG-encode with
    compression level 1.

    NOTE(review): the array handed to ``cv2.imencode`` is still floating
    point here - confirm how OpenCV converts it for PNG output (values
    outside the integer pixel range may be clipped).

    Args:
        path: Location of the .npy file to read.

    Returns:
        bytes: The encoded PNG buffer.
    """
    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])

    planes = np.load(path).astype(float)[::2]
    channels_last = planes.transpose(1, 2, 0)
    filtered = cv2.filter2D(channels_last, -1, sharpen_kernel)

    # imencode returns (success flag, buffer); only the buffer is needed.
    png_buffer = cv2.imencode('.png', filtered, [cv2.IMWRITE_PNG_COMPRESSION, 1])[1]
    return png_buffer.tobytes()

def _bytes_feature(value):
    """Wrap one string/bytes value in a tf.train.Feature (bytes_list).

    A value of the type returned by ``tf.constant`` is first unwrapped with
    ``.numpy()``, because BytesList cannot take the tensor itself.
    """
    tensor_type = type(tf.constant(0))
    if isinstance(value, tensor_type):
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap one float/double value in a tf.train.Feature (float_list)."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap one bool/int/enum/uint value in a tf.train.Feature (int64_list)."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

In [None]:
def serialize_example(img_id, img_data, target=None, is_train=True):
    """Serialize one sample as a tf.train.Example binary string.

    Args:
        img_id: Sample identifier, stored under 'img_id' as a bytes feature.
        img_data: Encoded image, stored under 'img_data' as a bytes feature.
        target: Label stored under 'target' as an int64 feature; only read
            when ``is_train`` is true.
        is_train: Whether to include the 'target' feature.

    Returns:
        The serialized Example, as produced by ``SerializeToString()``.
    """
    feature = {}
    feature['img_data'] = _bytes_feature(img_data)
    feature['img_id'] = _bytes_feature(img_id)
    if is_train:
        feature['target'] = _int64_feature(target)

    # Wrap the feature map in an Example message and emit its binary form.
    features_msg = tf.train.Features(feature=feature)
    return tf.train.Example(features=features_msg).SerializeToString()

In [None]:
# def create_tfrec(df, fol_name, tfrec_batch=1024, is_train=True):
#     os.makedirs(fol_name, exist_ok=True)
#     df_size = len(df)
#     tfrec_count = df_size//tfrec_batch + int(df_size%tfrec_batch!=0)
#     printer1 = display(display_id=True)
#     printer2 = display(display_id=True)
#     for f in tqdm(range(tfrec_count)):
#         printer1.update(f'Writing TFRecord file {f+1} of {tfrec_count}')
#         batch = min(tfrec_batch, df_size - f*tfrec_batch)
#         with tf.io.TFRecordWriter(path=f'{fol_name}/{f:02}-{batch}.tfrec') as writer:
#             for idx, k in enumerate(range(tfrec_batch*f, batch+f*tfrec_batch)):
#                 # serializing features
#                 serializer = serialize_example(
#                     img_id=df.loc[k,'id'].encode('utf-8'), 
#                     img_data=encode_img(BASE_PATH+df.loc[k,'path']), 
#                     target=None if not is_train else df.loc[k,'target'],
#                     is_train=is_train
#                 )
#                 writer.write(serializer)
#                 printer2.update(f'****Processed {idx + 1} of {batch}****')

def create_tfrec(df, fol_name, tfrec_batch=1024, is_train=True):
    """Write the rows of ``df`` into TFRecord files under ``fol_name``.

    Rows are taken in positional order and split into consecutive groups of
    at most ``tfrec_batch`` rows; each group becomes one file named
    ``<file number, 2 digits>-<rows in file>.tfrec``. Progress is reported
    through two updatable display handles and a tqdm bar.

    Args:
        df: DataFrame with 'id' and 'path' columns, plus 'target' when
            ``is_train`` is true. 'path' is appended to BASE_PATH before
            being passed to ``encode_img``.
        fol_name: Output directory; created (including parents) if missing.
        tfrec_batch: Maximum number of examples per TFRecord file.
        is_train: When true, each example also stores the row's 'target'.
    """
    # makedirs(exist_ok=True) copes with nested folders such as 'train/00'
    # and with a directory that already exists; a bare os.mkdir does neither.
    os.makedirs(fol_name, exist_ok=True)

    df_size = len(df)
    # Ceiling division: a final partial group still gets its own file.
    tfrec_count = df_size//tfrec_batch + int(df_size%tfrec_batch!=0)

    # Fetch the columns once and address rows by position, so the function
    # no longer requires df to carry a 0..n-1 index.
    ids = df['id']
    paths = df['path']
    targets = df['target'] if is_train else None

    printer1 = display(display_id=True)
    printer2 = display(display_id=True)
    for f in tqdm(range(tfrec_count)):
        printer1.update(f'Writing TFRecord file {f+1} of {tfrec_count}')
        start = f*tfrec_batch
        # The last file may hold fewer than tfrec_batch rows.
        batch = min(tfrec_batch, df_size - start)
        with tf.io.TFRecordWriter(path=f"{fol_name}/{f:02}-{batch}.tfrec") as writer:
            for idx, k in enumerate(range(start, start + batch)):
                # serializing features
                serializer = serialize_example(
                    img_id=ids.iloc[k].encode('utf-8'),
                    img_data=encode_img(BASE_PATH+paths.iloc[k]),
                    target=targets.iloc[k] if is_train else None,
                    is_train=is_train
                )
                writer.write(serializer)
                printer2.update(f'****Processed {idx + 1} of {batch}****')

In [None]:
# for i in range(len(dfLIST)):
#     dfTrain, dfTest = train_test_split(dfLIST[i], test_size=0.3)
#     dfTrain=dfTrain.reset_index(drop=True)
#     dfTest=dfTest.reset_index(drop=True)
#     create_tfrec(dfTrain,f'train/{i:02}', TRAIN_TFREC_BATCH)
#     create_tfrec(dfTest,f'val/{i:02}', TEST_TFREC_BATCH)
# create_tfrec(dfSubmit,'test',SUBMIT_TFREC_BATCH, False)

In [None]:
# Fixed seed so the train/validation partition is the same on every
# Restart & Run All; without it each run writes different TFRecords.
SPLIT_SEED = 42

# Hold out 10% of the labelled rows for validation.
dfTrain, dfVal = train_test_split(dfTrainTest, test_size=0.1, random_state=SPLIT_SEED)
# Drop the shuffled original labels so both frames are indexed 0..n-1.
dfTrain=dfTrain.reset_index(drop=True)
dfVal=dfVal.reset_index(drop=True)

In [None]:
# Training split -> ./train, at most 2048 examples per file, labels included.
create_tfrec(dfTrain, 'train', 2048, True)

In [None]:
# Validation split -> ./val, at most 1024 examples per file, labels included.
create_tfrec(dfVal, 'val', 1024, True)

In [None]:
# Submission rows -> ./test, at most 2048 examples per file, no 'target' feature.
create_tfrec(dfSubmit, 'test', 2048, False)

In [None]:
# List the .tfrec files one directory level down, with human-readable sizes.
! ls -lh ./*/*.tfrec

In [None]:
# ! rm ./*/*/*.tfrec