In [1]:
import pandas as pd
import sys
sys.path.append('../')
from scripts.abstract_utils import read_abstracts_file

%load_ext autoreload
%autoreload 2

## Explore data

## Load abstract

In [2]:
filename = "../data/abstracts.cat"

abstracts_df = read_abstracts_file(filename)

# Drop rows with missing Cycle
abstracts_df = abstracts_df.dropna(subset=['Cycle'])
abstracts_df = abstracts_df[abstracts_df['Cycle'] != '']

# Convert Cycle and ID to int
abstracts_df['Cycle'] = abstracts_df['Cycle'].astype(int)
abstracts_df['ID'] = abstracts_df['ID'].astype(int)

In [3]:
abstracts_df

Unnamed: 0,Prop. Type,Category,ID,Cycle,Title,PI,Abstract
0,ENG/STIS/PAR,,10000,12,STIS Pure Parallel Imaging Program: Cycle 12,Paul Goudfrooij,This is the default archival pure parallel pr...
1,GO,GALAXIES,10001,12,Locating Ultraluminous X-Ray Sources,Philip Kaaret,We propose to observe ultraluminous X-ray sou...
2,GO,AGN,10002,12,Detailed Study of X-ray Jets from a Complete S...,Eric Perlman,We propose deep followup HST and Chandra obse...
3,GO,GALAXIES,10003,12,Deep Chandra and Hubble Observations of NGC469...,Craig Sarazin,We propose 4 new Chandra observations of NGC4...
4,GO,AGN,10004,12,The Physics of Relativistic Jets: Chandra Imag...,F. Tavecchio,Extended jets have been a key target for Chan...
...,...,...,...,...,...,...,...
13129,CAL/NIC,,9995,12,Photometric Stability,Mark Dickinson,This NICMOS calibration proposal carries out ...
13130,CAL/NIC,,9996,12,Flats Stability,Alfred Schultz,This calibration proposal is the Cycle 12 NIC...
13131,CAL/NIC,,9997,12,Photometric Recalibration,Mark Dickinson,This proposal extends the NICMOS photometric ...
13132,CAL/NIC,,9998,12,NICMOS Cycle 12 Grism Calibration and Standard...,Ralph Bohlin,This is the grism calibration proposal.


In [4]:
# Only keep specific Cycles

cycle_min = 0
cycle_max = 32

abstracts_cycle_df = abstracts_df[(abstracts_df['Cycle'] >= cycle_min) & (abstracts_df['Cycle'] <= cycle_max)]

In [5]:
# abstract_ids = abstracts_cycle_df['ID'].values
# abstracts_cycle_df['Cycle'].value_counts()

In [6]:
# import os

# def remove_large_files(directory, size_limit=2*1024*1024):  # default size_limit is set to 2MB
#     for foldername, subfolders, filenames in os.walk(directory):
#         for filename in filenames:
#             filepath = os.path.join(foldername, filename)
#             if os.path.getsize(filepath) > size_limit:
#                 try:
#                     os.remove(filepath)
#                     print(f"Removed {filepath}")
#                 except Exception as e:
#                     print(f"Error removing {filepath}: {e}")

# directory_path = '../data/observations_v2/'
# remove_large_files(directory_path)

## Data loader

In [7]:
# abstracts_cycle_df[abstracts_cycle_df["ID"] == int(9998)]["Category"].values[0]

In [8]:
abstracts_cycle_df

Unnamed: 0,Prop. Type,Category,ID,Cycle,Title,PI,Abstract
0,ENG/STIS/PAR,,10000,12,STIS Pure Parallel Imaging Program: Cycle 12,Paul Goudfrooij,This is the default archival pure parallel pr...
1,GO,GALAXIES,10001,12,Locating Ultraluminous X-Ray Sources,Philip Kaaret,We propose to observe ultraluminous X-ray sou...
2,GO,AGN,10002,12,Detailed Study of X-ray Jets from a Complete S...,Eric Perlman,We propose deep followup HST and Chandra obse...
3,GO,GALAXIES,10003,12,Deep Chandra and Hubble Observations of NGC469...,Craig Sarazin,We propose 4 new Chandra observations of NGC4...
4,GO,AGN,10004,12,The Physics of Relativistic Jets: Chandra Imag...,F. Tavecchio,Extended jets have been a key target for Chan...
...,...,...,...,...,...,...,...
13129,CAL/NIC,,9995,12,Photometric Stability,Mark Dickinson,This NICMOS calibration proposal carries out ...
13130,CAL/NIC,,9996,12,Flats Stability,Alfred Schultz,This calibration proposal is the Cycle 12 NIC...
13131,CAL/NIC,,9997,12,Photometric Recalibration,Mark Dickinson,This proposal extends the NICMOS photometric ...
13132,CAL/NIC,,9998,12,NICMOS Cycle 12 Grism Calibration and Standard...,Ralph Bohlin,This is the grism calibration proposal.


In [10]:
import os
import tensorflow as tf
import pandas as pd
from PIL import Image
import numpy as np
from tqdm.notebook import tqdm
import random

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_example(abstract, image):

    height, width = image.shape[:2]

    # Encode abstract to bytes
    abstract_bytes = abstract.encode('utf-8')  

    # Convert image to bytes
    image_bytes = image.tobytes()

    feature = {
        'abstract': _bytes_feature(abstract_bytes),
        'image': _bytes_feature(image_bytes),
        'image_height': _int64_feature(height),
        'image_width': _int64_feature(width)
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def write_tfrecord(abstracts, images, filename, metadata_file):
    with tf.io.TFRecordWriter(filename) as writer:
        for abstract, image in zip(abstracts, images):
            tf_example = serialize_example(abstract, image)
            writer.write(tf_example)
    
    # Write metadata to the auxiliary file
    with open(metadata_file, 'a') as meta_file:
        meta_file.write(f"{filename}: {len(images)} images\n")

def get_abstracts_and_images_and_write_tfrecords(data_folder, tfrecords_folder, abstracts_cycle_df, num_tfrecords, num_train_tfrecords):
    
    # Lists to store results
    images_list = []
    abstracts_list = []

    # Collect directories that contain .jpg files and match the "proposal_" pattern, excluding unwanted directories
    directories_with_images = [os.path.join(r, d)
                               for r, dirs, files in os.walk(data_folder)
                               for d in dirs
                               if d.startswith("proposal_") and not d.endswith('.ipynb_checkpoints')]

    # Shuffle the list of directories
    random.shuffle(directories_with_images)

    # Get the total number of jpg files to be processed for the progress bar
    total_files = sum([len(files) for r, d, files in os.walk(data_folder) if any(file.endswith('.jpg') for file in files)])

    # Calculate the chunk size
    chunk_size = total_files // num_tfrecords if total_files >= num_tfrecords else 1

    print(f"Chunk size is {chunk_size}")

    # Initialize the progress bar
    pbar = tqdm(total=total_files, desc='Processing Images')

    # Initialize the file number
    file_num = 1
    metadata_file = f'{tfrecords_folder}/metadata.txt'
    
    # Clear the metadata file if it exists
    if os.path.exists(metadata_file):
        os.remove(metadata_file)

    # Create tfrecords folder if it doesn't exist
    os.makedirs(tfrecords_folder, exist_ok=True)

    # Walk through data folder
    for directory in directories_with_images:
        for file in os.listdir(directory):
            if file.endswith(".jpg"):
                
                image_path = os.path.join(directory, file)
                proposal_id = directory.split("proposal_")[-1]  # Extract proposal id from the directory name
                
                # Extract abstract using the dataframe
                abstract = abstracts_cycle_df[abstracts_cycle_df["ID"] == int(proposal_id)]["Abstract"].values[0]
                category = abstracts_cycle_df[abstracts_cycle_df["ID"] == int(proposal_id)]["Category"].values[0]

                if category is not None:
                    abstract = f"Category: {category}. {abstract}"
                    
                image = Image.open(image_path).convert("RGB")
                image = np.array(image)

                # Pad image to square
                h, w, c = image.shape
                max_dim = max(h, w)
                padded_image = np.ones((max_dim, max_dim, c), dtype=np.uint8) * 255

                # Calculate top and left padding
                y_offset = (max_dim - h) // 2
                x_offset = (max_dim - w) // 2

                padded_image[y_offset : y_offset + h, x_offset : x_offset + w, :] = image

                images_list.append(padded_image)
                abstracts_list.append(abstract)
                pbar.update(1)  # Update the progress bar

                # If the length of the lists reaches the chunk size, write to a TFRecord file
                if len(images_list) >= chunk_size:
                    print(f"Writing record {file_num}")

                    if file_num < num_train_tfrecords:
                        filename = f"{tfrecords_folder}/observations_train_{file_num}.tfrecord"
                    else:
                        filename = f"{tfrecords_folder}/observations_val_{file_num}.tfrecord"
                        
                    write_tfrecord(abstracts_list, images_list, filename, metadata_file)
                    
                    # Reset the images and abstracts lists
                    images_list = []
                    abstracts_list = []
                    
                    file_num += 1  # Increment the file number

    # Write the remaining records to a TFRecord file if any
    if images_list:
        print(f"Writing final record {num_tfrecords}")
        filename = f"{tfrecords_folder}/observations_val_{file_num}.tfrecord"
        write_tfrecord(abstracts_list, images_list, filename, metadata_file)

    pbar.close()  # Close the progress bar

tfrecords_folder = "../data/tfrecords_v3/"
data_folder = "../data/observations_v1/"
num_tfrecords = 10
num_train_tfrecords = 9

get_abstracts_and_images_and_write_tfrecords(data_folder, tfrecords_folder, abstracts_cycle_df, num_tfrecords, num_train_tfrecords)

Chunk size is 3185


Processing Images:   0%|          | 0/31859 [00:00<?, ?it/s]

Writing record 1
Writing record 2
Writing record 3
Writing record 4
Writing record 5
Writing record 6
Writing record 7
Writing record 8
Writing record 9
Writing record 10
Writing final record 10
