In [6]:
import pandas as pd

%load_ext autoreload
%autoreload 2

In [7]:
def read_abstracts_file(filename):
    abstracts = []
    abstract = {}
    inside_abstract = False  # Flag to check if we're inside an abstract
    
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            
            if line.startswith('-----'):
                if abstract:  # If abstract has content, append to list
                    abstracts.append(abstract)
                    abstract = {}
                    inside_abstract = False  # Reset the flag
            else:
                # Check for known properties
                property_starts = ['Prop. Type:', 'Category:', 'ID:', 'Cycle:', 'Title:', 'PI:']
                
                if any(line.startswith(prop) for prop in property_starts) and not inside_abstract:
                    if 'Prop. Type' in abstract and line.startswith('Prop. Type:'):
                        # If a new abstract starts without delimiter, assume previous one ended
                        abstracts.append(abstract)
                        abstract = {}
                    
                    if line.startswith('Prop. Type:'):
                        abstract['Prop. Type'] = line.split(':', 1)[1].strip()
                    elif line.startswith('Category:'):
                        abstract['Category'] = line.split(':', 1)[1].strip()
                    elif line.startswith('ID:'):
                        id_val = line.split(':', 1)[1].strip()
                        try:
                            abstract['ID'] = int(id_val)
                        except ValueError:
                            abstract['ID'] = id_val
                    elif line.startswith('Cycle:'):
                        cycle_val = line.split(':', 1)[1].strip()
                        try:
                            abstract['Cycle'] = int(cycle_val)
                        except ValueError:
                            abstract['Cycle'] = cycle_val
                    elif line.startswith('Title:'):
                        abstract['Title'] = line.split(':', 1)[1].strip()
                    elif line.startswith('PI:'):
                        abstract['PI'] = line.split(':', 1)[1].strip()
                else:
                    # If none of the known properties are found, we treat the line as part of the abstract
                    abstract['Abstract'] = abstract.get('Abstract', '') + ' ' + line
                    inside_abstract = True  # Set the flag indicating we're inside an abstract
                
    # After loop ends, check if there's any remaining content in the abstract dictionary
    if abstract:
        abstracts.append(abstract)

    df = pd.DataFrame(abstracts)
    
    return df                

filename = "../data/abstracts.cat"
abstracts_df = read_abstracts_file(filename)

In [8]:
abstracts_df[abstracts_df['ID'] == 13200]

Unnamed: 0,Prop. Type,Category,ID,Cycle,Title,PI,Abstract
3139,CAL/WFC3,,13200.0,30,WFC3 SS Activation Test,John MacKenty,Part of side switch activities. This progra...


In [9]:
abstracts_df = abstracts_df.dropna(subset=['Cycle'])
abstracts_df = abstracts_df[abstracts_df['Cycle'] != '']

abstracts_df['Cycle'] = abstracts_df['Cycle'].astype(int)
abstracts_df['ID'] = abstracts_df['ID'].astype(int)
abstracts_cycle_df = abstracts_df[(abstracts_df['Cycle'] >= 25) & (abstracts_df['Cycle'] <= 31)]

In [10]:
abstracts_cycle_df['Cycle'].value_counts()

25    533
30    413
29    411
28    404
27    363
31    261
26    222
Name: Cycle, dtype: int64

In [11]:
abstract_ids = abstracts_cycle_df['ID'].values

In [12]:
# import sys
# sys.path.append("../")

# from tqdm import tqdm
# from scripts.download_data import download_data

# proposal_id = 15922
# n_max_images = 10
# max_resolution = 512
# seed = 42


# for proposal_id in tqdm(abstract_ids):
#     download_data(proposal_id, n_max_images, max_resolution, seed, data_dir='../data/observations/')

In [13]:
# import os

# def remove_large_files(directory, size_limit=2*1024*1024):  # default size_limit is set to 2MB
#     for foldername, subfolders, filenames in os.walk(directory):
#         for filename in filenames:
#             filepath = os.path.join(foldername, filename)
#             if os.path.getsize(filepath) > size_limit:
#                 try:
#                     os.remove(filepath)
#                     print(f"Removed {filepath}")
#                 except Exception as e:
#                     print(f"Error removing {filepath}: {e}")

# directory_path = '../data/observations/'
# remove_large_files(directory_path)

## Data loader

In [14]:
data_folder = "../data/observations/"

In [15]:
import os
import pandas as pd
from PIL import Image
import numpy as np

def get_abstracts_and_images(data_folder, abstracts_cycle_df):
    # Lists to store results
    images_list = []
    abstracts_list = []

    # Walk through data folder
    for root, dirs, files in os.walk(data_folder):
        for file in files:
            if file.endswith(".jpg"):
                image_path = os.path.join(root, file)
                proposal_id = root.split("proposal_")[-1]  # Extract proposal id from the directory name
                
                # Extract abstract using the dataframe
                abstract = abstracts_cycle_df[abstracts_cycle_df["ID"] == int(proposal_id)]["Abstract"].values[0]

                image = Image.open(image_path).convert("RGB")
                image = np.array(image)

                # Pad image to square
                h, w, c = image.shape
                max_dim = max(h, w)
                padded_image = np.ones((max_dim, max_dim, c), dtype=np.uint8) * 255

                # Calculate top and left padding
                y_offset = (max_dim - h) // 2
                x_offset = (max_dim - w) // 2

                padded_image[y_offset : y_offset + h, x_offset : x_offset + w, :] = image

                images_list.append(padded_image)
                abstracts_list.append(abstract)

    return np.array(abstracts_list), np.array(images_list)

data_folder = "../data/observations/"
abstracts, images = get_abstracts_and_images(data_folder, abstracts_cycle_df)

In [16]:
import tensorflow as tf

def serialize_example(abstract, image):

    height, width = image.shape[:2]

    # Encode abstract to bytes
    abstract_bytes = abstract.encode('utf-8')  

    # Convert image to bytes
    image_bytes = image.tobytes()

    feature = {
    'abstract': _bytes_feature(abstract_bytes),
    'image': _bytes_feature(image_bytes),
    'image_height': _int64_feature(height),
    'image_width': _int64_feature(width)
    }

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

with tf.io.TFRecordWriter('../data/observations.tfrecord') as writer:
    for abstract, image in zip(abstracts, images):
        example = serialize_example(abstract, image)
        writer.write(example)

In [17]:
from models.dataset_utils import make_dataloader, create_input_iter

files = ['../data/observations.tfrecord']
ds = make_dataloader(files, batch_size=32, seed=42)
create_input_iter(ds)

image, caption = next(iter(ds))

import matplotlib.pyplot as plt
plt.imshow(image[0], vmin=0, vmax=1)

ModuleNotFoundError: No module named 'models'

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
caption = caption.numpy().tolist()
caption = [c.decode('utf-8') for c in caption]

tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=300,
            return_tensors="np",
        )

{'input_ids': array([[49406,  1746, 13690, ...,   537, 36128, 49407],
       [49406,  1746, 13690, ...,   537, 36128, 49407],
       [49406,  1746, 13690, ...,   537, 36128, 49407],
       ...,
       [49406, 14171,  5357, ..., 49407, 49407, 49407],
       [49406, 14171,  5357, ..., 49407, 49407, 49407],
       [49406, 14171,  5357, ..., 49407, 49407, 49407]]), 'attention_mask': array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}