In [40]:
import os, sys
sys.path.append('../')

import outlines
import outlines.models as models
import outlines.text as text

import torch
import transformers

from pydantic import BaseModel, Field, constr, conlist
from enum import Enum

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
# Prompt template asking the LLM to distil a batch of example objects into
# broad, de-duplicated astronomical concepts grouped by observation category.
#
# `sum` is a newline-joined string of example objects/phenomena; it is
# substituted for {{sum}} below.
#
# The docstring IS the template that @outlines.prompt renders, so every
# character in it is sent to the model -- keep explanatory notes out here in
# comments, never inside the docstring.
# NOTE(review): the parameter name `sum` shadows the built-in; it is kept
# because the template placeholder and callers rely on it.
@outlines.prompt
def prompt_fn(sum):
    """Please produce a list of concepts characterizing prominent objects, phenomena, and science use cases of images observed by the Hubble Space Telescope.

Here are some examples of objects:

{{sum}}

Follow these instructions exactly in your answer:
- Do not output empty strings as elements.
- Make sure that the list covers a diverse range of astronomical concepts, with items as different from each other as possible. 
- Do not give specific names of objects, to make sure you span the widest possible range of concepts (e.g., "dwarf galaxy" is allowed, but NOT "Fornax", "Terzan 5", or  "NGC6440").
- Do not return terms undescriptive of observations, e.g. "sloshing", "adiabatic", "interactions". Return concrete physics objects, concepts, or phenomena.
- Only output scientifically meaningful terms that are descriptive of Hubble Space Telescope observations.
- Do not duplicate entries. Do not reference any telescopes, observatories, or surveys.
- Do not include units like "angular diameter distance", "parsec", or any other concepts that will not correlate with images of observations.
- Use the above example list of objects only as inspiration to infer broad classes of objects.
- Make sure each concept is succinct, never more than 5 words.
- Answer in JSON format.
- The JSON should have the following keys {"galaxies", "stellar_physics", "exoplanets_planet_formation", "stellar_populations", "supermassive_black_holes", "solar_system", "intergalactic_medium", "large_scale_structure"} reflecting rough observation categories.
- Each category will have a list of objects and/or astronomical phenomena.
- Output up to 20 items and no more in each category.
"""

In [96]:
import pandas as pd
# Per-proposal summaries; the 'objects_phenomena' column of this table is
# what gets fed to the concept-extraction prompt further down.
summaries_filename = "../data/summary_v2.csv"
summaries_df = pd.read_csv(summaries_filename)

In [97]:
from tqdm import tqdm
import json

n_examples = 100  # number of summary rows pasted into each prompt
n_tries = 5       # number of prompts, i.e. number of API calls

# One client is enough for every request, so build it once outside the loop.
# NOTE(review): `OpenAI` is not imported in any cell visible here; on a fresh
# kernel this needs `from openai import OpenAI` in the imports cell.
client = OpenAI()

sum1 = []         # flat list of every concept returned across all tries
prompt_list = []  # prompts actually sent, kept for inspection
for i_try in tqdm(range(n_tries)):

    # Non-overlapping window of examples for this try.
    batch = summaries_df['objects_phenomena'].values[i_try * n_examples:(i_try + 1) * n_examples]
    if len(batch) == 0:
        # Ran out of summaries: do not spend an API call on an empty example list.
        break

    prompt = prompt_fn('\n'.join(batch))
    prompt_list.append(prompt)

    response = client.chat.completions.create(
      model="gpt-4-1106-preview",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are an expert astrophysicist and a helpful assistant designed to output JSON."},
        {"role": "user", "content": prompt}
      ]
    )

    output = json.loads(response.choices[0].message.content)

    # Flatten {category: [concept, ...]} into sum1. Only list values and string
    # items are kept: blindly iterating a stray string value would split it into
    # single characters, and non-string items would break the later clean-up cells.
    for concepts in output.values():
        if not isinstance(concepts, list):
            continue
        sum1.extend(item for item in concepts if isinstance(item, str))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:12<00:00, 98.59s/it]


In [98]:
# Strip special characters

import string 

# ASCII punctuation to drop from the front of each concept; leading
# whitespace is detected separately with str.isspace().
special_chars = set(string.punctuation)

cleaned_sum1 = []
for s in sum1:
    # Position of the first character that is neither punctuation nor
    # whitespace. If every character is strippable (or s is empty) this is
    # len(s), so the slice below yields ''.
    start_idx = next(
        (pos for pos, ch in enumerate(s) if ch not in special_chars and not ch.isspace()),
        len(s),
    )

    # Only the leading run is removed; the rest of the string is untouched.
    cleaned_sum1.append(s[start_idx:])

cleaned_sum1;

In [113]:
# Remove duplicates and combine lists
#
# Every non-empty concept is lower-cased, and only the first occurrence of
# each lower-cased string is kept, in first-seen order.
# dict.fromkeys() preserves insertion order and de-duplicates in one pass,
# replacing the earlier approach of re-lowercasing and rescanning the whole
# result list for every item (quadratic in the number of concepts).
cleaned2_sum1 = list(dict.fromkeys(s.lower() for s in cleaned_sum1 if s != ''))

# cleaned2_sum1

In [114]:
# Number of unique, lower-cased concepts left after de-duplication.
len(cleaned2_sum1)

535

In [112]:
# Persist the de-duplicated concepts, one per line.
# - The loop variable is deliberately not called `string`: that name would
#   overwrite the `string` module imported by the clean-up cell.
# - The encoding is pinned so the file does not depend on the platform default.
with open("../data/sum1_gpt4.txt", "w", encoding="utf-8") as file:
    # Write each concept on its own line
    for concept in cleaned2_sum1:
        file.write(concept + "\n")

In [34]:
from utils.abstract_utils import read_abstracts_file

from tqdm.notebook import tqdm

filename = "../data/abstracts.cat"

# Load the abstracts catalogue in one chain:
#   1. drop rows whose Cycle is missing,
#   2. drop rows whose Cycle is an empty string,
#   3. cast Cycle and ID to int.
abstracts_df = (
    read_abstracts_file(filename)
    .dropna(subset=['Cycle'])
    .loc[lambda frame: frame['Cycle'] != '']
    .astype({'Cycle': int, 'ID': int})
)

In [35]:
# Prompt template asking the model which concept from a candidate list best
# describes one proposal abstract.
#
# `abs`  -- abstract text, substituted for {{abs}}.
# `cats` -- candidate concepts as a single string, substituted for {{cats}}.
#
# The docstring IS the template that @outlines.prompt renders, so notes belong
# in these comments, not in the docstring.
# NOTE(review): this redefines the earlier `prompt_fn(sum)`; after this cell
# runs, the concept-extraction cell can no longer be re-run without first
# re-running the first definition.
# NOTE(review): the label "calibration or instrumention" is misspelled but is
# kept verbatim here because it has to match the label given to choice().
@outlines.prompt
def prompt_fn(abs, cats):
    """<s>[INST] The following is a successful proposal abstract for the Hubble Space Telescope: "{{abs}}"

The following is a list of categories (astronomical concepts) that this abstract could correspond to.

{{cats}}

Please answer which of these listed concepts best describes this proposal, based on the objects and phenomena mentioned in the abstract.
The concept should meaningfully be present in the abstract and the eventual observation.

- For example, "The locations of supernovae {SNe} in the local stellar and gaseous environment in galaxies, as measured in high spatial resolution WFPC2 and ACS images, contain important clues to their progenitor stars." should return "supernova".
- If the abstract centers on calibration and/or instrumentation efforts, return "calibration or instrumention".

If no concept makes sense, return "None". [/INST]
"""

In [36]:
from outlines.generate import choice

In [37]:
# Build a generator with choice() from the model and the list of allowed
# answers: every cleaned concept plus "None" and the calibration label.
# NOTE(review): `model` is not defined in any cell visible here -- it must be
# created before this cell can run on a fresh kernel (presumably through
# outlines.models; confirm which model was used).
# NOTE(review): "calibration or instrumention" is misspelled, but the same
# string appears in the classification prompt and in the candidate list passed
# to it, so all occurrences must be changed together.
generator = choice(model, cleaned2_sum1 + ["None", "calibration or instrumention"])

In [38]:
# Spot-check sample: the abstract 77 rows from the end of the table.
abstract = abstracts_df['Abstract'].iloc[-77]
abstract

' We propose a comprehensive survey of CO column densities in diffuse molecular clouds extracted from archival spectra. The primary dataset involves STIS high resolution spectra, supplemented by high quality data obtained with GHRS. We will examine the 12C16O/13C16O ratio in 15 to 20 directions and the CO/H2 ratio in over 30 sight lines, thereby more than doubling the number of clouds with precisely determined column densities. The survey will provide the basis for the most thorough comparison between observations and theoretical models of CO photochemistry. Since CO is used as a diagnostic of the physical conditions in many astronomical environments, accurate models are essential. The comparison made with our survey will lead to more accurate models than those available today. Particular attention will be given to discerning the CO column where self shielding significantly reduces photodissociation. The trends in the CO/H2 ratio, especially for CO column densities much greater than th

In [39]:
# Render the classification prompt for the sample abstract. The candidate
# list shown to the model is every cleaned concept plus the calibration
# label, comma-separated.
prompt = prompt_fn(
    abstract,
    ', '.join([*cleaned2_sum1, "calibration or instrumention"]),
)

# Run the constrained generator on the prompt and display its answer.
result = generator(prompt)
result

'Magnetars'

In [41]:
# NOTE(review): this whole cell is commented out, yet it is the only place
# where `proposal_id_list` and `result_list` are assigned. The cell that
# builds the summary DataFrame needs both, so on a fresh kernel either restore
# this cell or load previously saved results before running that one.

# import pandas as pd

# data_folder = "../data/observations_v1/"

# def get_abstracts(data_folder, min_abstract=0, max_abstracts=99999999):
    
#     # Lists to store results
#     proposal_id_list = []
#     result_list = []

#     # Collect directories that contain .jpg files and match the "proposal_" pattern, excluding unwanted directories
#     directories_with_images = [os.path.join(r, d)
#                                for r, dirs, files in os.walk(data_folder)
#                                for d in dirs
#                                if d.startswith("proposal_") and not d.endswith('.ipynb_checkpoints')]

#     # Walk through data folder
#     for directory in tqdm(directories_with_images[min_abstract:max_abstracts]):
#         proposal_id = directory.split("proposal_")[-1]  # Extract proposal id from the directory name

#         # Extract abstract using the dataframe
#         abstract = abstracts_df[abstracts_df["ID"] == int(proposal_id)]["Abstract"].values[0]
#         prompt = prompt_fn(abstract, ', '.join(cleaned2_sum1 + ["calibration or instrumention"]))
#         result = generator(prompt)
#         print(f"{result}:{abstract}")
#         print("\n")

#         proposal_id_list.append(proposal_id)
#         result_list.append(result)

#     return proposal_id_list, result_list
        
# proposal_id_list, result_list = get_abstracts(data_folder)

In [42]:
# Create a DataFrame pairing each proposal id with the concept chosen for it,
# then save it as CSV without the index column.
# NOTE(review): `proposal_id_list` and `result_list` are not assigned by any
# live code visible in this notebook (only by the commented-out
# get_abstracts cell), so this cell raises NameError on a fresh kernel.
df = pd.DataFrame({
    'proposal_id': proposal_id_list,
    'objects_phenomena': result_list,
})

df.to_csv('../data/summary_sum1_v2.csv', index=False)