In [1]:
import os, sys
sys.path.append('../')

import outlines
import outlines.models as models
import outlines.text as text

import torch
import transformers

from pydantic import BaseModel, Field, constr, conlist
from enum import Enum

%load_ext autoreload
%autoreload 2

In [2]:
# Prompt used to ask the chat model for a categorized JSON list of concepts.
# The string below is the prompt text itself (it is what gets sent as the user
# message), not documentation -- editing it changes model behaviour.
# `sum` is the newline-joined block of example objects substituted at {{sum}}.
# NOTE(review): `sum` shadows the Python builtin; left as-is because the
#   template placeholder and callers use this name.
# NOTE(review): "a list of around concepts" looks like it is missing a count.
# NOTE(review): "succint" and "integalactic_medium" look like typos; the JSON
#   keys are discarded by the consumer, only the values are kept.
# NOTE(review): a later cell defines another `prompt_fn` with a different
#   signature, which replaces this one once it has been run.
@outlines.prompt
def prompt_fn(sum):
    """Please produce a list of around concepts characterizing prominent objects, phenomena, and science use cases of images observed by the Hubble Space Telescope.

Here are some examples of objects:

{{sum}}

Follow these instructions exactly in your answer:
- Do not output empty strings as elements.
- Make sure that the list covers a diverse range of astronomical concepts, with items as different from each other as possible. 
- Do not give specific names of objects, to make sure you span the widest possible range of concepts (e.g., "dwarf galaxy" is allowed, but NOT "Fornax", "Terzan 5", or  "NGC6440").
- Do not return terms undescriptive of observations, e.g. "sloshing", "adiabatic", "interactions". Returning concrete physics objects, concepts, or phenomena.
- Only output scientifically meaningful terms that are descriptive of Hubble Space Telescope observations.
- Do not duplicate entries. Do not reference any telescopes, observatories, or surveys.
- Do not include units like "angular diameter distance", "parsec", or any other concepts that will not correlate with images of observations.
- Use the above example list of objects only as inspiration to infer broad classes of objects.
- Make sure each concept is succint, never more than 5 words.
- Answer in JSON format.
- The JSON should have the following keys {"galaxies", "stellar_physics", "exoplanets_planet_formation", "stellar_populations", "supermassive_black_holes", "solar_system", "integalactic_medium", "large_scale_structure"} reflecting rough observation categories.
- Each category will have a list of objects and/or astronomical phenomena.
- Output up to 20 items and no more in each category.
"""

In [96]:
import pandas as pd

# CSV whose `objects_phenomena` column supplies the example objects for the prompt.
summaries_filename = "../data/summary_v2.csv"
summaries_df = pd.read_csv(filepath_or_buffer=summaries_filename)

In [97]:
from tqdm import tqdm
import json

# `OpenAI` is used below but was never imported in the notebook, so this cell
# raised NameError on a fresh kernel.
from openai import OpenAI

n_examples = 100  # example rows placed into each prompt
n_tries = 5       # number of requests; request i uses rows [i*n_examples, (i+1)*n_examples)

# The client does not depend on the loop variable, so build it once.
client = OpenAI()

sum1 = []         # flat list of every concept returned, across all categories and requests
prompt_list = []  # rendered prompts, kept for inspection
for i_try in tqdm(range(n_tries)):

    # Each request sees a different, non-overlapping window of examples.
    example_rows = summaries_df['objects_phenomena'].values[i_try * n_examples:(i_try + 1) * n_examples]
    prompt = prompt_fn('\n'.join(example_rows))
    prompt_list.append(prompt)

    response = client.chat.completions.create(
      model="gpt-4-1106-preview",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are an expert astrophysicist and a helpful assistant designed to output JSON."},
        {"role": "user", "content": prompt}
      ]
    )

    output = json.loads(response.choices[0].message.content)

    # Only the per-category lists matter; the category keys are discarded.
    for concepts in output.values():
        # A category returned as a bare string would otherwise be flattened
        # into individual characters, so accept lists only.
        if not isinstance(concepts, list):
            continue
        # Downstream cleaning indexes into each item as a string.
        sum1 += [item for item in concepts if isinstance(item, str)]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [08:12<00:00, 98.59s/it]


In [98]:
# Drop leading punctuation and whitespace from every concept
# (characters after the first kept one are left untouched).

import string

special_chars = set(string.punctuation)


def _first_kept_index(text):
    """Return the index of the first character that is neither punctuation
    nor whitespace, or len(text) when there is no such character."""
    for position, char in enumerate(text):
        if not (char in special_chars or char.isspace()):
            return position
    return len(text)


cleaned_sum1 = [concept[_first_kept_index(concept):] for concept in sum1]

cleaned_sum1;

In [113]:
# Remove duplicates (case-insensitively), drop empty strings, lowercase everything

# dict.fromkeys keeps first-seen order with O(1) membership checks. The previous
# version re-scanned and re-lowercased the whole output list for every item,
# even though its entries were already lowercase.
cleaned2_sum1 = list(
    dict.fromkeys(s.lower() for s in cleaned_sum1 if s != '')
)

In [114]:
# Number of concepts left after de-duplication
len(cleaned2_sum1)

535

In [112]:
# Save the de-duplicated concepts, one per line.
with open("../data/sum1_gpt4.txt", "w") as file:
    # The loop variable must not be called `string`: that would rebind the
    # `string` module imported by the cleaning cell and break
    # `string.punctuation` for any later use in the same kernel.
    for concept in cleaned2_sum1:
        file.write(concept + "\n")

In [3]:
# Reload the saved concepts (one per line, surrounding whitespace removed)
cleaned2_sum1 = []
with open("../data/sum1_gpt4.txt", "r") as file:
    cleaned2_sum1.extend(line.strip() for line in file)

In [5]:
# Number of concepts read back from ../data/sum1_gpt4.txt
len(cleaned2_sum1)

535

In [10]:
from transformers import BitsAndBytesConfig

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Where downloaded weights are cached. Defaults to the original cluster path;
# set the HF_CACHE_DIR environment variable to run somewhere else.
hf_cache_dir = os.environ.get(
    "HF_CACHE_DIR",
    "/n/holystore01/LABS/iaifi_lab/Users/smsharma/hf_cache/",
)

config = transformers.AutoConfig.from_pretrained(
    model_name, trust_remote_code=True,
)

# 4-bit NF4 quantization with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = models.transformers(
    model_name=model_name,
    # NOTE(review): both `device` and "device_map" are supplied; which one
    # takes effect depends on the outlines loader -- confirm.
    device="cuda:0",
    model_kwargs={
        "config": config,
        # 4-bit loading is requested through `bnb_config` only. Passing
        # `load_in_4bit` next to `quantization_config` is redundant, and
        # recent transformers releases reject the combination with ValueError.
        "quantization_config": bnb_config,
        "trust_remote_code": True,
        "device_map": "auto",
        "cache_dir": hf_cache_dir,
    },
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [11]:
from utils.abstract_utils import read_abstracts_file

# Rebinds `tqdm` to the notebook variant for the cells that follow.
from tqdm.notebook import tqdm

filename = "../data/abstracts.cat"

abstracts_raw = read_abstracts_file(filename)

# Built as a single chain so that every step yields a new frame. Assigning
# columns on the boolean-filtered frame (as before) triggers pandas'
# SettingWithCopyWarning.
abstracts_df = (
    abstracts_raw
    .dropna(subset=['Cycle'])                    # drop rows with missing Cycle
    .loc[lambda frame: frame['Cycle'] != '']     # drop rows with empty Cycle
    .astype({'Cycle': int, 'ID': int})           # convert Cycle and ID to int
)

In [13]:
from outlines.generate import choice

In [14]:
# Allowed outputs: every concept plus the two fallback labels.
choice_labels = cleaned2_sum1 + ["None", "calibration or instrumention"]
generator = choice(model, choice_labels)

In [130]:
# Classification prompt: given one abstract (`abs`) and a comma-separated list
# of candidate concepts (`cats`), ask the model to name the best-matching one.
# The string below is the prompt text itself, not documentation -- editing it
# changes model behaviour. {{abs}} appears twice on purpose ("Here is the
# abstract again").
# NOTE(review): this redefines `prompt_fn` from the earlier concept-generation
#   cell with a different signature; re-running that earlier loop after this
#   cell would call this version instead.
# NOTE(review): `abs` shadows the Python builtin; kept because the template
#   placeholder uses this name.
# NOTE(review): "the returned one have greatest affinity" reads as a typo, and
#   "instrumention" is misspelled -- but the same spelling is used for the
#   generator's choice label, so the two must be changed together if at all.
@outlines.prompt
def prompt_fn(abs, cats):
    """<s>[INST] The following is a successful observational proposal abstract for the Hubble Space Telescope.
    
Abstract: {{abs}}

The following is a list of categories (astronomical concepts) that this abstract could correspond to.

Categories: {{cats}}

Please answer and return which of the listed concepts best describes the content of this proposal, based on the objects and phenomena mentioned in the abstract.

Here is the abstract again: "{{abs}}"

Follow these instructions exactly:
- The returned concept should be meaningfully present in the abstract.
- Of all the listed concepts, the returned one have greatest affinity with the abstract.
- If the abstract centers calibration and/or instrumentation efforts, return "calibration or instrumention".
- Only if no listed concept meaningfully corresponds to the proposal, return "None".[/INST]
"""

In [133]:
# Pick one abstract by position from the end as a spot check (-77 is another one to try)
example_position = -191
abstract = abstracts_df['Abstract'].iloc[example_position]
abstract

' Essentially all well-characterized preplanetary nebulae {PPNe}-- objects in transition between the AGB and planetary nebula evolutionary phases - are bipolar, whereas the mass-loss envelopes of AGB stars are strikingly spherical. In order to understand the processes leading to bipolar mass-ejection, we need to know at what stage of stellar evolution does bipolarity in the mass-loss first manifest itself. We have recently hypothesized that most OH/IR stars {evolved mass- losing stars with OH maser emission} are very young PPNe. We are conducting a multiwavelength survey program of imaging and spectroscopic observations of such objects, using a large, morphologically unbiased sample selected using IRAS 12-to-25 micron colors. Our ongoing HST/SNAP imaging survey of the optically bright half of this sample with WFPC2 and ACS is highly succesful: 19/32 objects observed are extended with bipolar/multipolar shapes {remaining objects are unresolved}. Slightly more than 50% of our sample are 

In [134]:
# Classify the single example abstract; "None" is a valid answer but is not listed as a category.
listed_categories = cleaned2_sum1 + ["calibration or instrumention"]
prompt = prompt_fn(abstract, ', '.join(listed_categories))
result = generator(prompt)
result

'stellar evolutionary track'

In [135]:
import pandas as pd

data_folder = "../data/observations_v1/"

def get_abstracts(data_folder, min_abstract=0, max_abstracts=99999999):
    """Pick one concept for every ``proposal_<id>`` directory under ``data_folder``.

    For each matching directory, the abstract whose ``ID`` equals ``<id>`` is
    looked up in the notebook-level ``abstracts_df``, rendered into a prompt
    with ``prompt_fn``, and passed to the notebook-level ``generator``.

    Parameters
    ----------
    data_folder : str
        Root directory, searched recursively.
    min_abstract, max_abstracts : int
        Slice bounds applied to the sorted list of proposal directories, so a
        run can be restricted to (or resumed from) a stable sub-range.

    Returns
    -------
    tuple of (list, list)
        Proposal ids (strings, as they appear in the directory names) and the
        generator's result for each, in matching order. Proposals that cannot
        be matched to an abstract are reported and left out of both lists.
    """
    proposal_id_list = []
    result_list = []

    # Directories named "proposal_*" (checkpoint directories excluded).
    # Sorted because os.walk order is filesystem-dependent, which would make
    # the min_abstract/max_abstracts slice select different proposals per run.
    proposal_dirs = sorted(
        os.path.join(root, d)
        for root, dirs, _files in os.walk(data_folder)
        for d in dirs
        if d.startswith("proposal_") and not d.endswith('.ipynb_checkpoints')
    )

    # Identical for every abstract, so build it once.
    categories = ', '.join(cleaned2_sum1 + ["calibration or instrumention"])

    for directory in tqdm(proposal_dirs[min_abstract:max_abstracts]):
        proposal_id = directory.split("proposal_")[-1]  # text after the last "proposal_"

        # One bad directory name or missing abstract should not abort a run
        # that has already spent time on model calls.
        try:
            numeric_id = int(proposal_id)
        except ValueError:
            print(f"Skipping {directory}: proposal id is not an integer")
            continue

        matches = abstracts_df.loc[abstracts_df["ID"] == numeric_id, "Abstract"]
        if matches.empty:
            print(f"Skipping proposal {proposal_id}: no abstract found")
            continue

        abstract = matches.values[0]
        prompt = prompt_fn(abstract, categories)
        result = generator(prompt)
        print(f"{result}:{abstract}")
        print("\n")

        proposal_id_list.append(proposal_id)
        result_list.append(result)

    return proposal_id_list, result_list

proposal_id_list, result_list = get_abstracts(data_folder)

  0%|          | 0/4438 [00:00<?, ?it/s]

superluminous supernovae: The superluminous supernova (SLSN) SN 2015bn is one of the three nearest objects of its kind to date, and is now the best studied. However, despite a wealth of observations within ~1 year of peak light, the explosion mechanism remains elusive. Here we propose to distinguish between the popular scenarios of magnetar spin-down, circumstellar interaction and pair-instability explosion, by obtaining the latest and deepest ever images of a SLSN at around 2 years after explosion. At this phase, competing models predict robust colour differences, which we can detect in 3 orbits of HST+ACS observations (with 3 more orbits in the following cycle required for host galaxy template subtraction, also leading to a detailed host analysis). In the decade or so since the first SLSNe were discovered, no event has had the combination of late-time luminosity, proximity and a sufficiently faint host to be able to follow to such a late phase. We stress that these observations are n

In [136]:
# Pair each proposal id with its assigned concept and write the table to disk
summary_columns = dict(
    proposal_id=proposal_id_list,
    objects_phenomena=result_list,
)
df = pd.DataFrame(summary_columns)

df.to_csv('../data/summary_sum1_v3.csv', index=False)