In [2]:
import os, sys
sys.path.append('../')

import outlines
import outlines.models as models
import outlines.text as text

import torch
import transformers

from pydantic import BaseModel, Field, constr, conlist
from enum import Enum

from utils.summarize_utils import ConstrainedResponseHST, prompt_fn

%load_ext autoreload
%autoreload 2

In [None]:
# # model = models.awq("TheBloke/Mistral-7B-OpenOrca-AWQ")
# model = models.awq("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ")

In [None]:
from transformers import BitsAndBytesConfig

# model_name = "mosaicml/mpt-7b-8k-instruct"
# model_name = "teknium/OpenHermes-2.5-Mistral-7B"
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Model configuration for `model_name`. Only arguments that
# `AutoConfig.from_pretrained` understands are passed (a stray `asd=True`
# keyword was removed).
config = transformers.AutoConfig.from_pretrained(
    model_name, trust_remote_code=True,
)

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# `quantization_config` already requests 4-bit loading. Passing
# `load_in_4bit=True` next to it is redundant and is rejected with a ValueError
# by recent transformers releases, so it is intentionally not repeated here.
# NOTE(review): `cache_dir` is an absolute, cluster-specific path — adjust it
# when running elsewhere.
model = models.transformers(
    model_name=model_name,
    model_kwargs={
        "config": config,
        "quantization_config": bnb_config,
        "trust_remote_code": True,
        "device_map": "auto",
        "cache_dir": "/n/holystore01/LABS/iaifi_lab/Users/smsharma/hf_cache/"
    },
)

In [None]:
# abstract = """
# We propose to observe ultraluminous X-ray sources which are located near
# objects bright both in the X-rays and the optical using Chandra and HST. The
# presence of these reference objects will allow us to tie the x-ray and optical
# references frames and achieve 0.1-0.2 arcsecond relative position accuracy in
# searching for optical counterparts to the ultraluminous x-ray sources. This
# will be a significant improvement over the accuracy previously obtained for
# most ULXs {limited by Chandra's absolute astrometry} and will should permit
# identification of individual counterparts.
# """

# abstract = """
# We propose a comprehensive optical, UV, and X-ray investigation of the unique
# galaxy POX 52. POX 52 is a Seyfert 1 galaxy with unprecedented properties: its
# host galaxy appears to be a dwarf elliptical, and its stellar velocity
# dispersion is only 36 km/s. The stellar velocity dispersion and the broad
# emission-line widths both suggest a black hole mass of order 10^5 solar
# masses, placing POX 52 in a region of AGN parameter space that is almost
# completely unexplored at present. We request ACS/HRC imaging to perform a
# definitive measurement of the host galaxy structure; STIS UV and optical
# spectroscopy to study the nonstellar continuum and the structure of the
# broad-line region; and Chandra ACS imaging to detect the X-ray emission from
# the nucleus and investigate its spectral and variability properties. The
# results of this program will give a detailed understanding of the host galaxy
# and accretion properties of one of the very few known black holes in the mass
# range around 10^5 solar masses.
# """

# Example proposal abstract used as the input of the single-prompt test cell
# below. The text is passed to the model as-is.
abstract = """
The observed optical depths to microlensing of stars in the Galactic bulge are
difficult to reconcile with our present understanding of Galactic dynamics.
The main source of uncertainty in those comparisons is now shifting from
microlensing measurements to the dynamical models of the Galactic bar. We
propose to constrain the Galactic bar models with proper motion observations
of Bulge stars that underwent microlensing by determining both the kinematic
identity of the microlensed sources and the importance of streaming motions.
The lensed stars are typically farther than randomly selected stars.
Therefore, our proper motion determinations for 36 targeted MACHO events will
provide valuable constraints on the dynamics of bulge stars as a function of
distance. The first epoch data for our proposed events is already available in
the HST archive so the project can be completed within a single HST cycle. The
exceptional spatial resolution of HST is essential for completion of the
project. Constraints on te total mass in the bulge will ultimately lead to
the determination of the amount of dark matter in inner Galaxy.
"""

In [35]:
# Prompt template for the summarization request. With `@outlines.prompt` the
# docstring below IS the template text that gets rendered and sent to the
# model, and `{{abstract}}` is the placeholder filled from the `abstract`
# argument — so the docstring is runtime content, not documentation.
# NOTE(review): this definition replaces the `prompt_fn` imported from
# utils.summarize_utils in the first cell — confirm which one is intended.
# NOTE(review): the template contains typos ("relevant ti", "Your will",
# "Do not used", "a a dictionary"); they are left untouched here because
# changing them changes what the model receives.
@outlines.prompt
def prompt_fn(abstract):
     """[INST]
You are an expert astrophysicist, with broad expertise across observational and theoretical astrophysics.
You are able to extract and summarize core information from text relevant ti astrophysics.

Abstract: "{{abstract}}"

The above is an abstract for a proposed observation taken by the Hubble Space Telescope.

Your will summarize the nature of the eventual observation taken by the Telescope, mentioning the astrophysical objects imaged and possible downstream science use cases and applications described in the abstract.

Follow these instructions exactly:
- You will never mention the Hubble Space Telescope, HST, or the HST archive.
- You will only write text and numbers.
- You will not return empty values.
- You will only write in English.
- You will mention the class (e.g., barred spiral galaxy) and not just the specific instance (e.g., Andromeda).
- You will name every single astrophysical object, phenomenon, and possible science use case mentioned in the abstract.
- You will be descriptive in your answers and list multiple items when appropriate. Do not used dashed or numbered lists, only comma-separated.
- The total length of summarized text should not exceed 80 words.

Your response should be JSON formatted. The JSON should be a a dictionary with keys "objects_and_phenomena" and "science_use_cases".

[/INST]
"""

In [36]:
# Render the prompt template for the example abstract.
prompt = prompt_fn(abstract)
# JSON generator tied to the ConstrainedResponseHST schema; the cell output
# shows that calling it yields a ConstrainedResponseHST instance.
generator = outlines.generate.json(model, ConstrainedResponseHST)
sequence = generator(prompt)
# Bare last expression so the notebook displays the result.
sequence

ConstrainedResponseHST(objects_and_phenomena='Galactic bulge, stars, microlensing events, proper motion of stars, Galactic bar', science_use_cases='Constraining Galactic bar models, determining kinematic identity of microlensed sources, evaluating significance of streaming motions, measuring distances of lensed stars, determining total mass in the bulge, estimating dark matter amount in inner Galaxy')

In [37]:
from utils.abstract_utils import read_abstracts_file

from tqdm.notebook import tqdm

filename = "../data/abstracts.cat"

# Load the abstracts catalog and clean it in a single chain. Every step returns
# a new frame, so no column is ever assigned on a filtered slice (which is what
# triggers pandas' SettingWithCopyWarning) and re-running the cell is idempotent.
abstracts_df = (
    read_abstracts_file(filename)
    # Drop rows with missing Cycle (NaN or empty string)
    .dropna(subset=['Cycle'])
    .loc[lambda frame: frame['Cycle'] != '']
    # Convert Cycle and ID to int
    .astype({'Cycle': int, 'ID': int})
)

In [39]:
import pandas as pd

# Root folder that get_abstracts() walks looking for "proposal_*" sub-directories.
data_folder = "../data/observations_v1/"

def get_abstracts(data_folder, min_abstract=12, max_abstracts=23):
    """Summarize the abstracts belonging to a slice of the proposal directories.

    Walks ``data_folder`` for sub-directories named ``proposal_<id>``, takes the
    slice ``[min_abstract:max_abstracts]`` of the directories found, looks up
    each proposal's abstract in the notebook-level ``abstracts_df`` and asks the
    notebook-level ``model`` for a ``ConstrainedResponseHST`` summary of it.

    Args:
        data_folder: Root directory, searched recursively with ``os.walk``.
        min_abstract: Start index (inclusive) into the list of directories found.
        max_abstracts: Stop index (exclusive) into the list of directories found.

    Returns:
        A tuple ``(proposal_id_list, objects_list, science_list)`` of three
        equally long lists: the proposal ids (as strings taken from the
        directory names), the ``objects_and_phenomena`` texts and the
        ``science_use_cases`` texts. Proposals without an entry in
        ``abstracts_df`` are reported and left out.

    Raises:
        ValueError: If the text after ``proposal_`` in a directory name is not
            an integer.
    """
    # Lists to store results
    proposal_id_list = []
    objects_list = []
    science_list = []

    # Collect directories matching the "proposal_" pattern, excluding notebook
    # checkpoint directories. Directory contents are not inspected.
    # NOTE(review): the order is whatever os.walk yields, so the slice below can
    # select different proposals on different filesystems.
    proposal_dirs = [
        os.path.join(root, name)
        for root, dirs, _files in os.walk(data_folder)
        for name in dirs
        if name.startswith("proposal_") and not name.endswith('.ipynb_checkpoints')
    ]
    selected_dirs = proposal_dirs[min_abstract:max_abstracts]

    # Nothing to summarize: skip building the generator altogether.
    if not selected_dirs:
        return proposal_id_list, objects_list, science_list

    # The generator depends only on the model and the schema, so build it once
    # instead of once per abstract.
    generator = outlines.generate.json(model, ConstrainedResponseHST)

    for directory in tqdm(selected_dirs):
        proposal_id = directory.split("proposal_")[-1]  # Extract proposal id from the directory name

        # Extract abstract using the dataframe
        matches = abstracts_df.loc[abstracts_df["ID"] == int(proposal_id), "Abstract"]
        if matches.empty:
            # A proposal missing from the catalog should not abort the whole
            # (slow) run and throw away the results gathered so far.
            print(f"No abstract found for proposal {proposal_id}; skipping.")
            continue

        prompt = prompt_fn(matches.iloc[0])
        result = generator(prompt)
        print(result)
        print("\n")

        proposal_id_list.append(proposal_id)
        science_list.append(result.science_use_cases)
        objects_list.append(result.objects_and_phenomena)

    return proposal_id_list, objects_list, science_list
        
# Run the summarization over the default slice of proposal directories.
proposal_id_list, objects_list, science_list = get_abstracts(data_folder)

# Gather the three parallel result lists into one table, one row per proposal.
df = pd.DataFrame(
    data={
        "proposal_id": proposal_id_list,
        "objects_phenomena": objects_list,
        "science_use_cases": science_list,
    }
)

# df.to_csv('../data/summary_v1.csv', index=False)

  0%|          | 0/11 [00:00<?, ?it/s]

objects_and_phenomena="six 'host-less' short Gamma-Ray Bursts (GRBs), underlying hosts of short GRBs, faint underlying low-luminosity galaxies, high redshift galaxies, galaxies at large separations of tens to hundreds of kpc, neutron stars and/or black holes" science_use_cases='constraints on the true spatial distribution of short GRBs, fraction of highly-kicked mergers which occur far from their hosts, population synthesis models of compact object mergers, distributions of kick velocities and delay times'


objects_and_phenomena='radio galaxies, blue and ultraviolet excess, stellar systems, active galactic nuclei (AGN), radio axis' science_use_cases='study of star-formation in distant galaxies, origin of AGN activity, modelling the stellar systems inside these galaxies'


objects_and_phenomena='protocluster containing 4 galaxy groups, distinct galaxies, gravitationally bound systems, individual galaxies, absorption line galaxies, Coma cluster, early-type galaxies, group environment, s