In [11]:
# Dependencies
import os
import pandas
import math
import time
import nltk
nltk.download('punkt')

import torch
from torch.utils.data import DataLoader, TensorDataset
device = torch.device('cpu')

from transformers import pipeline
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from huggingface_hub import hf_hub_download
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rudy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
# Candidate Hugging Face checkpoints tried in this notebook; later cells pick
# one of them by list index.
model_name = [
    "sambydlo/bart-large-scientific-lay-summarisation",
    "haining/scientific_abstract_simplification",
    "philschmid/bart-large-cnn-samsum",
]

In [4]:
# Load the tokenizer and the seq2seq model from the first candidate checkpoint,
# then move the model weights onto the configured torch device.
tokenizer = BartTokenizer.from_pretrained(model_name[0])
model = BartForConditionalGeneration.from_pretrained(model_name[0])
model = model.to(device)

In [5]:
# Abstract-style text for the sample paper. It is not referenced by the other
# cells shown here (presumably kept as a human-written summary to compare
# model output against -- confirm). Continuation lines are indented inside the
# triple-quoted literal, so that leading whitespace is part of the string.
reference = """We present numerical spectral and vertical structure calculations appropriate for 
            near-Eddington luminosity, radiation pressure dominated accretion disks
            around stellar mass black holes. We cover a wide range of black hole spins, and
            incorporate dissipation profiles based on first-principles three-dimensional MHD
            disk interior simulations. We also include non-zero stresses at the ISCO, which
            results in the disk effective temperature to increase rapidly towards the black
            hole, and give rise to rather extreme conditions with high temperatures and low
            surface densities. We found that local annuli spectra become increasingly characteristic 
            of saturated Comptonisation with decreasing distance to the black hole.
            While the spectra becomes harder with increasing black hole spin, they do not
            give rise to a broad power law tail even at maximum spin. We discuss the implications 
            of our results in the context of the steep power law (SPL) state and the
            associated high-frequency quasi-periodic oscillations (HFQPO) observed in some
            X-ray binary systems"""
# Introduction section of the sample paper. This is the input document that the
# later cells pass to word_tokenize, batch_input_text, generate_summary and
# model.generate. The literal ends with a blank line before the closing quotes,
# so the string value has trailing newlines.
introduction = """Galactic black hole X-ray binaries (BHB) show several states of outburst distinguished
by luminosity, spectral shape and variability (see for example, McClintock & Remillard
(2006) and Done, Gierlinski, & Kubota (2007)). In particular, at their highest luminosities
the spectra contains a steep power law component with photon index Γ > 2.4 (McClintock & Remillard
2006). These energetically significant power law tails begin at the spectral peak (≈ 10 keV)
and could extend into the MeV regime (Ling & Wheaton 2005; Grove et al. 1998). Moreover, this steep power 
law (SPL) spectral state is accompanied by high-frequency (ν > 50 Hz)
quasi-periodic oscillations (HFQPO) in the light curves when integrated over approximately
10 to 30 keV in photon energies.
Understanding the first-principles physics of radiating accretion flows that presumably
underly these observational properties remain an important outstanding problem in astrophysics. The standard thin accretion disk model (Shakura & Sunyaev 1973; Novikov & Thorne
1973; Riffert & Herold 1995) assumed that the stress and luminosity at the innermost stable
circular orbit (ISCO) drops to zero, and that at this point the material essentially simply disappears into the black hole. This assumption received significant recent theoretical scrutiny
upon the realization that magnetohydrodynamic turbulence (Balbus and Hawley 1991, 1998)
is probably the source of stress that drives accretion. In particular, Agol and Krolik (2000)
demonstrated that having non-zero magnetic stresses at the ISCO can cause the effective
temperature to rise sharply towards the black hole instead of fall to zero as predicted by
the standard model. Among the potentially observable consequences postulated by these
authors, the inner disk consequently becomes effectively thin, and extends the spectrum to
higher frequencies.
More recently, Dexter & Blaes (2014) (from here on referred to as DB14) proposed that
the Agol and Krolik (2000) model provides feasible mechanism for explaining both the steep
power law (SPL) state seen at near Eddington luminosities and the associated high frequency
quasi-periodic oscillations (HFQPO). These authors argued that the rapidly rising effective
temperature with decreasing distance to the black hole would give rise to the SPL spectra,
while also providing a natural filter for the HFQPOs that do not require the entire disk to
oscillate coherently.
In this work, we undertake a detailed numerical study of the structure and spectra of
near-Eddington accretion disks with non-zero magnetic stresses the ISCO, and particularly
focus on the effects of black hole spin. Unlike previous efforts that relied on one-zone models,
we self-consistently couple vertical structure to radiative transfer at each disk annuli, and
generate spectra that fully incorporates effects Comptonisation and metal opacities. Our
inputs are time and horizontally averaged vertical dissipation profiles from first-principles
stratified shearing-box simulations of accretion flows (Hirose, Krolik & Blaes 2009). These
calculations evolve the time-dependent three-dimensional radiation magneto-hydrodynamic
equations and accounts for the tidal vertical gravity from the black hole. In simulations over
a wide range of box-integrated radiation to gas pressure ratios, the resulting vertical spatial
dissipation profiles generally peak at around a pressure scale-height away from the disk
mid-plane, and should capture the effects of MRI turbulence. Moreover, these simulations
collectively indicate that the α-prescription (Shakura & Sunyaev 1973) relationship between
pressure and stress approximately hold (Hirose, Blaes & Krolik 2009). This means we are 
justified, at least in light of recent simulations, to use the α-model with modifications to
account for non-zero inner torque to generate radial profiles of total surface density Σ0 and
effective temperature Teff that are also necessary for our vertical structure and radiative
transfer computations.
This paper is organized as follows. In section 2 we outline our numerical methods, paying
particular attention to how we incorporated non-zero inner torque. Section 3 showcases our
numerical results, including full-disk spectra for all black hole spin values we covered. We
turn to the possibility of HFQPOs in section 4, and conclude in section 5 with a discussion
of on-going and future work.

"""


In [30]:
# Size of the introduction in NLTK word-level tokens, the same unit that
# batch_input_text uses for its batch_size.
# NOTE(review): this is not the model tokenizer's subword count, which is
# presumably what the model's input limit is measured in -- confirm.
len(nltk.word_tokenize(introduction))

744

In [39]:
def batch_input_text(input_text: str, batch_size: int=819) -> list:
    """
    Split a document into consecutive chunks of at most ``batch_size`` words.

    The text is tokenized with ``nltk.word_tokenize`` and every chunk is the
    space-joined run of its tokens. Original whitespace and line breaks are
    therefore not preserved, and punctuation comes back space-separated.

    :param input_text: str, research paper in full
    :param batch_size: int, maximum number of NLTK tokens per batch. The
        default was chosen as 80% of max input tokens.
        NOTE(review): NLTK word tokens are not the model's subword tokens, so
        a batch may still encode to more than ``batch_size`` model tokens --
        confirm the margin is sufficient.
    :return: list of str, batched input text in document order. Text that
        yields no tokens returns a single empty batch, ``[""]``.
    :raises ValueError: if ``batch_size`` is not positive
    """
    # A zero size used to surface as ZeroDivisionError and a negative size
    # silently produced an empty list; reject both explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    tokens = nltk.word_tokenize(input_text)
    if not tokens:
        return [""]

    # Stepping the start index by batch_size covers both the single-batch and
    # the multi-batch case without computing the batch count up front.
    return [" ".join(tokens[start:start + batch_size])
            for start in range(0, len(tokens), batch_size)]

In [40]:
def generate_summary(input_text: str) -> str:
    """
    Generate summary from Arxiv research article.

    Splits the article with ``batch_input_text`` and summarizes each batch
    with the notebook-level ``model`` and ``tokenizer`` on ``device``. Prints
    the list of per-batch summaries, then the elapsed wall-clock time.

    :param input_text: str, research paper in full
    :return: str, the per-batch summaries joined by single spaces
    :raises Exception: tokenization or generation errors propagate to the
        caller (they were previously swallowed); the elapsed time is still
        printed first.
    """
    # Start clock
    start_time = time.time()

    try:
        # Safety net: word-based batches can still encode to more subword
        # tokens than the model accepts, so cap the encoded length.
        max_input_tokens = model.config.max_position_embeddings

        # Check length of article and batch if necessary, then summarize batches
        sub_summaries = []
        for batch in batch_input_text(input_text):
            input_ids = tokenizer.encode(batch,
                                         return_tensors="pt",
                                         truncation=True,
                                         max_length=max_input_tokens).to(device)
            output = model.generate(input_ids=input_ids, max_length=350)
            # Drop <s>/</s>/<pad> markers so only the summary text remains.
            sub_summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
        print(sub_summaries)
    finally:
        # Elapsed time is reported whether or not generation succeeded.
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time}")

    return " ".join(sub_summaries)


In [48]:
def generate_summary(input_text: str, device: str='cpu') -> str:
    """
    Generate summary from Arxiv research article.

    Works in two passes: the article is split with ``batch_input_text`` and
    every batch is summarized, then the space-joined batch summaries are
    summarized once more to produce the final text.

    :param input_text: str, research paper in full
    :param device: str, torch device specifier the model and inputs are moved to
    :return: str, final summary with special tokens removed
    :raises Exception: model loading, tokenization or generation errors
        propagate to the caller. They were previously swallowed, which then
        failed at ``return summary`` with an unrelated UnboundLocalError.
    """
    # Start clock (the measured time includes loading the checkpoint)
    start_time = time.time()

    # Set model and torch method
    model_name = [
        "sambydlo/bart-large-scientific-lay-summarisation",
        # "haining/scientific_abstract_simplification",
        "philschmid/bart-large-cnn-samsum",
    ]
    checkpoint = model_name[1]
    device = torch.device(device)
    # NOTE(review): the checkpoint is reloaded on every call; consider loading
    # it once outside this function if it is called repeatedly.
    tokenizer = BartTokenizer.from_pretrained(checkpoint)
    model = BartForConditionalGeneration.from_pretrained(checkpoint).to(device)
    # Safety net: word-based batches (and the joined batch summaries) can
    # encode to more subword tokens than the model accepts.
    max_input_tokens = model.config.max_position_embeddings

    def _summarize(text: str) -> str:
        """Encode ``text``, generate up to 500 tokens and decode to plain text."""
        input_ids = tokenizer.encode(text,
                                     return_tensors="pt",
                                     truncation=True,
                                     max_length=max_input_tokens).to(device)
        output = model.generate(input_ids=input_ids, max_length=500)
        # skip_special_tokens replaces the manual "</s><s>..." string stripping.
        return tokenizer.decode(output[0], skip_special_tokens=True).strip()

    try:
        # Check length of article and batch if necessary, then summarize batches
        sub_summaries = [_summarize(batch) for batch in batch_input_text(input_text)]

        # Combine child summaries and obtain final summary
        summary = _summarize(" ".join(sub_summaries))
    finally:
        # Elapsed time is reported whether or not generation succeeded.
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time} seconds")

    return summary

# NOTE(review): batch_input_text is also defined in an earlier cell of this
# notebook; the definition executed last wins. Keep a single definition.
def batch_input_text(input_text: str, batch_size: int=819) -> list:
    """
    Split a document into consecutive chunks of at most ``batch_size`` words.

    The text is tokenized with ``nltk.word_tokenize`` and every chunk is the
    space-joined run of its tokens. Original whitespace and line breaks are
    therefore not preserved, and punctuation comes back space-separated.

    :param input_text: str, research paper in full
    :param batch_size: int, maximum number of NLTK tokens per batch. The
        default was chosen as 80% of max input tokens.
        NOTE(review): NLTK word tokens are not the model's subword tokens, so
        a batch may still encode to more than ``batch_size`` model tokens --
        confirm the margin is sufficient.
    :return: list of str, batched input text in document order. Text that
        yields no tokens returns a single empty batch, ``[""]``.
    :raises ValueError: if ``batch_size`` is not positive
    """
    # A zero size used to surface as ZeroDivisionError and a negative size
    # silently produced an empty list; reject both explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    tokens = nltk.word_tokenize(input_text)
    if not tokens:
        return [""]

    # Stepping the start index by batch_size covers both the single-batch and
    # the multi-batch case without computing the batch count up front.
    return [" ".join(tokens[start:start + batch_size])
            for start in range(0, len(tokens), batch_size)]

In [49]:
# Summarize the sample introduction with the default device ('cpu'); as the
# last expression in the cell, the returned value is the cell's output.
generate_summary(introduction)

In [35]:
%%time
# Split the introduction into batches of at most 200 NLTK word tokens.
batches = batch_input_text(introduction, batch_size=200)
# NOTE(review): this displays the character length of the raw text, not the
# number of batches; `len(batches)` may have been intended -- confirm.
len(introduction)
# Disabled inspection loop. NOTE(review): `len(i)` would count the characters
# of each batch string, not its tokens, despite the message wording.
# s = 0
# for i in batches:
#     s += 1
#     print(f"Batch {s} token count: {len(i)}")
#     print(i)

4
CPU times: total: 0 ns
Wall time: 5.99 ms


4407

In [None]:
# Baseline: summarize the whole introduction in a single generate() call with
# no batching, using whichever `model`, `tokenizer` and `device` are currently
# bound in the kernel.
output = model.generate(input_ids=tokenizer.encode(introduction, return_tensors='pt').to(device), max_length=350)
# Decoded without skipping special tokens, so markers such as <s> and </s>
# remain in the displayed text.
tokenizer.decode(output[0])

'</s><s>Galactic black hole X-ray binaries (BHBs ) show several states of outburst distinguished by luminosity, spectral shape and variability (see for example, McClintock & Remillard (2006) and Done, Gierlinski, & Kubota (2007)). In particular, at their highest luminosities the spectra contains a steep power law component with photon index Γ > 2.4 (McClintock and Remillard2006). These energetically significant power law tails begin at the spectral peak (≈ 10 keV) and could extend into the MeV regime (Ling & Wheaton 2005; Grove et al. 1998). Moreover, this steep power</s>'