In [18]:
import openai
from tqdm.auto import tqdm
import time
import json
openai.api_key = "YOUR_API_KEY_HERE"

## Get the text

In [82]:
paper_title = "Loss of epigenetic information as a cause of mammalian aging"
data = open("epigenome.txt").read()
len(data)

183187

## Chunk the text

In [104]:
chunks = []
current_chunk = ""

max_tokens = 2048
response_avg_size = 500
max_characters = max_tokens * 4 - response_avg_size


sentences = [x for x in data.split(".") if len(x) > 10]

# doing some chunkings
for i, sentence in enumerate(sentences):
    is_sentence_big = len(sentence) > max_characters
    if is_sentence_big:
        chunks += [current_chunk, sentence]
        current_chunk = ""
        continue

    if len(current_chunk) > max_characters:
        chunks += [current_chunk]
        current_chunk = ""
    current_chunk += sentence + "."
if current_chunk:
    chunks += [current_chunk]

print(f"Number of tokens for each chunk: ")    
print([len(chunk) / 4 for chunk in chunks], max([len(chunk) / 4 for chunk in chunks]))

Number of tokens for each chunk: 
[1934.5, 1966.0, 1969.75, 1954.0, 1986.0, 1977.75, 1952.0, 1941.5, 1925.5, 1926.0, 1934.75, 1935.5, 1954.0, 1923.25, 1946.0, 1949.75, 1926.0, 1936.0, 1983.75, 1933.25, 1924.0, 976.0] 1986.0


## Get the first 3 chunks only  (Demo purpose)

## GPT-3 To extract important items

In [105]:
def get_prompt(text):
    prompt = f"""Here is an excerpt from the paper '{paper_title}':\n{text}
Give me the bullet item of specific terms that I should research in order to understand this paper:
- """
    return prompt

def get_response_gpt(prompt):
    is_done = False
    while not is_done:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt,
                temperature=0,
                max_tokens=512,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
            is_done = True
        except Exception as e:
            print(f"Sleeping for 5 seconds because of {e}")
            time.sleep(5)
    return response["choices"][0]["text"]

In [106]:
%%time
responses = []
for chunk in tqdm(chunks):
    prompt = get_prompt(chunk)
    response = get_response_gpt(prompt)
    print(response)
    responses.append(response)

  0%|          | 0/22 [00:00<?, ?it/s]

 DNA breaks
- Chromatin-modifying proteins
- Epigenetic landscape
- Cellular exdifferentiation
- Senescence
- DNA methylation clock
- Information theory of aging
- Double-stranded DNA breaks (DSBs)
- Silent information regulator complex (Sir2/3/4)
- rDNA
- Sterility
 DNA methylation
- Histone modifications
- DNA sequence
- DNA damage response
- Reduced representation bisulfite sequencing (RRBS)
- Serine-139-phosphorylated H2AX (gH2AX)
- Senescence-associated b-galactosidase (SA-b-Gal)
- I-PpoI
- Tamoxifen (TAM)
- Estrogen receptor domain gene (ERT2)
- LoxP-STOP-loxP cassette
- Cre recombinase gene (Cre-ERT2)
- Ubiquitin promoter
- Topoisomerase II inhibitor
- Phleomycin
- Ligation-mediated PCR (LM-PCR)
- Interleukin-6 (IL-6)
- Ccl2
- Ccl20
- Long interspersed nuclear element-1 (LINE-1)
- Intracisternal A-particle (IAP)
 Epigenetic Age
- Whole-Genome Sequencing (WGS)
- DNA Double-Strand Breaks (DSBs)
- Senescence-Associated Beta-Galactosidase (SA-b-Gal)
- Mouse Frailty Index (FI)
- Epit

## Filter redundant concepts
We might have extracted the same infos multiple times between multiple chunks, let's remove the doublons:

- V1 (current) remove duplicate (identical lowercase)
- V2: use fuzzywuzzy for scoring similar things higher than th and pick the bigger one accross the set 
- V3: use sentence embeddings and remove things with similarity at a similar th


In [107]:

concepts = []
for response in responses:
    concepts.extend([x.replace("-", "").strip() for x in response.split("\n")])
concepts_uniques = list(set([x.lower() for x in concepts]))
len(concepts_uniques)

426

In [108]:
concepts_uniques

['reduced representation bisulfite sequencing (rrbs)',
 'forskolin',
 'isx9',
 'h3k56ac',
 'contextual fear conditioning',
 'h3k36me2',
 'werner’s syndrome',
 'chipenrich',
 'sirt1',
 'myosin light chain 4',
 'nextflex rapid rna sequencing kit',
 'genomewide methylation profiles',
 'topologically associated domains (tads)',
 'trishcl te saturated phenol',
 'mitochondrial superoxide dismutase',
 'juicer tool',
 'parp1',
 'ligationmediated pcr',
 'lamin b1 depletion',
 'transcription factors',
 'transposable elements',
 'ice mouse embryonic fibroblasts',
 'ippoi',
 'quantification of dsbs',
 'elastic net algorithm',
 'etoposide',
 'endothelial nad(+)h2s signaling network',
 'yamanaka factors',
 'dovetail hichip mnase kit',
 'podocyte density',
 'wnt signaling',
 'treefam',
 'lifespan',
 'deeptools2',
 'western blot analysis',
 'h3k4 trimethylation complex',
 'mediator',
 'atp',
 'ccl2',
 'cooler cload',
 'quantitative realtime pcr',
 'epigenome roadmap',
 'dna doublestrand break repair',

## GPT-3 to get the definitions for each terms

In [109]:

def get_prompt_anki(text):
    prompt = f"""Write the content of an Anki card (front/back)for this concept: `{text}`:"""
    return prompt

ankis = {}
for i, concept in tqdm(enumerate(concepts_uniques)):
    response = get_response_gpt(get_prompt_anki(concept)).strip()
    # ugly    
    print(response)
    if "\n\n" in response:
        front, back = response.split("\n\n")
    else: # only "\n"
        front, back = response.split("\n")
    ankis[front] = back
    
    
%%time



0it [00:00, ?it/s]

Front: What is reduced representation bisulfite sequencing (rrbs)?

Back: Reduced representation bisulfite sequencing (rrbs) is a method of DNA methylation analysis that uses a reduced representation of the genome to identify methylation patterns. It is a cost-effective and efficient way to analyze the methylation status of CpG sites in a genome. It is used to study epigenetic changes in a variety of organisms, including humans.
Front: What is forskolin?
Back: Forskolin is a natural compound found in the root of the Indian coleus plant. It has been used for centuries in traditional Ayurvedic medicine to treat a variety of ailments. It is now being studied for its potential to treat a variety of conditions, including asthma, glaucoma, and obesity. It has also been studied for its potential to increase testosterone levels and improve heart health.
Front: What is ISX9?
Back: ISX9 is a type of gene that is involved in the development of the nervous system. It is a transcription factor that

## Generate ankis visually

In [110]:
import genanki

In [111]:
model_id = 1607392319 # random number generated
deck_id = 3807392343

anki_model = genanki.Model(
  model_id,
  'Simple Model',
  fields=[
    {'name': 'Question'},
    {'name': 'Answer'},
  ],
  templates=[
    {
      'name': 'Card 1',
      'qfmt': '{{Question}}',
      'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
    },
  ])

anki_deck = genanki.Deck(deck_id=deck_id, name=paper_title)

In [112]:
for anki in ankis:
    fields = [anki, ankis[anki]]
    new_note = genanki.Note(anki_model, fields, )
    anki_deck.add_note(new_note)

In [113]:
from slugify import slugify
slugify(paper_title)

'loss-of-epigenetic-information-as-a-cause-of-mammalian-aging'

In [1]:
genanki.Package(anki_deck).write_to_file(f'{slugify(paper_title)}.apkg')

NameError: name 'genanki' is not defined

In [116]:
len(ankis)

424