In [3]:
import sys
import json
from openai import OpenAI
from dotenv import load_dotenv
import Graph 
import re
import os


def _parse_relations(message):
    """Extract ``(entity_a, entity_b, relationship)`` triples from a model reply.

    Only numbered list items (``"1. ..."``) whose text is wrapped in
    parentheses are considered, and an item is kept only when it splits into
    exactly three comma-separated fields. Anything else (entity list entries,
    truncated items, items with extra commas) is ignored.

    Args:
        message: Raw text returned by the chat model.

    Returns:
        A list of 3-tuples of stripped strings, in order of appearance.
    """
    relations = []
    for item in re.findall(r'\d+\.\s+(.*)', message):
        # `.` stops only at "\n", so a CRLF reply (or trailing spaces) leaves
        # "\r"/padding on the capture; without stripping, endswith(')') would
        # fail and every relation would be dropped silently.
        item = item.strip()
        if not (item.startswith('(') and item.endswith(')')):
            continue
        # Split on comma + whitespace (not a bare comma) so names that embed a
        # comma without a following space stay in one field.
        fields = tuple(field.strip() for field in re.split(r',\s+', item[1:-1]))
        if len(fields) == 3:
            relations.append(fields)
    return relations


def get_graph_from_abstract(abstract, max_attempts=5):
    """Ask the chat model for entity relations in an abstract and build a graph.

    The model reply is printed, parsed for ``(Entity A, Entity B,
    Relationship)`` items, and each triple is added to a ``Graph.Graph`` as two
    vertices joined by an annotated edge. The graph is then displayed and its
    ``adjacency_list`` is returned as a JSON string.

    Any exception raised during an attempt (API call, parsing, graph
    construction) is printed and the whole attempt is retried.

    Args:
        abstract: Text of the abstract; appended verbatim to the prompt.
        max_attempts: Maximum number of attempts before giving up (default 5).

    Returns:
        ``json.dumps(graph.adjacency_list)`` on the first successful attempt,
        or ``None`` if every attempt failed.
    """
    load_dotenv()
    client = OpenAI(
        api_key=os.environ.get('mykey')
    )

    for attempt in range(1, max_attempts + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-4-0125-preview",
                messages=[
                    {"role": "user", "content": "identify a list of entities in this science abstract and their relations, storing the list of entities as an array of three element tuple of strings and the relations as an array of tuples in the format of (Entity A, Entity B, Relationship):" + abstract},
                ],
            )

            message = completion.choices[0].message.content
            print(message)

            graph = Graph.Graph()
            for entity_a, entity_b, annotation in _parse_relations(message):
                graph.add_vertex(entity_a)
                graph.add_vertex(entity_b)
                graph.add_edge(entity_a, entity_b, annotation)

            graph.display()  # This will print the graph representation

            return json.dumps(graph.adjacency_list)

        except Exception as e:
            # Best-effort retry: report the failure and make a fresh attempt.
            print(f"Attempt {attempt} failed with error: {e}")

    # Make the give-up path explicit instead of silently falling off the end.
    print(f"Giving up after {max_attempts} failed attempts")
    return None


In [4]:
# Sample input: the abstract text that a later cell passes to get_graph_from_abstract.
abstract = "Neurodegenerative diseases are characterized by the formation and propagation of protein aggregates, especially amyloid fibrils. However, what normally suppresses protein misfolding and aggregation in metazoan cells remains incompletely understood. Here, we show that TRIM11, a member of the metazoan tripartite motif (TRIM) family, both prevents the formation of protein aggregates and dissolves pre-existing protein deposits, including amyloid fibrils. These molecular chaperone and disaggregase activities are ATP independent. They enhance folding and solubility of normal proteins and cooperate with TRIM11 SUMO ligase activity to degrade aberrant proteins. TRIM11 abrogates α-synuclein fibrillization and restores viability in cell models of Parkinson's disease (PD). Intracranial adeno-associated viral delivery of TRIM11 mitigates α-synuclein-mediated pathology, neurodegeneration, and motor impairments in a PD mouse model. Other TRIMs can also function as ATP-independent molecular chaperones and disaggregases. Thus, we define TRIMs as a potent and multifunctional protein quality-control system in metazoa, which might be applied to treat neurodegenerative diseases."



In [7]:
# Build the relation graph for the sample abstract. The call prints the raw
# model reply as a side effect; `al` receives the function's return value
# (the JSON-encoded adjacency list).
al = get_graph_from_abstract(abstract)

### Entities List
1. Neurodegenerative diseases
2. Protein aggregates
3. Amyloid fibrils
4. Metazoan cells
5. TRIM11
6. TRIM family
7. Molecular chaperone activities
8. Disaggregase activities
9. ATP (Adenosine triphosphate)
10. Folding and solubility
11. Normal proteins
12. TRIM11 SUMO ligase activity
13. Aberrant proteins
14. α-synuclein fibrillization
15. Parkinson's disease (PD)
16. Intracranial adeno-associated viral delivery
17. Pathology
18. Neurodegeneration
19. Motor impairments
20. PD mouse model
21. Other TRIMs

### Relations
1. (Neurodegenerative diseases, Protein aggregates, characterized by)
2. (Protein aggregates, Amyloid fibrils, especially)
3. (TRIM11, Protein aggregates, prevents formation of)
4. (TRIM11, Protein deposits, dissolves pre-existing)
5. (Protein deposits, Amyloid fibrils, including)
6. (Molecular chaperone activities, ATP, independent)
7. (Disaggregase activities, ATP, independent)
8. (Molecular chaperone activities, Folding and solubility, enhance)
9. (T

In [6]:
# Display the value stored in `al`.
# NOTE(review): this cell's execution count (In [6]) is lower than that of the
# cell assigning `al` (In [7]), so the output shown here may come from an
# earlier run — restart the kernel and run all cells to confirm.
al

'{"Neurodegenerative diseases": [["Protein aggregates", "characterized by"]], "Protein aggregates": [["Neurodegenerative diseases", "characterized by"], ["Amyloid fibrils", "especially includes"], ["TRIM11", "prevents formation of"]], "Amyloid fibrils": [["Protein aggregates", "especially includes"], ["TRIM11", "dissolves including"]], "Protein misfolding": [["Metazoan cells", "suppression in"]], "Metazoan cells": [["Protein misfolding", "suppression in"]], "TRIM11": [["Protein aggregates", "prevents formation of"], ["Protein deposits", "dissolves"], ["Amyloid fibrils", "dissolves including"], ["\\u03b1-synuclein fibrillization", "abrogates"], ["Cell models of PD", "restores viability in"]], "Protein deposits": [["TRIM11", "dissolves"]], "Molecular chaperone activities": [["ATP", "independent of"]], "ATP": [["Molecular chaperone activities", "independent of"], ["Disaggregase activities", "independent of"]], "Disaggregase activities": [["ATP", "independent of"]], "TRIM11 SUMO ligase act

In [1]:
import os
import re
import json
import ast
from openai import OpenAI
from dotenv import load_dotenv

def _locate_list_literal(message, name):
    """Return the index of the ``[`` in ``<name> = [``, or None if no closed list follows."""
    opening = re.search(re.escape(name) + r' = \[', message)
    if opening is None:
        return None
    start = opening.end() - 1
    return start if message.find(']', start) != -1 else None


def _parse_list_literal(message, start):
    """Parse the Python list literal that begins at ``message[start]``.

    The candidate is extended to each successive ``]`` until it evaluates, so
    a ``]`` inside a string element (e.g. a citation such as ``[9]``) or a
    nested list does not cut the literal short.

    Raises:
        ValueError: if no candidate ending in ``]`` is a valid literal.
    """
    last_error = None
    close = message.find(']', start)
    while close != -1:
        try:
            return ast.literal_eval(message[start:close + 1])
        except (ValueError, TypeError, SyntaxError) as exc:
            last_error = exc
        close = message.find(']', close + 1)
    raise ValueError(f"could not parse list literal: {last_error}")


def process_sequential_protocol(protocol):
    """Ask the chat model for the key steps and reagents/objects of a protocol.

    The reply is expected to contain two Python list assignments,
    ``steps = [...]`` and ``reagents_objects = [...]``; both are located in the
    reply text and evaluated with ``ast.literal_eval``.

    Args:
        protocol: Text of the experimental procedure; appended verbatim to the
            prompt.

    Returns:
        A JSON string. On success it has the keys ``"steps"`` and
        ``"reagents/objects"``. Otherwise it has a single ``"error"`` key,
        either because one of the two assignments was not found or because
        the extracted text could not be parsed or serialised.
    """
    load_dotenv()

    client = OpenAI(
        api_key=os.environ.get('mykey')
    )

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "user", "content": "Identify the key steps and reagents/objects used in this biological experiment procedure, and generate two python arrays that store respectively strings describing the key steps and another python array that stores the reagents/objects " + protocol},
        ],
    )

    # Treat a missing reply body as empty text so it is reported through the
    # normal "not found" error path instead of crashing inside `re`.
    message = completion.choices[0].message.content or ""

    steps_start = _locate_list_literal(message, "steps")
    reagents_start = _locate_list_literal(message, "reagents_objects")
    if steps_start is None or reagents_start is None:
        return json.dumps({"error": "No steps or reagents/objects found in the response"})

    try:
        steps_array = _parse_list_literal(message, steps_start)
        reagents_objects_array = _parse_list_literal(message, reagents_start)
        return json.dumps({"steps": steps_array, "reagents/objects": reagents_objects_array})
    except (ValueError, TypeError) as e:
        # ValueError: malformed literal. TypeError: the literal parsed but
        # holds values json cannot serialise (e.g. a set or bytes).
        return json.dumps({"error": f"Error processing the extracted data: {str(e)}"})

# Sample protocol text; the JSON string returned by process_sequential_protocol
# for it is printed below.
protocol = "We will expose wild-type astrocytes and ASH1L-depleted astrocytes to PBS (control), LPS, and Poly(I:C) in vitro. We will then use RT-qPCR to quantify the expression of IL6 and TNF, two pro-inflammatory cytokine encoding genes upregulated by astrocytes upon activation, in all samples [9]."
print(process_sequential_protocol(protocol))


{"steps": ["Expose wild-type astrocytes to PBS", "Expose wild-type astrocytes to LPS", "Expose wild-type astrocytes to Poly(I:C)", "Expose ASH1L-depleted astrocytes to PBS", "Expose ASH1L-depleted astrocytes to LPS", "Expose ASH1L-depleted astrocytes to Poly(I:C)", "Use RT-qPCR to quantify the expression of IL6 and TNF in all samples"], "reagents/objects": ["Wild-type astrocytes", "ASH1L-depleted astrocytes", "PBS (Phosphate Buffered Saline)", "LPS (Lipopolysaccharide)", "Poly(I:C)", "RT-qPCR reagents", "IL6 and TNF gene primers"]}
