In [1]:
!pip install rdflib



In [6]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/76.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.55.1
    Uninstalling openai-1.55.1:
      Successfully uninstalled openai-1.55.1
Successfully installed openai-0.28.0


In [14]:
def generate_cancer_genomic_schema(sample_text, output_format):
  """
  Generates a cancer genomic schema with evidence and annotations.

  Args:
    sample_text: A sample text from the cancer genomic domain.
    output_format: The desired output format ("turtle" or "n3").

  Returns:
    A string representing the schema in the specified format.
  """

  prompt = f"""
  Generate a schema for a cancer genomic knowledge graph.
  Consider the following sample text:

  {sample_text}

  The schema should include:
  * Classes for genes, cancer types, mutations, and scientific publications.
  * Properties to link genes to cancer types, mutations to genes, and publications to genes.
  * Properties for PubMed IDs, BioPortal annotations, and Tumor Portal annotations.
  * Data types for properties.

  Output the schema in {output_format} format.
  """

  response = openai.Completion.create(
    engine="gpt-3.5-turbo-instruct",  # Updated to a supported engine
    prompt=prompt,
    max_tokens=1000,  # Adjust as needed
    n=1,
    stop=None,
    temperature=0.7,  # Adjust for creativity
  )

  schema = response.choices[0].text.strip()
  return schema

In [15]:
import openai
from rdflib import Graph, Literal, Namespace, URIRef

# Set your OpenAI API key
openai.api_key = ""

# ... (rest of the generate_cancer_genomic_schema function remains the same) ...

# Example usage
cancer_genomic_text = """
Mutations in the EGFR gene are commonly found in lung cancer patients.
A study published in PubMed (PMID: 12345678) found that EGFR mutations are associated with increased sensitivity to tyrosine kinase inhibitors.
BioPortal provides annotations for EGFR (http://bioportal.org/ontologies/EGFR) and lung cancer (http://bioportal.org/ontologies/LUNG-CANCER).
Tumor Portal provides information on EGFR mutations in various cancer types (http://tumorportal.org/genes/EGFR).
"""

# Generate schema in Turtle format
cancer_genomic_schema_turtle = generate_cancer_genomic_schema(cancer_genomic_text, "turtle")

# Parse the schema using RDFLib
g = Graph()
g.parse(data=cancer_genomic_schema_turtle, format="turtle")

# Define namespace
cg = Namespace("http://example.org/cancer_genomic#")

# Add triples based on example sentences
# g.add((cg.EGFR, cg.associatedWith, cg.LungCancer))
# g.add((cg.EGFR, cg.hasMutation, cg.EGFR_Mutation))

# 9 more example sentences and their corresponding triples
sentences = [
    "The KRAS gene is frequently mutated in pancreatic cancer.",
    "A study in PubMed (PMID: 98765432) showed that KRAS mutations are associated with poor prognosis in pancreatic cancer.",
    "BioPortal has annotations for KRAS (http://bioportal.org/ontologies/KRAS) and pancreatic cancer (http://bioportal.org/ontologies/PANCREATIC-CANCER).",
    "Tumor Portal provides data on KRAS mutations in various cancers (http://tumorportal.org/genes/KRAS).",
    "The TP53 gene is the most frequently mutated gene across all cancer types.",
    "TP53 mutations are linked to a variety of cancers, including breast cancer, colorectal cancer, and lung cancer.",
    "PubMed has numerous articles on TP53 mutations (e.g., PMID: 11122233).",
    "BioPortal provides comprehensive annotations for TP53 (http://bioportal.org/ontologies/TP53).",
    "Tumor Portal offers detailed information on TP53 mutations in different cancers (http://tumorportal.org/genes/TP53)."
]

for sentence in sentences:
    # This is a simplified example, you would need more sophisticated NLP techniques
    # to accurately extract the entities and relationships from each sentence.
    if "KRAS" in sentence and "pancreatic cancer" in sentence:
        g.add((cg.KRAS, cg.associatedWith, cg.PancreaticCancer))
        if "PMID" in sentence:
            g.add((cg.KRAS, cg.hasPublication, cg.PMID_98765432))
            g.add((cg.PMID_98765432, cg.pubmedID, Literal("98765432")))
        if "BioPortal" in sentence:
            g.add((cg.KRAS, cg.bioportalAnnotation, URIRef("http://bioportal.org/ontologies/KRAS")))
            g.add((cg.PancreaticCancer, cg.bioportalAnnotation, URIRef("http://bioportal.org/ontologies/PANCREATIC-CANCER")))
        if "Tumor Portal" in sentence:
            g.add((cg.KRAS, cg.tumorportalAnnotation, URIRef("http://tumorportal.org/genes/KRAS")))
    elif "TP53" in sentence:
        if "breast cancer" in sentence:
            g.add((cg.TP53, cg.associatedWith, cg.BreastCancer))
        if "colorectal cancer" in sentence:
            g.add((cg.TP53, cg.associatedWith, cg.ColorectalCancer))
        if "lung cancer" in sentence:
            g.add((cg.TP53, cg.associatedWith, cg.LungCancer))
        if "PMID" in sentence:
            g.add((cg.TP53, cg.hasPublication, cg.PMID_11122233))
            g.add((cg.PMID_11122233, cg.pubmedID, Literal("11122233")))
        if "BioPortal" in sentence:
            g.add((cg.TP53, cg.bioportalAnnotation, URIRef("http://bioportal.org/ontologies/TP53")))
        if "Tumor Portal" in sentence:
            g.add((cg.TP53, cg.tumorportalAnnotation, URIRef("http://tumorportal.org/genes/TP53")))

# Serialize the graph back to Turtle format:
print(g.serialize(format="turtle"))

AuthenticationError: You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.