In [1]:
import os
import openai
from langchain_text_splitters import TokenTextSplitter
from langchain_experimental.agents import create_csv_agent
from langchain_community.document_loaders import TextLoader, CSVLoader
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from dotenv import load_dotenv


In [15]:
# Pull variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from there.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values: assigning the None returned by os.getenv
# for a missing variable fails with an unhelpful TypeError, so fail early with
# a message that says what to fix.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )

os.environ["OPENAI_API_KEY"] = openai_api_key

In [3]:
# Read the target article from disk. The loaded result is indexed below and
# its first entry exposes the raw text as `page_content`.
loader = TextLoader(
    file_path="/workspaces/langchain-rag-tutorial-main/data/raw_text_complete/10.1006_bbrc.2001.5639.txt"
)
data = loader.load()

# Token-based splitter: chunk_size caps the size of each chunk and
# chunk_overlap is the amount shared between neighbouring chunks.
text_splitter = TokenTextSplitter(chunk_overlap=20, chunk_size=500)

# Only the first loaded entry is split; `texts` holds the resulting chunks.
texts = text_splitter.split_text(data[0].page_content)

In [4]:
# Load the histone reference table that is later supplied to the prompt as
# context. Note that this rebinds `loader`, previously used for the article.
loader = CSVLoader(
    file_path="/workspaces/langchain-rag-tutorial-main/data/context/histones.csv"
)
csv_data = loader.load()

In [29]:
# Prompt with two placeholders:
#   {context}  - the reference histone rows (supplied as loaded CSV documents)
#   {document} - the article text to analyse
template = """You have access to {context} as a dictionary identifying canonical histone-gene relationships. \n

Your target/focus is to analyze {document}. Extract all histone-gene or histone-disease relationships that appear in the target, including relationships that aren't in the context.\n
Format your output as a csv file for each type of relationship. One CSV should have a histone axis and a gene axis. If a relationship was found from the text, input 1 at the (histone, gene) co-ordinate. If not, input 0
Do the same for histone-disease relationship with a histone axis and a disease axis. Only include data extracted from the target.
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(model="gpt-4-turbo", seed=42, temperature=0.1)

# The stuff-documents chain formats the documents passed under "context" into
# the prompt; any other input key is handed to the template unchanged.
chain = create_stuff_documents_chain(llm, prompt)

# Only the first chunk of the article is analysed in this cell.
docs = texts[:1]

# `docs` is a list of plain strings. Interpolating the list itself would put
# its Python repr (brackets, quotes, escaped newlines) into the prompt, so the
# chunks are joined into a single string before being passed as {document}.
document_text = "\n\n".join(docs)

# Only the first 50 reference rows are sent as context.
output = chain.invoke({"context": csv_data[:50], "document": document_text})
print(output)

                seed was transferred to model_kwargs.
                Please confirm that seed is what you intended.


Based on the provided text and the task requirements, we need to extract relationships between histones, genes, and diseases mentioned in the text. However, the text primarily discusses HDAC1 and DNMT1 in the context of prostate cancer, without specific mention of histone modifications or specific histones (like H3K4me1, H3K36me2, etc.). Therefore, the relationships involving specific histones and genes or diseases cannot be directly extracted from the provided text.

However, I can provide a template for how the CSV files should be structured based on the potential relationships that could be extracted if specific histones were mentioned:

### Histone-Gene Relationship CSV Template
```
Histone,Gene,Relationship
H3K4me1,HDAC1,0
H3K4me1,DNMT1,0
H3K36me2,HDAC1,0
H3K36me2,DNMT1,0
...
```

### Histone-Disease Relationship CSV Template
```
Histone,Disease,Relationship
H3K4me1,Prostate Cancer,0
H3K36me2,Prostate Cancer,0
...
```

In these templates:
- The `Histone` column lists potential his