In [None]:
import os
from typing import List
from typing_extensions import TypedDict
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_groq import ChatGroq
from langchain.schema import Document
from langgraph.graph import END, StateGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
import pypdf
import docx2txt
import json



In [None]:
# Chat model shared by the RAG chain and the generation nodes below.
# NOTE(review): ChatGroq presumably picks up its API key from the environment — confirm it is set before running.
GROQ_MODEL_NAME = "llama3-70b-8192"
GROQ_LLM = ChatGroq(model=GROQ_MODEL_NAME)

# Define the graph state
class GraphState(TypedDict):
    """State dictionary passed between the nodes of the proposal workflow.

    Each node reads the keys it needs and returns a partial dict with the
    keys it updates. ``document_paths`` is declared here because
    ``extract_data_from_documents`` reads it from the state; a key that is
    missing from the schema is not available to the nodes.
    """
    # --- inputs supplied by the caller ---
    document_paths: List[str]        # files handed to process_documents (.pdf / .docx)
    num_steps: int                   # counter incremented by every node

    # --- values produced by the nodes ---
    research_topic: str
    # Seed ideas on input; joined into the RAG query, later replaced by the
    # LLM's text output in generate_idea_nodes.
    idea_nodes: List[str]
    # NOTE(review): the generate_* nodes store the LLM's `.content` in the
    # fields below — presumably a single string rather than a list; confirm
    # and tighten the annotations if so.
    research_questions: List[str]
    hypothesis_nodes: List[str]
    literature_nodes: List[str]
    methodology_nodes: List[str]
    significance_nodes: List[str]
    ethical_considerations: List[str]
    timeline_and_budget: str
    final_proposal: str              # JSON string built by compile_final_proposal
    # NOTE(review): extract_data_from_documents stores the vector store object
    # returned by process_documents here, not a list of strings.
    extracted_data: List[str]
    rag_context: str                 # answer returned by perform_rag



In [None]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to read (opened in binary mode).

    Returns:
        One string made of each page's extracted text, joined without a
        separator. A page that yields no text contributes nothing.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = pypdf.PdfReader(file)
        # Extraction must happen while the file is still open.
        # `or ""` keeps a page that returns None from raising TypeError, and
        # join avoids rebuilding the string once per page.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to extract text from Word document
def extract_text_from_docx(file_path):
    """Return whatever ``docx2txt.process`` extracts from the file at ``file_path``."""
    document_text = docx2txt.process(file_path)
    return document_text

# Function to process documents and create vector store
def process_documents(file_paths):
    """Read the given documents, split them into chunks and index them in FAISS.

    Args:
        file_paths: Iterable of path strings. Paths ending in ``.pdf`` or
            ``.docx`` (matched case-insensitively) are read; any other path
            is skipped with a printed notice.

    Returns:
        The vector store returned by ``FAISS.from_documents`` for the chunks.

    Raises:
        ValueError: If no chunks were produced (no supported files, or the
            supported files yielded no text), instead of handing an empty
            document list to the index builder.
    """
    texts = []
    for file_path in file_paths:
        # Compare on a lowercased copy so ".PDF" / ".Docx" are accepted too.
        lowered = file_path.lower()
        if lowered.endswith('.pdf'):
            texts.append(extract_text_from_pdf(file_path))
        elif lowered.endswith('.docx'):
            texts.append(extract_text_from_docx(file_path))
        else:
            print(f"Skipping unsupported file type: {file_path}")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.create_documents(texts)
    if not docs:
        raise ValueError(
            "No text could be extracted from the supplied documents; "
            "provide at least one non-empty .pdf or .docx file."
        )

    embeddings = HuggingFaceEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)

    return vectorstore

# Function to perform RAG
def perform_rag(vectorstore, query):
    """Answer ``query`` with a retrieval-augmented QA chain over ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``; its retriever
            supplies the documents passed to the chain.
        query: Question text sent to the chain under the ``"query"`` key.

    Returns:
        The value stored under ``"result"`` in the chain's output.
    """
    qa_chain = RetrievalQA.from_chain_type(
        llm=GROQ_LLM,
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    # Use .invoke rather than calling the chain object directly: the direct
    # call is deprecated, and .invoke matches how GROQ_LLM is used elsewhere.
    result = qa_chain.invoke({"query": query})
    return result['result']



In [None]:
# Define prompt templates for each node
def _topic_prompt(template):
    """Build a PromptTemplate whose only input variable is ``research_topic``."""
    return PromptTemplate(template=template, input_variables=["research_topic"])


# The topic prompt is the only one fed by the seed ideas and the RAG context.
research_topic_prompt = PromptTemplate(
    input_variables=["idea_nodes", "rag_context"],
    template="""Generate a central theme or subject for the research based on the input ideas: {idea_nodes} and the context from relevant papers: {rag_context}""",
)

# Every remaining prompt is parameterised by the research topic alone.
idea_node_prompt = _topic_prompt(
    """Generate initial thoughts and concepts related to the research topic: {research_topic}"""
)

research_question_prompt = _topic_prompt(
    """Generate specific questions that the research aims to address for the topic: {research_topic}"""
)

hypothesis_prompt = _topic_prompt(
    """Propose explanations or predictions that the research will test for the topic: {research_topic}"""
)

literature_prompt = _topic_prompt(
    """Identify key literature and sources that provide background and support for the research topic: {research_topic}"""
)

methodology_prompt = _topic_prompt(
    """Detail the research methods and approaches to be used for the topic: {research_topic}"""
)

significance_prompt = _topic_prompt(
    """Explain the importance and potential impact of the research on the topic: {research_topic}"""
)

ethical_considerations_prompt = _topic_prompt(
    """Consider ethical issues related to the research on the topic: {research_topic}"""
)

timeline_budget_prompt = _topic_prompt(
    """Plan the research timeline and budget for the topic: {research_topic}"""
)

# Define functions for each node
def extract_data_from_documents(state):
    """Index the documents named in the state and fetch RAG context for the ideas.

    Reads ``document_paths``, ``idea_nodes`` and ``num_steps`` from ``state``.
    The ideas are joined with single spaces to form the retrieval query.

    Returns:
        A partial state update with the vector store (``extracted_data``),
        the RAG answer (``rag_context``) and the incremented ``num_steps``.
    """
    print("---EXTRACTING DATA FROM DOCUMENTS---")
    store = process_documents(state['document_paths'])
    rag_query = " ".join(state['idea_nodes'])
    context = perform_rag(store, rag_query)
    update = {"extracted_data": store, "rag_context": context}
    update["num_steps"] = state['num_steps'] + 1
    return update

def generate_research_topic(state):
    """Ask the LLM for a research topic based on the seed ideas and RAG context.

    Reads ``idea_nodes``, ``rag_context`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``research_topic`` and the incremented ``num_steps``.
    """
    print("---GENERATING RESEARCH TOPIC---")
    prompt_text = research_topic_prompt.format(
        idea_nodes=state['idea_nodes'],
        rag_context=state['rag_context'],
    )
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "research_topic": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_idea_nodes(state):
    """Ask the LLM for initial ideas about the research topic.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``idea_nodes`` and the incremented ``num_steps``.
    """
    print("---GENERATING IDEA NODES---")
    prompt_text = idea_node_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "idea_nodes": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_research_questions(state):
    """Ask the LLM for the questions the research should address.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``research_questions`` and the incremented ``num_steps``.
    """
    print("---GENERATING RESEARCH QUESTIONS---")
    prompt_text = research_question_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "research_questions": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_hypothesis(state):
    """Ask the LLM for explanations or predictions the research will test.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``hypothesis_nodes`` and the incremented ``num_steps``.
    """
    print("---GENERATING HYPOTHESIS---")
    prompt_text = hypothesis_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "hypothesis_nodes": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_literature_review(state):
    """Ask the LLM for key literature and sources supporting the topic.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``literature_nodes`` and the incremented ``num_steps``.
    """
    print("---GENERATING LITERATURE REVIEW---")
    prompt_text = literature_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "literature_nodes": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_methodology(state):
    """Ask the LLM to detail research methods and approaches for the topic.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``methodology_nodes`` and the incremented ``num_steps``.
    """
    print("---GENERATING METHODOLOGY---")
    prompt_text = methodology_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "methodology_nodes": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_significance(state):
    """Ask the LLM to explain the importance and potential impact of the research.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``significance_nodes`` and the incremented ``num_steps``.
    """
    print("---GENERATING SIGNIFICANCE---")
    prompt_text = significance_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "significance_nodes": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_ethical_considerations(state):
    """Ask the LLM to consider ethical issues related to the research topic.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``ethical_considerations`` and the incremented ``num_steps``.
    """
    print("---GENERATING ETHICAL CONSIDERATIONS---")
    prompt_text = ethical_considerations_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "ethical_considerations": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def generate_timeline_and_budget(state):
    """Ask the LLM to plan the research timeline and budget for the topic.

    Reads ``research_topic`` and ``num_steps`` from ``state``.

    Returns:
        A partial state update holding the LLM reply's ``content`` under
        ``timeline_and_budget`` and the incremented ``num_steps``.
    """
    print("---GENERATING TIMELINE AND BUDGET---")
    prompt_text = timeline_budget_prompt.format(research_topic=state['research_topic'])
    llm_reply = GROQ_LLM.invoke(prompt_text)
    return {
        "timeline_and_budget": llm_reply.content,
        "num_steps": state['num_steps'] + 1,
    }

def compile_final_proposal(state):
    """Gather the generated sections into one JSON-encoded proposal.

    Reads each section key listed below, plus ``num_steps``, from ``state``.

    Returns:
        A partial state update whose ``final_proposal`` is a JSON string
        mapping section titles to their content, along with the incremented
        ``num_steps``.
    """
    print("---COMPILING FINAL PROPOSAL---")
    # (section title in the proposal, state key it is read from), in output order
    section_sources = (
        ("Research Topic", 'research_topic'),
        ("Ideas", 'idea_nodes'),
        ("Research Questions", 'research_questions'),
        ("Hypothesis", 'hypothesis_nodes'),
        ("Literature", 'literature_nodes'),
        ("Methodology", 'methodology_nodes'),
        ("Significance", 'significance_nodes'),
        ("Ethical Considerations", 'ethical_considerations'),
        ("Timeline and Budget", 'timeline_and_budget'),
    )
    sections = {title: state[state_key] for title, state_key in section_sources}
    serialized = json.dumps(sections)
    return {"final_proposal": serialized, "num_steps": state['num_steps'] + 1}



In [None]:
# Define the workflow
workflow = StateGraph(GraphState)

# Nodes in execution order: the graph is a single linear chain, so the same
# ordered list drives both node registration and edge wiring.
_PIPELINE_NODES = [
    ("extract_data_from_documents", extract_data_from_documents),
    ("generate_research_topic", generate_research_topic),
    ("generate_idea_nodes", generate_idea_nodes),
    ("generate_research_questions", generate_research_questions),
    ("generate_hypothesis", generate_hypothesis),
    ("generate_literature_review", generate_literature_review),
    ("generate_methodology", generate_methodology),
    ("generate_significance", generate_significance),
    ("generate_ethical_considerations", generate_ethical_considerations),
    ("generate_timeline_and_budget", generate_timeline_and_budget),
    ("compile_final_proposal", compile_final_proposal),
]
for _node_name, _node_fn in _PIPELINE_NODES:
    workflow.add_node(_node_name, _node_fn)

# Set entry point and define the flow: each node feeds the next, and the last
# node is connected to END.
_node_names = [name for name, _ in _PIPELINE_NODES]
workflow.set_entry_point(_node_names[0])
for _src, _dst in zip(_node_names, _node_names[1:] + [END]):
    workflow.add_edge(_src, _dst)



In [None]:
# Build the runnable application from the graph definition
app = workflow.compile()

# Example input: seed ideas, source documents and the initial step counter.
# The document paths are placeholders and must point at real files.
inputs = {
    "idea_nodes": [
        "Impact of artificial intelligence on education",
        "Personalized learning algorithms",
        "Ethical considerations in AI-driven education",
    ],
    "document_paths": [
        "path/to/paper1.pdf",
        "path/to/paper2.docx",
    ],
    "num_steps": 0,
}

# Stream the run and print every key/value pair of each streamed chunk
for output in app.stream(inputs):
    for key, value in output.items():
        print(f"Finished running: {key}: {value}")



In [None]:
# Print the final proposal.
# Note: app.invoke(inputs) executes the graph again from the start; it does
# not reuse the results of the streaming loop in the previous cell.
final_state = app.invoke(inputs)
proposal = json.loads(final_state['final_proposal'])
print(json.dumps(proposal, indent=2))