# In [None]:
# Academic Research Assistant System
#!pip install -U langchain-community arxiv wikipedia langchain-openai faiss-cpu sentence-transformers

import json
import os
from datetime import datetime
from typing import Dict, List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.utilities import ArxivAPIWrapper, WikipediaAPIWrapper
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Initialize tools
# Module-level API wrapper instances, created once when this file is executed.
# AcademicResearchAssistant.research() calls .run(query) on each of them.
arxiv = ArxivAPIWrapper()
wikipedia = WikipediaAPIWrapper()

class AcademicResearchAssistant:
    """Gather, filter and synthesize research text with an LLM.

    Attributes:
        llm: Chat model used by ``synthesize``.
        vectorstore: FAISS index built lazily by ``filter_relevant``;
            ``None`` until the first successful build.
        sources: One record per tool lookup performed by ``research``.
    """

    # Tools queried by research() when the caller does not name any.
    DEFAULT_TOOLS = ("arxiv", "wikipedia")

    def __init__(self, openai_api_key: Optional[str] = None):
        """Create the chat model and empty state.

        Args:
            openai_api_key: Key for the OpenAI API. When omitted, the
                ``OPENAI_API_KEY`` environment variable is used; if that is
                unset too, the original placeholder string is passed so
                construction behaves as before.
        """
        api_key = (
            openai_api_key
            or os.getenv("OPENAI_API_KEY")
            or "your_openai_api_key"
        )
        self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3, openai_api_key=api_key)
        self.vectorstore = None
        self.sources = []

    def research(self, query: str, tools: Optional[List[str]] = None) -> Dict:
        """Collect research from specified tools.

        Args:
            query: Search string passed to each selected tool.
            tools: Names of the tools to query; ``"arxiv"`` and
                ``"wikipedia"`` are recognised, other names are ignored.
                ``None`` selects both.

        Returns:
            Dict mapping each queried tool name to the text it returned.
        """
        # A None sentinel replaces the former mutable list default.
        selected = self.DEFAULT_TOOLS if tools is None else tools
        results = {}

        if "arxiv" in selected:
            results["arxiv"] = arxiv.run(query)
            self._add_sources("arxiv", query)

        if "wikipedia" in selected:
            results["wikipedia"] = wikipedia.run(query)
            self._add_sources("wikipedia", query)

        return results

    def _add_sources(self, tool: str, query: str):
        """Append a record of a tool lookup to ``self.sources``.

        Each record holds the tool name, the query, and the local time of the
        call as an ISO-8601 string.
        """
        self.sources.append({
            "tool": tool,
            "query": query,
            "timestamp": datetime.now().isoformat(),
        })

    def synthesize(self, research_data: Dict, format: str = "APA") -> str:
        """Synthesize information with proper citations.

        Args:
            research_data: JSON-serializable research material.
            format: Citation style named in the prompt (e.g. ``"APA"``).

        Returns:
            The model's report as a plain string.
        """
        # The citation style is a template variable rather than an f-string
        # interpolation, so a style name containing braces cannot corrupt
        # the prompt template.
        template = """As an academic research assistant, synthesize this information into a {format}-formatted report:

        Research Data:
        {research}

        Include in-text citations and a references section. Use {format} citation style."""

        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm | StrOutputParser()

        return chain.invoke({
            "format": format,
            # ensure_ascii=False keeps non-ASCII source text readable
            # instead of emitting \uXXXX escapes into the prompt.
            "research": json.dumps(research_data, ensure_ascii=False),
        })

    def filter_relevant(self, documents: List[str], query: str, threshold: float = 0.7) -> List[str]:
        """Filter documents by relevance score.

        The vector store is built from ``documents`` on the first call and
        reused afterwards; once it exists, ``documents`` passed to later
        calls is not indexed.

        Args:
            documents: Raw texts to index if no vector store exists yet.
            query: Text to compare the indexed chunks against.
            threshold: Chunks must score strictly above this to be returned.

        Returns:
            Contents of the matching chunks, or ``[]`` when there is nothing
            to index.
        """
        if self.vectorstore is None:
            if documents:
                self._create_vectorstore(documents)
            # Nothing was indexed (no documents, or only empty ones).
            if self.vectorstore is None:
                return []

        scored = self.vectorstore.similarity_search_with_relevance_scores(query)
        return [doc.page_content for doc, score in scored if score > threshold]

    def _create_vectorstore(self, documents: List[str]):
        """Split ``documents`` into chunks and index them in ``self.vectorstore``.

        Leaves ``self.vectorstore`` untouched when splitting yields no chunks,
        so FAISS is never asked to build an index from an empty list.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.create_documents(documents)
        if not splits:
            return
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        self.vectorstore = FAISS.from_documents(splits, embeddings)

# Example Usage
if __name__ == "__main__":
    assistant = AcademicResearchAssistant()

    # Step 1: gather raw material from both sources.
    research_data = assistant.research(
        "quantum computing applications in cryptography", tools=["arxiv", "wikipedia"]
    )

    # Step 2: keep only the passages that score above the relevance cut-off.
    documents = [research_data[name] for name in ("arxiv", "wikipedia")]
    relevant_docs = assistant.filter_relevant(
        documents, "post-quantum cryptography algorithms", threshold=0.65
    )

    # Step 3: let the LLM turn the filtered passages into a cited report.
    report = assistant.synthesize({"relevant_docs": relevant_docs}, format="APA")

    print("# Academic Research Report\n")
    print(report)

    # List every tool lookup recorded while researching.
    print("\n## Research Sources\n")
    for position, entry in enumerate(assistant.sources, start=1):
        print(f"{position}. {entry['tool'].title()}: {entry['query']}")