In [None]:
!pip install streamlit pyngrok chromadb langchain openai sentence-transformers langchain_community "protobuf<5"




In [None]:
import os
from getpass import getpass

# Provide the OpenAI API key to the Streamlit app through the environment.
# A key that is already exported is left untouched, so re-running this cell never
# clobbers it; otherwise the key is read without echo so it is not stored in the
# notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [None]:
# Authenticate ngrok: saves the authtoken through the ngrok command-line client.
# Replace <TOKEN> with your own token before running, and keep the real value
# out of any copy of this notebook that gets shared.
!ngrok authtoken <TOKEN>

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Save the Streamlit app to app.py.
# The app source lives in a regular (non-raw) triple-quoted string, so each escaped
# triple quote inside it is written to app.py as a plain triple quote. Keep other
# backslash escapes out of the app source: they would be resolved here, at write time.
with open("app.py", "w", encoding="utf-8") as file:
    file.write("""import os
import streamlit as st
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

# Directory of the persisted ChromaDB store, relative to the working directory
chroma_db_path = "Chroma_db"

# Word-count limits used when summarizing a document
SINGLE_PASS_WORD_LIMIT = 3000
BATCH_WORD_COUNT = 2000


# Streamlit re-runs this whole script on every interaction, so the expensive
# resources below are cached and built only once per server process.
@st.cache_resource
def load_collection(path):
    # Open the persisted store and return the research_papers collection.
    client = chromadb.PersistentClient(path=path)
    return client.get_collection("research_papers")


@st.cache_resource
def load_embedding_model():
    # Load the allenai-specter model used for embedding generation.
    return SentenceTransformer("allenai-specter")


# Initialize ChromaDB
try:
    collection = load_collection(chroma_db_path)
except Exception as e:
    st.error(f"Error loading ChromaDB: {e}")
    st.stop()

embedding_model = load_embedding_model()

# Initialize LLM
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    st.error("OpenAI API key not found. Please set it as an environment variable.")
    st.stop()

llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)


def ask_llm(prompt):
    # Send one prompt together with the shared system message; return the reply text.
    response = llm.invoke([
        SystemMessage(content="You are a helpful assistant."),
        HumanMessage(content=prompt)
    ])
    return response.content


def summarize_text(full_text):
    # Short texts are summarized in a single call. Longer texts are summarized in
    # batches of BATCH_WORD_COUNT words and the partial summaries are then merged.
    words = full_text.split()
    if len(words) <= SINGLE_PASS_WORD_LIMIT:
        summary_prompt = f\"\"\"Summarize the following text into approximately 250 words:

{full_text}\"\"\"
        return ask_llm(summary_prompt)

    summaries = []
    for start in range(0, len(words), BATCH_WORD_COUNT):
        batch = " ".join(words[start:start + BATCH_WORD_COUNT])
        batch_prompt = f\"\"\"Summarize the following text into approximately 250 words:

{batch}\"\"\"
        summaries.append(ask_llm(batch_prompt))

    # Combine batch summaries into a single summary
    final_summary_prompt = f\"\"\"Combine the following summaries into a single summary of approximately 250 words:

{' '.join(summaries)}\"\"\"
    return ask_llm(final_summary_prompt)


# Streamlit App
st.title("Mini_Scholar.ai")
tab1, tab2 = st.tabs(["Ask a Question", "Keyword Search and Summarization"])

# Tab 1: Ask a Question
with tab1:
    st.subheader("Ask a Question")
    question = st.text_input("Enter your question:")

    if st.button("Get Answer"):
        if not question.strip():
            st.error("Please enter a valid question.")
        else:
            try:
                # Generate embedding for the question
                query_embedding = embedding_model.encode(question).tolist()

                # Query ChromaDB
                results = collection.query(query_embeddings=[query_embedding], n_results=5)

                # Flatten the per-query result lists into plain lists of chunks
                flat_documents = [doc for sublist in results["documents"] for doc in sublist]
                flat_metadata = [meta for sublist in results["metadatas"] for meta in sublist]

                if not flat_documents:
                    # Nothing was retrieved: skip the LLM call, it would have no context
                    st.warning("No relevant passages were found for this question.")
                else:
                    # Combine context from top chunks
                    context = " ".join(flat_documents)

                    # Use LLM to generate an answer
                    prompt = f\"\"\"Context: {context}

Question: {question}

Answer based only on the provided context.\"\"\"
                    answer = ask_llm(prompt)

                    # Display the answer
                    st.write("### Answer:")
                    st.write(answer)

                    # Display the sources with chunks
                    st.write("### Sources:")
                    for doc, meta in zip(flat_documents, flat_metadata):
                        st.write(f"Chunk: {doc[:200]}...")  # Display a preview of the chunk
                        st.write(f"Source: {meta['file_name']} | Chunk ID: {meta['chunk_id']}")
                        st.write("---")  # Separator for readability
            except Exception as e:
                st.error(f"Error while processing the question: {e}")

# Tab 2: Keyword Search and Document Summarization
with tab2:
    st.subheader("Keyword Search and Summarization")

    # Initialize session state variables
    if "search_results" not in st.session_state:
        st.session_state.search_results = []
    if "selected_pdf" not in st.session_state:
        st.session_state.selected_pdf = None
    if "summary" not in st.session_state:
        st.session_state.summary = None

    # Input for keyword search
    keyword = st.text_input("Enter a keyword to search:")

    # Search button
    if st.button("Search"):
        if keyword.strip():
            try:
                # Generate embedding for the keyword
                query_embedding = embedding_model.encode(keyword).tolist()

                # Query ChromaDB for relevant documents
                results = collection.query(query_embeddings=[query_embedding], n_results=50)

                # Keep the best (smallest) distance seen for each file
                pdf_scores = {}
                for metadata_list, score_list in zip(results["metadatas"], results["distances"]):
                    for metadata, score in zip(metadata_list, score_list):
                        file_name = metadata["file_name"]
                        if file_name not in pdf_scores or pdf_scores[file_name] > score:
                            pdf_scores[file_name] = score  # Lower distance = higher rank

                # Sort by raw distance (ascending order) and take the top 5
                sorted_results = sorted(pdf_scores, key=pdf_scores.get)[:5]
                st.session_state.search_results = sorted_results

                # Reset selected PDF and summary
                st.session_state.selected_pdf = None
                st.session_state.summary = None

                if not sorted_results:
                    st.warning("No matching documents were found.")
            except Exception as e:
                st.error(f"Error while searching: {e}")
        else:
            st.error("Please enter a valid keyword.")

    # Display search results if available
    if st.session_state.search_results:
        st.write("### Top 5 Related PDFs:")
        for i, file_name in enumerate(st.session_state.search_results, 1):
            st.write(f"{i}. {file_name}")

        # PDF selection dropdown
        selected_pdf = st.selectbox(
            "Select a PDF to summarize:",
            st.session_state.search_results,
            key="selected_pdf_dropdown"
        )

        # Update session state with selected PDF
        if selected_pdf:
            st.session_state.selected_pdf = selected_pdf

        # Generate Summary button
        if st.button("Generate Summary"):
            if st.session_state.selected_pdf:
                try:
                    # Retrieve all chunks for the selected PDF
                    chunks = collection.get(
                        where={"file_name": st.session_state.selected_pdf}
                    )
                    doc_chunks = chunks["documents"]

                    if not doc_chunks:
                        st.session_state.summary = "No chunks found for the selected PDF. Please try a different PDF."
                    else:
                        # Combine all chunks into a single text and summarize it
                        st.session_state.summary = summarize_text(" ".join(doc_chunks))
                except Exception as e:
                    st.error(f"Error while generating the summary: {e}")

    # Display the summary if available
    if st.session_state.summary:
        st.write("### Summary:")
        st.write(st.session_state.summary)
""")


In [None]:
import subprocess
from pyngrok import ngrok

# Stop the Streamlit server left over from an earlier run of this cell; otherwise
# the new server cannot bind port 8501 and the old process keeps running unseen.
previous_process = globals().get("process")
if previous_process is not None and previous_process.poll() is None:
    previous_process.terminate()
    try:
        previous_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # The server ignored the polite request; force it down
        previous_process.kill()

# Start the Streamlit app
process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501"])

# Connect ngrok to the Streamlit port
try:
    # Close tunnels opened by an earlier run so they do not pile up
    for stale_tunnel in ngrok.get_tunnels():
        ngrok.disconnect(stale_tunnel.public_url)

    url = ngrok.connect(addr=8501, proto="http")
    print(f"Streamlit app is running at: {url}")
except Exception as e:
    print(f"An error occurred with ngrok: {e}")


Streamlit app is running at: NgrokTunnel: "https://8e83-34-125-220-53.ngrok-free.app" -> "http://localhost:8501"
