In [None]:
!pip install streamlit pandas numpy

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
!pip install google-colab


Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [None]:
!pip install cohere
!pip install weaviate-client
!pip install rdflib


Collecting cohere
  Downloading cohere-5.13.11-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Downloading cohere-5.13.11-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.5/252.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.0.20241016-py3-none-any.whl (15 kB)
In

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import os
import logging
from datetime import datetime
from google.colab import output
import threading
import time
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, XSD
from weaviate import Client
from weaviate.util import generate_uuid5
from weaviate.classes.init import Auth

# Dependencies (streamlit, rdflib, weaviate-client) are installed by the
# dedicated install cells earlier in the notebook. An install placed here, after
# the imports at the top of this cell, could never satisfy those imports.

# Route outbound HTTP(S) traffic through the proxy.
# NOTE(review): this is a hardcoded private address — confirm it is reachable
# from the runtime this notebook is meant to run on.
os.environ["http_proxy"] = "http://172.31.2.4:8080"
os.environ["https_proxy"] = "http://172.31.2.4:8080"

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ------ Streamlit App Code ------ #
def main():
    """Render the Streamlit page: dataset upload, knowledge-graph export, vector search.

    Flow:
      1. Ask for a CSV upload and preview its first ten rows.
      2. On "Create Knowledge Graph", serialise the dataset to ``PubMedGraph.ttl``.
      3. Obtain a Weaviate client and, on "Search", list matches for the query.

    Nothing below the uploader is rendered until a file has been uploaded, and
    the search section is skipped when no Weaviate client is available.
    """
    st.title("Vector vs Knowledge Graph Search App")

    # Upload file section
    uploaded_file = st.file_uploader("Upload your dataset (CSV)", type="csv")
    if not uploaded_file:
        return

    with st.spinner("Loading dataset..."):
        df = load_dataset(uploaded_file)
    st.success(f"Uploaded dataset with {len(df)} records.")
    st.dataframe(df.head(10))

    # Create RDF Knowledge Graph
    if st.button("Create Knowledge Graph"):
        with st.spinner("Creating RDF Knowledge Graph..."):
            kg_file_path = "PubMedGraph.ttl"
            create_knowledge_graph(df, kg_file_path)
        st.success(f"Knowledge Graph created and saved to {kg_file_path}")

    # Vector search section
    st.header("Search Articles (Vector Query)")
    client = initialize_weaviate_client()
    if client is None:
        return

    query_text = st.text_input("Enter your search query (e.g., Alzheimer):")
    if not st.button("Search"):
        return

    # A blank query has no concept to search for; ask for input instead of
    # sending an empty near-text request.
    query_text = query_text.strip()
    if not query_text:
        st.warning("Please enter a search query.")
        return

    with st.spinner("Querying Weaviate..."):
        results = query_weaviate_articles(client, query_text)
    if results:
        st.write("### Search Results")
        st.table(results)
    else:
        st.warning("No results found for your query.")

# ------ Helper Functions ------ #
@st.cache_data
def load_dataset(file):
    """Read a CSV into a DataFrame and blank out missing values.

    Positive and negative infinity are first converted to NaN, after which
    every NaN is replaced by an empty string.

    Args:
        file: Anything ``pandas.read_csv`` accepts (here, the uploaded file).

    Returns:
        The cleaned DataFrame.
    """
    logging.info("Loading dataset...")
    cleaned = (
        pd.read_csv(file)
        .replace([np.inf, -np.inf], np.nan)  # treat infinities as missing
        .fillna('')
    )
    logging.info("Dataset loaded successfully.")
    return cleaned

def create_knowledge_graph(df, file_path):
    """Build an RDF graph of the articles in ``df`` and save it as Turtle.

    Every row becomes a resource of type ``ex:Article`` carrying
    ``schema:name`` (from 'Study Title') and ``schema:description`` (from
    'Combined Text'). The resource URI is derived from the title alone, so
    rows sharing a title (including a blank one) describe the same resource.

    This function is intentionally not cached: its only effect is writing
    ``file_path``, and a cache hit would skip that write.

    Args:
        df: DataFrame with 'Study Title' and 'Combined Text' columns.
        file_path: Destination of the serialised Turtle file.

    Returns:
        None. Failures are logged and shown via ``st.error`` instead of raised.
    """
    from urllib.parse import quote

    logging.info("Creating RDF Knowledge Graph...")
    try:
        g = Graph()
        schema = Namespace('http://schema.org/')
        ex = Namespace('http://example.org/')

        g.bind('schema', schema)
        g.bind('ex', ex)

        article_class = ex['Article']
        title_prop = schema['name']
        abstract_prop = schema['description']

        g.add((article_class, RDF.type, RDFS.Class))

        for _, row in df.iterrows():
            # Titles may be parsed as non-string values; normalise before
            # applying string operations.
            title_text = str(row['Study Title'])
            # Spaces become underscores as before; any remaining character
            # that is not legal in a URI path is percent-encoded.
            slug = quote(title_text.replace(' ', '_'), safe="/:@!$&'()*+,;=")
            article_uri = ex[slug]

            g.add((article_uri, RDF.type, article_class))
            g.add((article_uri, title_prop, Literal(title_text, datatype=XSD.string)))
            g.add((article_uri, abstract_prop, Literal(row['Combined Text'], datatype=XSD.string)))

        g.serialize(destination=file_path, format='turtle')
        logging.info("Knowledge Graph created successfully.")
    except Exception as e:
        logging.error(f"Failed to create Knowledge Graph: {e}")
        st.error("Failed to create Knowledge Graph.")

def initialize_weaviate_client():
    """Return a Weaviate client, or ``None`` when one cannot be created.

    Connection settings are read from the environment so that no secret
    lives in the notebook:

        WEAVIATE_API_KEY  required; API key for the cluster.
        WEAVIATE_URL      optional; defaults to the cluster URL used so far.

    The key that used to be hardcoded here should be treated as leaked and
    rotated.

    A successfully created client is cached across Streamlit reruns. Failures
    are not cached, so a later rerun retries the connection.

    Returns:
        The client instance, or ``None`` after logging and showing an error.
    """
    url = os.environ.get(
        "WEAVIATE_URL",
        "https://80lxhqxlsvwazgbsgg8gpw.c0.asia-southeast1.gcp.weaviate.cloud",
    )
    api_key = os.environ.get("WEAVIATE_API_KEY")
    if not api_key:
        logging.error("Failed to initialize Weaviate client: WEAVIATE_API_KEY is not set")
        st.error("Failed to connect to Weaviate. Please check your credentials.")
        return None

    # Only a successful return is stored by the cache; an exception raised in
    # here propagates to the handler below and leaves the cache empty.
    @st.cache_resource
    def _connect(url, api_key):
        logging.info("Initializing Weaviate client...")
        # NOTE(review): `Client` is the v3-style entry point while `Auth` comes
        # from `weaviate.classes.init` — confirm this combination is supported
        # by the installed weaviate-client version.
        client = Client(
            url=url,
            auth_client_secret=Auth.api_key(api_key)
        )
        logging.info("Weaviate client initialized successfully.")
        return client

    try:
        return _connect(url, api_key)
    except Exception as e:
        logging.error(f"Failed to initialize Weaviate client: {e}")
        st.error("Failed to connect to Weaviate. Please check your credentials.")
        return None

def query_weaviate_articles(client, query_text, limit=10):
    """Run a near-text search against the ``Article4`` class.

    Args:
        client: Weaviate client exposing the ``client.query.get(...)`` builder.
        query_text: Concept to search for.
        limit: Maximum number of articles to return.

    Returns:
        A list of dicts with the keys "Title", "Abstract" and "Keywords".
        An empty list is returned when nothing matches or when the query
        fails; failures are logged and shown via ``st.error``.
    """
    # Successful results are cached per (query_text, limit). The client is
    # passed as `_client` because Streamlit skips hashing parameters whose
    # name starts with an underscore, and a client object is not a hashable
    # cache key. Exceptions raised in here are not cached, so a failed query
    # is retried on the next call.
    @st.cache_data
    def _fetch(_client, query_text, limit):
        response = (
            _client.query
            .get("Article4", ["title", "abstractText", "meshMajor"])
            .with_near_text({"concepts": [query_text]})
            .with_limit(limit)
            .do()
        )

        # Surface an error payload instead of treating it as "no results".
        if response.get("errors"):
            raise RuntimeError(f"Weaviate returned errors: {response['errors']}")

        # Any level of the payload may be missing or null.
        hits = ((response.get("data") or {}).get("Get") or {}).get("Article4") or []
        return [
            {
                "Title": obj.get("title", "N/A"),
                "Abstract": obj.get("abstractText", "N/A"),
                "Keywords": obj.get("meshMajor", "N/A"),
            }
            for obj in hits
        ]

    logging.info("Querying Weaviate...")
    try:
        results = _fetch(client, query_text, limit)
        logging.info(f"Query returned {len(results)} results.")
        return results
    except Exception as e:
        logging.error(f"Failed to query Weaviate: {e}")
        st.error("Failed to query Weaviate.")
        return []

# ------ Colab-Specific Setup ------ #
def run_streamlit():
    """Write a self-contained ``app.py`` and start Streamlit on port 8501.

    The generated script consists of the imports the app needs, the source of
    the helper functions and ``main``, and a final ``main()`` call. Writing
    only the source of ``main`` would produce a script that defines one
    function and never runs it.

    The server is started headless as a detached child process. Its stdout
    and stderr are written to ``logs.txt``.
    """
    import inspect
    import subprocess
    import sys

    header = (
        "import logging\n"
        "import os\n"
        "\n"
        "import numpy as np\n"
        "import pandas as pd\n"
        "import streamlit as st\n"
        "from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal\n"
        "from rdflib.namespace import SKOS, XSD\n"
        "from weaviate import Client\n"
        "from weaviate.util import generate_uuid5\n"
        "from weaviate.classes.init import Auth\n"
        "\n"
        "logging.basicConfig(level=logging.INFO, "
        "format='%(asctime)s - %(levelname)s - %(message)s')\n"
    )

    app_functions = (
        load_dataset,
        create_knowledge_graph,
        initialize_weaviate_client,
        query_weaviate_articles,
        main,
    )
    # NOTE(review): for the decorated helpers this relies on the Streamlit
    # cache wrappers exposing `__wrapped__` so that the original source
    # (decorator line included) is found — confirm on the installed version.
    sources = [inspect.getsource(inspect.unwrap(func)) for func in app_functions]

    with open('app.py', 'w') as f:
        f.write(header)
        f.write("\n\n")
        f.write("\n\n".join(sources))
        f.write("\n\nmain()\n")

    # Launch without a shell: `&>` is not a redirection operator in every
    # /bin/sh (e.g. dash), so the previous command line did not reliably
    # capture the server output in logs.txt.
    with open('logs.txt', 'w') as log_file:
        subprocess.Popen(
            [
                sys.executable, "-m", "streamlit", "run", "app.py",
                "--server.port", "8501",
                "--server.headless", "true",
            ],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # detach from the notebook's process group
        )

# Entry point. The dunder names need double underscores; `_name_` is an
# undefined variable and raised NameError before anything was started.
if __name__ == "__main__":
    thread = threading.Thread(target=run_streamlit, daemon=True)
    thread.start()
    time.sleep(5)  # presumably gives the server time to start listening

    try:
        public_url = output.eval_js("google.colab.kernel.proxyPort(8501)")
        print(f"Your Streamlit app is running at:\n{public_url}")
    except Exception as e:
        # Keep the reason in the log instead of discarding it.
        logging.warning(f"Could not resolve the Colab proxy URL: {e}")
        print("Failed to get public URL. Visit: http://localhost:8501")

    # Keep the cell running until it is interrupted.
    try:
        while True:
            time.sleep(600)
    except KeyboardInterrupt:
        print("Keep-alive loop interrupted.")