# Streamlit Embeddings & Semantic Search
Paste text or upload files, then build an in-memory vector index using OpenAI, SentenceTransformers, or HF Inference embeddings. Search semantically and inspect scores.


# Installation (commented)

In [None]:
# !pip install streamlit faiss-cpu sentence-transformers openai huggingface_hub pypdf


# Imports & helpers

In [None]:
import os
import io
import streamlit as st
from typing import List

def read_file(file) -> str:
    """Return the text content of an uploaded file.

    Args:
        file: A file-like object exposing ``name`` and ``read()`` (returning
            bytes), such as the objects returned by ``st.file_uploader``.

    Returns:
        For names ending in ``.pdf`` (case-insensitive), the text extracted
        from every page, joined with newlines; a page with no extractable
        text contributes an empty string. For any other name, the bytes
        decoded as UTF-8 with undecodable bytes dropped and a leading
        byte-order mark removed.
    """
    name = file.name.lower()
    data = file.read()
    if name.endswith('.pdf'):
        # Imported lazily so pypdf is only needed when a PDF is processed.
        from pypdf import PdfReader
        reader = PdfReader(io.BytesIO(data))
        return "\n".join(page.extract_text() or '' for page in reader.pages)
    # 'utf-8-sig' decodes exactly like 'utf-8' but strips a leading BOM, so
    # the marker does not end up inside the indexed document as '\ufeff'.
    return data.decode('utf-8-sig', errors='ignore')

def _read_secret(name: str):
    """Return secret *name* from Streamlit secrets, else the environment, else None."""
    try:
        value = st.secrets.get(name)
    except Exception:
        # Best-effort lookup: st.secrets raises instead of returning a default
        # when no secrets file is configured. Treat that as "not set" so the
        # environment variable can still be used.
        value = None
    return value or os.environ.get(name)


def get_embedder(backend: str):
    """Return a callable that embeds a list of texts with the chosen backend.

    Args:
        backend: One of ``'OpenAI'``, ``'SentenceTransformers'`` or
            ``'HuggingFace Inference'``.

    Returns:
        A function taking a list of strings and returning one embedding
        (a list of floats) per input string, in the same order.

    Raises:
        ValueError: If *backend* is not one of the supported names.
    """
    if backend == 'OpenAI':
        from openai import OpenAI
        api_key = _read_secret('OPENAI_API_KEY')
        if api_key:
            # The client below is built without arguments, so the key is
            # exported to the environment for it to pick up.
            os.environ['OPENAI_API_KEY'] = api_key
        client = OpenAI()

        def embed_openai(texts):
            response = client.embeddings.create(model='text-embedding-3-small', input=texts)
            return [item.embedding for item in response.data]

        return embed_openai

    if backend == 'SentenceTransformers':
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        def embed_local(texts):
            encoded = model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
            return encoded.tolist()

        return embed_local

    if backend == 'HuggingFace Inference':
        from huggingface_hub import InferenceClient
        hf = InferenceClient(token=_read_secret('HUGGINGFACEHUB_API_TOKEN'))

        def embed_remote(texts):
            import numpy as np
            vectors = []
            for text in texts:
                # InferenceClient exposes embeddings through feature_extraction;
                # it has no `embeddings` method.
                features = np.asarray(
                    hf.feature_extraction(text, model='sentence-transformers/all-MiniLM-L6-v2'),
                    dtype='float32',
                )
                if features.ndim > 1:
                    # NOTE(review): assumes a multi-dimensional result holds one
                    # row per token; mean-pool to a single vector. Confirm
                    # against the model's actual output shape.
                    features = features.reshape(-1, features.shape[-1]).mean(axis=0)
                vectors.append(features.tolist())
            return vectors

        return embed_remote

    raise ValueError('Unknown backend')

def build_index(vectors):
    """Build an in-memory FAISS inner-product index over L2-normalised vectors.

    Args:
        vectors: A non-empty sequence of equal-length embedding vectors
            (anything ``numpy.array`` can turn into a 2-D float32 array).

    Returns:
        A ``faiss.IndexFlatIP`` containing one row per input vector. The rows
        are L2-normalised before being added.

    Raises:
        ValueError: If *vectors* is empty or does not form a 2-D array with
            at least one row and one column.
    """
    import numpy as np

    # np.array copies, so the in-place normalisation below never touches the
    # caller's data.
    arr = np.array(vectors, dtype='float32')
    if arr.ndim != 2 or arr.shape[0] == 0 or arr.shape[1] == 0:
        raise ValueError(
            'vectors must be a non-empty 2-D collection of embeddings, '
            f'got array of shape {arr.shape}'
        )

    # Imported after validation so bad input is reported before FAISS is needed.
    import faiss
    faiss.normalize_L2(arr)
    index = faiss.IndexFlatIP(arr.shape[1])
    index.add(arr)
    return index

def search(index, query_vec, k=5):
    """Look up the *k* best matches for one query vector.

    Args:
        index: An object exposing ``search(queries, k)``, such as the index
            produced by ``build_index``.
        query_vec: A single embedding vector.
        k: Number of results to request from the index.

    Returns:
        A ``(scores, ids)`` pair holding the first row of each array returned
        by ``index.search`` for the L2-normalised query.
    """
    import numpy as np
    import faiss

    # Copy into a fresh float32 array and add a leading batch axis of size 1.
    query_batch = np.expand_dims(np.array(query_vec, dtype='float32'), axis=0)
    faiss.normalize_L2(query_batch)
    batch_scores, batch_ids = index.search(query_batch, k)
    return batch_scores[0], batch_ids[0]


# UI

In [None]:
# Page chrome. The icon and title use the magnifying-glass emoji (U+1F50E),
# written as an escape so it cannot be corrupted by a mis-encoded save again.
st.set_page_config(page_title="Embeddings Search", page_icon="\U0001F50E")
st.title("\U0001F50E Embeddings & Semantic Search")

# Sidebar: embedding backend and the number of results returned per query.
with st.sidebar:
    backend = st.selectbox('Embedding backend', ['SentenceTransformers','OpenAI','HuggingFace Inference'], index=0)
    top_k = st.slider('Top K', 1, 20, 5, 1)

# Corpus input: pasted text (one document per line) and/or uploaded files.
st.subheader('Corpus')
text_input = st.text_area('Paste text (one paragraph per line)', height=150)
files = st.file_uploader('Or upload files (PDF/TXT/MD)', type=['pdf','txt','md'], accept_multiple_files=True)


# Build index

In [None]:
# Collect documents from the text area and the uploads, embed them with the
# selected backend, and keep the index in session state for later queries.
if st.button('Build index'):
    docs: List[str] = []
    # Pasted text: every non-blank line becomes its own document.
    docs.extend(ln.strip() for ln in text_input.splitlines() if ln.strip())
    # Uploaded files: each readable file becomes one document.
    for f in files or []:
        try:
            content = read_file(f)
        except Exception as e:
            # UI boundary: report any read or parse failure and keep going
            # with the remaining files.
            st.warning(f"Failed to read {f.name}: {e}")
            continue
        if content.strip():
            docs.append(content)
        else:
            # E.g. a PDF with no extractable text; an empty document would
            # only add a meaningless vector to the index.
            st.warning(f"No extractable text in {f.name}; skipped.")
    if not docs:
        st.warning('Provide some text or files first.')
    else:
        embed = get_embedder(backend)
        vecs = embed(docs)
        idx = build_index(vecs)
        # The embedder is stored next to the index so queries are embedded
        # with the same backend that built it.
        st.session_state['emb_docs'] = docs
        st.session_state['emb_index'] = idx
        st.session_state['emb_embed'] = embed
        st.success(f'Indexed {len(docs)} documents.')


# Query

In [None]:
# Query box and search button; results are shown as one expander per hit.
q = st.text_input('Query text')
search_clicked = st.button('Search')
if search_clicked and q:
    if 'emb_index' in st.session_state:
        # Embed the query with the same embedder that built the index.
        embed = st.session_state['emb_embed']
        query_vec = embed([q])[0]
        scores, idxs = search(st.session_state['emb_index'], query_vec, k=top_k)
        docs = st.session_state['emb_docs']
        rank = 0
        for doc_id, score in zip(idxs, scores):
            rank += 1
            # Skip ids that do not refer to a stored document; the rank
            # still counts them.
            if doc_id < 0 or doc_id >= len(docs):
                continue
            with st.expander(f"#{rank}  score={float(score):.3f}"):
                st.write(docs[doc_id])
    else:
        st.warning('Build the index first.')


# Notes
# - Exporting/importing the index can be added (faiss `write_index`/`read_index`); it is kept commented out for safety.
# - For large corpora, use persistent vector DBs (Chroma, Milvus, Pinecone).