In [153]:
import pandas as pd
import numpy as np
import pickle
import os
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

In [130]:
# Model identifier handed to SentenceTransformer when the embedding model is built.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

In [131]:
def _get_model():
    """Return the SentenceTransformer for ``MODEL_NAME``, building it only once.

    The model is constructed on the first call and memoized on the function
    object, so repeated calls reuse the same instance instead of re-loading
    the model every time an embedding is requested. If ``MODEL_NAME`` is
    changed after the first call, the model is rebuilt for the new name.

    Returns:
        The cached ``SentenceTransformer`` instance.
    """
    cached = getattr(_get_model, "_cached", None)

    # Build when nothing is cached yet, or when MODEL_NAME changed since the last build.
    if cached is None or cached[0] != MODEL_NAME:
        model = SentenceTransformer(MODEL_NAME,
                                    cache_folder="./models_cache",
                                    token=None
                                   )
        cached = (MODEL_NAME, model)
        _get_model._cached = cached

    return cached[1]

In [132]:
def create_vector_embedding(text, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode ``text`` into an embedding vector.

    Args:
        text: The input handed to the model's ``encode`` method.
        embedding_model: Name of the sentence-transformers model to use.
            When it equals ``MODEL_NAME`` the model from ``_get_model()`` is
            used; any other name is loaded explicitly. Previously this
            argument was accepted but ignored.

    Returns:
        The result of ``model.encode(text, normalize_embeddings=True)``.
    """
    if embedding_model == MODEL_NAME:
        model = _get_model()
    else:
        # Honor an explicitly requested model instead of silently using MODEL_NAME.
        model = SentenceTransformer(embedding_model,
                                    cache_folder="./models_cache",
                                    token=None
                                   )
    return model.encode(text, normalize_embeddings=True)

In [149]:
# On-disk locations of the three library artifacts; the pull_*/find_* helpers
# pass these to load_library().
library_directory_path = "library.csv"  # metadata table, read with pd.read_csv
library_text_path = "library.parquet"  # text table, read with pd.read_parquet
library_vectors_path = "library.pkl"  # pickled vectors, read with pickle.load

In [155]:
def get_directory(file_path="library.csv"):
    """Read the library's metadata table from a CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    directory_df = pd.read_csv(file_path)
    return directory_df

def get_text(file_path):
    """Read the library's text table from a Parquet file.

    Args:
        file_path: Path of the Parquet file to read.

    Returns:
        Whatever ``pd.read_parquet`` returns for ``file_path``.
    """
    text_df = pd.read_parquet(file_path)
    return text_df

def get_vectors(file_path="library.pkl"):
    """Load the pickled vector store from disk.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    with open(file_path, "rb") as handle:
        stored = pickle.load(handle)
    return stored

def load_library(csv_path="library.csv",
                 parquet_path="library.parquet",
                 pkl_path="library.pkl"):
    """Load the three library artifacts from disk.

    Args:
        csv_path: CSV file holding the metadata table.
        parquet_path: Parquet file holding the text table.
        pkl_path: Pickle file holding the vectors. The default was
            previously misspelled as ``"libary.pkl"``, which did not match
            the default used by ``get_vectors``.

    Returns:
        tuple: ``(metadata, text, vectors)`` as returned by
        ``get_directory``, ``get_text`` and ``get_vectors`` respectively.
    """
    metadata = get_directory(csv_path)
    text = get_text(parquet_path)
    vectors = get_vectors(pkl_path)

    return metadata, text, vectors

In [150]:
def pull_text_for_document(document_name):
    """Return the text rows belonging to one document.

    The file extension (if any) is stripped from ``document_name`` and the
    remainder is matched exactly against the ``document`` column of the
    metadata table; the matching ``chunk_uid`` values select rows from the
    text table.

    Args:
        document_name: Document name, with or without a file extension.

    Returns:
        pandas.DataFrame: rows of the text table whose ``chunk_uid`` belongs
        to the document. When no document matches, a message is printed and
        an empty frame with the text table's columns is returned.
    """
    base_name = os.path.splitext(document_name)[0]

    # Only metadata and text are needed here, so the (potentially large)
    # vector pickle is deliberately not loaded.
    metadata = get_directory(library_directory_path)
    text = get_text(library_text_path)

    rows = metadata[metadata["document"] == base_name]

    if rows.empty:
        print(f"No document found matching: {document_name}")
        # Explicit empty result so callers can still iterate over it.
        return text.iloc[0:0]

    uids = rows["chunk_uid"].tolist()

    return text[text["chunk_uid"].isin(uids)]
    

In [151]:
def pull_data_for_document(document_name):
    """Return metadata, text and vectors joined for a single document.

    The extension is stripped from ``document_name`` and the remainder is
    matched against the ``document`` column of the metadata table. Matching
    rows are inner-joined on ``chunk_uid`` with the text table and with the
    vectors (converted to a DataFrame).

    Args:
        document_name: Document name, with or without a file extension.

    Returns:
        pandas.DataFrame with the joined rows, or ``None`` (after printing a
        message) when no document matches.
    """
    stem, _extension = os.path.splitext(document_name)

    library_metadata, library_text, library_vectors = load_library(
        library_directory_path,
        library_text_path,
        library_vectors_path,
    )

    doc_rows = library_metadata[library_metadata["document"] == stem]

    # Guard clause: nothing to join when the document is unknown.
    if doc_rows.empty:
        print(f"No document found matching: {document_name}")
        return None

    embeddings_df = pd.DataFrame(library_vectors)

    return (
        doc_rows
        .merge(library_text, on="chunk_uid", how="inner")
        .merge(embeddings_df, on="chunk_uid", how="inner")
    )

In [187]:
def topk_cosine(df: pd.DataFrame, query_vector: np.ndarray, K: int=5):
    """Return the ``K`` rows of ``df`` most cosine-similar to ``query_vector``.

    Args:
        df: Frame with a ``vector_embedding`` column holding one vector per
            row (all of equal length, so they can be stacked).
        query_vector: The query embedding; any array-like is accepted.
        K: Maximum number of rows to return. Values larger than ``len(df)``
            are clamped; ``K <= 0`` yields an empty result.

    Returns:
        pandas.DataFrame: a copy of the selected rows, ordered by descending
        similarity, with an added ``cosine`` column.
    """
    # np.stack / np.argpartition cannot handle an empty selection, so
    # short-circuit with an empty frame that still has the "cosine" column.
    if len(df) == 0 or K <= 0:
        out = df.iloc[0:0].copy()
        out["cosine"] = np.array([], dtype=float)
        return out

    query = np.asarray(query_vector)

    X = np.stack(df["vector_embedding"].to_numpy())
    # Clip norms away from zero so all-zero vectors do not divide by zero.
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True).clip(min=1e-12)
    # Bug fix: the norm was taken of an undefined name (`_input_vector`)
    # instead of the query itself.
    qn = query / np.linalg.norm(query).clip(min=1e-12)

    sims = Xn @ qn

    K = min(K, sims.size)
    # argpartition finds the top-K in linear time; only those K are then sorted.
    idx = np.argpartition(-sims, K-1)[:K]
    idx = idx[np.argsort(-sims[idx])]

    out = df.iloc[idx].copy()
    out["cosine"] = sims[idx]

    return out

In [197]:
def find_n_matches(_input, K):
    """Find the ``K`` library chunks most similar to ``_input``.

    The input is embedded with ``create_vector_embedding``, ranked against
    the library vectors with ``topk_cosine``, and the ranked rows are then
    inner-joined on ``chunk_uid`` with the metadata and text tables.

    Args:
        _input: The query passed to ``create_vector_embedding``.
        K: Number of matches requested from ``topk_cosine``.

    Returns:
        pandas.DataFrame: ranked matches enriched with metadata and text.
    """
    query_embedding = create_vector_embedding(_input)

    library_metadata, library_text, library_vectors = load_library(
        library_directory_path,
        library_text_path,
        library_vectors_path,
    )

    ranked = topk_cosine(pd.DataFrame(library_vectors), query_embedding, K)

    # Attach metadata first, then text, both keyed on chunk_uid.
    return (
        ranked
        .merge(library_metadata, on="chunk_uid", how="inner")
        .merge(library_text, on="chunk_uid", how="inner")
    )

In [233]:
# Query passage that will be embedded and compared against the library vectors.
_input = "For the mind, knowledge is not a static entity embedded in neural tissue but a dynamic process of activation. It is a pattern of signals, timings, and intensities across neural networks that is always transient and context-dependent. Structural connections (synapses and circuits) create the potential for knowledge to emerge, but they are not knowledge itself. Just as sheet music (document) is not the performance of a symphony, yet it contains the notes (data) necessary to perform the music (information), neural wiring is not the knowledge expressed through it. What we call “knowing” exists only in the live orchestration of neural signals. The information of the mind only becomes visible through behavior or the creation of artifacts driven by those signals (speaking, writing, acting, creating). There, cognition is embodied into objects, carrying the information of those signals into physical objects."

# Ask for the 15 best matches for the query passage.
matches = find_n_matches(_input, 15)

In [234]:
# Print the similarity score, source document and text of every match.
for idx, row in matches.iterrows():
    print(row["cosine"], "\n", row["document"], "\n", row["text"], "\n")

0.5551691651344299 
 eScholarship UC item 4x2561mb-1 
 (This distinction may be overstated. Knowledge may well be represented in the brain in some tangible, physical way. However, for present purposes and for the time being, treating knowledge in the mind as importantly different from artificial stores of information seems reasonable and useful. Academic examinations test individuals' ability to answer questions or to solve problems, which is presumed to provide indirect measures of what they know. But that is not the same.) Knowledge, however, can be represented, just as an event can be filmed. However, the representation is no more knowledge than the film is the event. Any such representation is necessarily in tangible form (sign, signal, data, text, film, etc.) and so representations of knowledge (and of events) are necessarily “information-as-thing.” Information-as-thing is of special interest in the study of information systems. It is with information in this sense that informatio

In [224]:
#for idx, row in pull_text_for_document("?sh=3a6bbac87d53").iterrows():
#    print(row['text'],"\n")