In [2]:
import os
import pickle

import pandas as pd

In [3]:
#Início

In [4]:
from utils.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)

In [5]:
# Embedding model name; used as the default model by the helper functions in this notebook.
EMBEDDING_MODEL = "text-embedding-3-small"

In [7]:
# Load the tab-separated dataset; the TITLE and ABSTRACT columns are used further down.
# NOTE(review): the PMID column suggests these are PubMed records — confirm and record the data provenance here.
dataset_path = "csv_buscas_encoded.csv"
df = pd.read_csv(dataset_path, delimiter='\t')

# Show the last few rows as a quick sanity check of the load.
n_examples = 5
df.tail(n_examples)

Unnamed: 0,PMID,TITLE,ABSTRACT
5017,31447391,α-Ketoglutarate-Activated NF-κB Signaling Prom...,The rapid proliferation of cancer cells and dy...
5018,36107617,β Cell mass expansion during puberty involves ...,Puberty is associated with transient insulin r...
5019,37368990,β-Cell Glucose Sensitivity to Assess Changes i...,Following a diagnosis of type 1 diabetes (T1D)...
5020,35551509,β-Klotho promotes glycolysis and glucose-stimu...,Impaired glucose-stimulated insulin secretion ...
5021,33627831,βA1-crystallin regulates glucose metabolism an...,"βA3/A1-crystallin, a lens protein that is also..."


In [8]:
# Keep only the title and abstract columns.
df_novo = df[['TITLE', 'ABSTRACT']]


In [9]:
# set path to embedding cache
embedding_cache_path = "csv_buscas.pkl"

# Load the cache (a dict keyed by (text, model)) if the file exists; otherwise
# start with an empty cache and create the file.
# NOTE(review): unpickling can execute arbitrary code — only load a cache file
# that was produced by this notebook.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
    # Only write when the file was missing: re-dumping a cache that was just
    # read adds nothing and, if interrupted, would truncate the existing file.
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

def embedding_from_dataset(texto: str, modelo: str = EMBEDDING_MODEL) -> list:
    """Return the embedding of a text, using the cache to avoid recomputation.

    Args:
        texto: Text to embed.
        modelo: Embedding model name forwarded to ``get_embedding``.

    Returns:
        The value stored in ``embedding_cache`` for ``(texto, modelo)``.

    On a cache miss the embedding is computed with ``get_embedding``, stored in
    the module-level ``embedding_cache``, and the whole cache is re-pickled to
    ``embedding_cache_path``.
    """
    chave = (texto, modelo)
    if chave not in embedding_cache:
        embedding_cache[chave] = get_embedding(texto, modelo)
        # Write to a temporary file and swap it into place, so an interrupted
        # write cannot leave a truncated cache file behind.
        tmp_path = f"{embedding_cache_path}.tmp"
        with open(tmp_path, "wb") as cache_file:
            pickle.dump(embedding_cache, cache_file)
        os.replace(tmp_path, embedding_cache_path)
    return embedding_cache[chave]

In [11]:
# Use the abstract at row position 1 as the example text.
example_string = df["ABSTRACT"].iloc[1]
print(f"\nExample string: {example_string}")

# print the first 10 dimensions of the embedding
# (the full vector is long and would flood the cell output)
example_embedding = embedding_from_dataset(example_string)
print(example_embedding[:10])


Example string: We aimed to determine why the Eagle Books, an illustrated series for American Indian and Alaska Native (AIAN) children to address type 2 diabetes, remain viable long after their release. We sought to answer 2 questions: Why did the books maintain popularity? What factors have sustained them? Type 2 diabetes burgeoned in the US after World War II, compounding a long legacy of injustices for AIAN peoples. By the 1980s, their rates soared above those of White people. Concerned for future generations, Tribal Leaders suggested that the Centers for Disease Control and Prevention and Indian Health Service use traditional storytelling to teach children about staying healthy. Public health interventions are most effective when culture and history are integrated into health education, particularly stories to address a relatively new disease for AIAN peoples. From 2008 through 2013, we conducted a case study among 8 tribal communities to evaluate the uptake of the Eagle Books acr

UnicodeEncodeError: 'ascii' codec can't encode character '\xba' in position 29: ordinal not in range(128)

In [58]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print the strings closest to a chosen source string and return the neighbor ranking.

    Args:
        strings: Candidate texts; each one is embedded via ``embedding_from_dataset``.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of recommendations to print.
        model: Embedding model name forwarded to ``embedding_from_dataset``.

    Returns:
        The full result of ``indices_of_nearest_neighbors_from_distances``
        (not only the k entries that were printed).
    """
    # Embed every candidate; embedding_from_dataset serves repeats from its cache.
    embeddings = []
    for text in strings:
        embeddings.append(embedding_from_dataset(text, modelo=model))

    # Cosine distance from the source embedding to every embedding (source included).
    distances = distances_from_embeddings(
        embeddings[index_of_source_string], embeddings, distance_metric="cosine"
    )

    # Neighbor ranking from utils.embeddings_utils — presumably ordered nearest-first.
    ranked_indices = indices_of_nearest_neighbors_from_distances(distances)

    query_string = strings[index_of_source_string]
    print(f"Source string: {query_string}")

    shown = 0
    for idx in ranked_indices:
        candidate = strings[idx]
        # Texts identical to the source are not counted as recommendations.
        if query_string == candidate:
            continue
        # Stop once k recommendations have been printed.
        if shown >= k_nearest_neighbors:
            break
        shown += 1
        print(
            "\n"
            f"        --- Recommendation #{shown} (nearest neighbor {shown} of {k_nearest_neighbors}) ---\n"
            f"        String: {candidate}\n"
            f"        Distance: {distances[idx]:0.3f}"
        )

    return ranked_indices

In [61]:
# Use every abstract in the dataset as a candidate string.
article_abstracts = df["ABSTRACT"].tolist()

# The return value is the neighbor index ranking produced by the helper.
pubmed_articles = print_recommendations_from_strings(
    strings=article_abstracts,  # base similarity on the article abstracts
    index_of_source_string=0,  # find abstracts similar to the first one in the list
    k_nearest_neighbors=5,  # 5 most similar abstracts
)

UnicodeEncodeError: 'ascii' codec can't encode character '\xba' in position 29: ordinal not in range(128)

In [60]:
# Inspect the loaded DataFrame (the rendered output elides the middle rows).
df

Unnamed: 0.1,Unnamed: 0,PMID,TITLE,ABSTRACT
0,0,25381313,"""A spoonful of sugar helps the medicine go dow...",Sweeteners are often added to liquid formulati...
1,1,30568272,"""IGT-like"" status in normoglucose tolerant obe...",To assess whether combining glucose shape and ...
2,2,37055154,"""Make Stories That Will Always Be There"": Eagl...","We aimed to determine why the Eagle Books, an ..."
3,3,37891544,"""She gives it to her child who doesn't even ta...",There is little research on alcohol and other ...
4,4,36599745,"""The curriculum brings equity to the forefront...",The Accreditation Council for Graduate Medical...
...,...,...,...,...
5018,5018,31447391,-Ketoglutarate-Activated NF-B Signaling Promot...,The rapid proliferation of cancer cells and dy...
5019,5019,36107617,Cell mass expansion during puberty involves s...,Puberty is associated with transient insulin r...
5020,5020,37368990,-Cell Glucose Sensitivity to Assess Changes in...,Following a diagnosis of type 1 diabetes (T1D)...
5021,5021,35551509,-Klotho promotes glycolysis and glucose-stimul...,Impaired glucose-stimulated insulin secretion ...
