In [1]:
from gpt_translate.articles.JsonArticleManager import JsonArticleManager
from gpt_translate.crawl.util import extract_info, scroll_one_step
import pandas as pd

# Absolute, machine-specific location of the article JSON handed to the manager.
ARTICLE_JSON_PATH = "/opt/shichenh/articles_embedding.json"
# Later cells read `articles_df` and call `get_embedding` on this object.
article_manager = JsonArticleManager(ARTICLE_JSON_PATH)

In [2]:
# Copy the manager's frame and stringify the date / tag columns in one chained
# expression. NOTE(review): presumably done so these values are plain strings
# when used as vector-store metadata -- confirm.
articles_df = article_manager.articles_df.copy().assign(
    date=lambda frame: frame["date"].apply(str),
    chinese_tags=lambda frame: frame["chinese_tags"].apply(str),
    english_tags=lambda frame: frame["english_tags"].apply(str),
)

In [3]:
import chromadb
# Persistent Chroma client; the path is an absolute, machine-specific directory.
client = chromadb.PersistentClient(path="/opt/shichenh/gpt_translate/notebooks/embedding_visualizer")

In [4]:
# Open the "articles" collection, creating it if it does not exist yet.
# The metadata asks the HNSW index for cosine distance.
# NOTE(review): `get_embedding` is called elsewhere in this notebook with a
# single string; confirm it matches the callable signature Chroma expects.
collection = client.get_or_create_collection(
    name="articles",
    embedding_function=article_manager.get_embedding,
    metadata={"hnsw:space": "cosine"},
)

In [5]:
# Split the article frame into the four parallel lists used by `collection.add`.
embeddings = articles_df["embedding"].to_list()
documents = articles_df["text"].to_list()
ids = articles_df["id"].apply(str).to_list()
# Everything that is not a document, id, translation or embedding becomes
# metadata; missing values are replaced by the literal string "None".
metadatas = (
    articles_df
    .drop(columns=["text", "id", "translation", "embedding"])
    .fillna("None")
    .to_dict(orient="records")
)

In [6]:
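# One-time ingestion of the prepared lists into the collection, left disabled.
# NOTE(review): presumably disabled because the persistent store is already
# populated -- confirm before re-enabling.
# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids,
#     embeddings=embeddings
# )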

In [7]:
# Number of entries currently stored in the collection.
collection.count()

1631

In [8]:
# Free-text query and its embedding, computed with the article manager's embedder.
search_term = "努力有用吗"
search_embedding = article_manager.get_embedding(search_term)

In [9]:
# Ask the collection for the 10 entries nearest to the query embedding.
# No `include` argument is passed, so the library's default result fields apply.
results = collection.query(
    query_embeddings=[search_embedding],
    n_results=10,
)

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

In [40]:
def find_relevant_articles(input_term: str, vectorDB_collection, top_k=10):
    """
    Query a vector-store collection for the entries closest to a search term.

    The term is embedded with the collection's own ``_embedding_function`` and
    the collection is queried for documents, metadatas, embeddings and
    distances of the ``top_k`` nearest entries.

    Parameters:
    - input_term (str): Free-text search term.
    - vectorDB_collection: Object exposing ``_embedding_function(text)`` and
      ``query(query_embeddings=..., include=..., n_results=...)`` returning a
      mapping of field name -> one list of rows per query.
    - top_k (int): Number of neighbours to request.

    Returns:
    - tuple: ``(results_df, metadatas_df)``. ``results_df`` has one row per hit
      and one column per returned per-row field, in the order the collection
      returned them. ``metadatas_df`` has the hits' metadata dicts expanded
      into columns (empty when no metadata was returned).
    """
    search_embedding = vectorDB_collection._embedding_function(input_term)
    results = vectorDB_collection.query(
        query_embeddings=[search_embedding],
        include=["documents", "metadatas", "embeddings", "distances"],
        n_results=top_k,
    )

    # Collect all columns first and build the frame once.
    columns = {}
    for key, per_query in results.items():
        # Fields that were not returned come back as None or empty. Test that
        # explicitly instead of relying on truthiness, which is ambiguous for
        # array-valued fields.
        if not key or per_query is None or not hasattr(per_query, "__len__"):
            continue
        if len(per_query) == 0:
            continue
        rows = per_query[0]  # only one query was sent, so take its rows
        # Skip fields that are not "one list of rows per query" (e.g. a flat
        # list of field names); assigning those would broadcast a scalar.
        if rows is None or isinstance(rows, (str, bytes)) or not hasattr(rows, "__len__"):
            continue
        # list() turns a 2-D array into one 1-D row object per hit, which a
        # single DataFrame column can hold.
        columns[key] = list(rows)
    df = pd.DataFrame(columns)

    metadatas = results.get("metadatas")
    if metadatas is not None and len(metadatas) > 0 and metadatas[0] is not None:
        metadata_rows = list(metadatas[0])
    else:
        metadata_rows = []
    metadatas_df = pd.DataFrame(metadata_rows)
    return df, metadatas_df

from sklearn.manifold import TSNE
import numpy as np

def compute_tsne(embeddings, n_components=2, perplexity=30, learning_rate=200, random_state=None):
    """
    Compute t-SNE coordinates for a list of embeddings.

    Parameters:
    - embeddings (list of list): The embeddings, one row per sample.
    - n_components (int): Number of dimensions for the t-SNE coordinates.
    - perplexity (float): The perplexity parameter for t-SNE. Values that are
      not below the number of samples are reduced to ``n_samples - 1``.
    - learning_rate (float): The learning rate for t-SNE.
    - random_state (int or None): Seed forwarded to t-SNE. Pass an integer for
      a reproducible layout; ``None`` (the default) keeps the unseeded
      behaviour.

    Returns:
    - np.ndarray: The t-SNE coordinates, shape (n_samples, n_components).

    Raises:
    - ValueError: If the input is not 2-dimensional or has fewer than two samples.
    """
    # Convert the list of embeddings to a NumPy array
    embeddings_array = np.asarray(embeddings)
    if embeddings_array.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-dimensional (n_samples, n_features); got ndim={embeddings_array.ndim}"
        )

    n_samples = embeddings_array.shape[0]
    if n_samples < 2:
        raise ValueError("t-SNE needs at least 2 samples")

    # Perplexity has to stay below the sample count, so cap it for small inputs.
    effective_perplexity = min(perplexity, n_samples - 1)

    # Initialize the t-SNE model
    tsne_embedder = TSNE(
        n_components=n_components,
        perplexity=effective_perplexity,
        learning_rate=learning_rate,
        random_state=random_state,
    )

    # Compute the t-SNE coordinates
    return tsne_embedder.fit_transform(embeddings_array)

In [45]:
# Retrieve the 100 nearest articles for the current search_term, with their
# metadata expanded into a second frame.
results_df, metadatas_df = find_relevant_articles(search_term, collection, top_k=100)

In [46]:
# Display the search results frame.
results_df

Unnamed: 0,ids,distances,metadatas,embeddings,documents
0,571,0.416790,"{'chinese_main_topic': '成功和人生态度', 'chinese_tag...","[-0.015952521935105324, -0.003797715762630105,...",\n\n李子柒，我们应该是最后一个写的，在这之前，所有号都为她叫好。\n我们昨天写的是热剧《...
1,1619,0.433275,"{'chinese_main_topic': '成功与感恩', 'chinese_tags'...","[-0.008797785267233849, 0.004995233844965696, ...",\n欢迎关注微信公众号：记忆承载，请关注阅读全文\n有读者问了一个很有意思的问题\n你说李佳...
2,688,0.438320,"{'chinese_main_topic': '中美竞争与坚韧', 'chinese_tag...","[-0.005474371835589409, -0.006829591002315283,...",\n有读者问我，对孟晚舟这事儿怎么看，也有读者问我，这两天川普又跳出来威胁我们。其实这个话题...
3,201,0.439473,"{'chinese_main_topic': '通过努力和坚持获得成功', 'chinese...","[-0.014242274686694145, -0.021819759160280228,...",\n\n最近一直在聊新闻，读者的问题积累了很多。\n我看了一下，几大类。\n第一类，读书晚，...
4,647,0.445293,"{'chinese_main_topic': '新冠疫情', 'chinese_tags':...","[-0.009243706241250038, 0.0017460334347561002,...",\n\n我们昨天的文章，十几个小时就百万+了，话说上一次50万+，都是前年的事儿了。\n因为...
...,...,...,...,...,...
95,1088,0.504163,"{'chinese_main_topic': '商业策略', 'chinese_tags':...","[-0.002697806805372238, -0.020768292248249054,...",这个问题我听到后就笑了，很有意思\n我想起90年代，大家都很喜欢看古惑仔，看完后还纷纷仿效\...
96,11,0.504519,"{'chinese_main_topic': '自由和能够反抗一切的能力', 'chines...","[0.011664139106869698, -0.023092936724424362, ...",\n\n我们近期聊过知否，聊过大江大河，聊过年轻人做网红好不好，甚至还聊过印度的护垫侠。\n...
97,916,0.504683,"{'chinese_main_topic': '中国电视剧中的商业策略', 'chinese...","[-0.0010741507867351174, -0.04353891313076019,...",很多读者留言，让我聊聊风吹半夏这部大女主的戏。说这是内地剧里少有的商战，堪比昔日的《创世纪》...
98,56,0.504828,"{'chinese_main_topic': '中年困境与心态', 'chinese_tag...","[-0.013065957464277744, -0.0217765960842371, 0...",\n\n我曾经写过一篇文章，记一代枭雄的陨落。\n引出了一个话题，叫做中年人要遭遇的各种困境...


In [20]:
# Pull every stored entry's embedding and metadata, then extract the two
# metadata fields used as hover labels in the whole-collection plot.
collection_items = collection.get(include=["embeddings", "metadatas"])
collection_items_titles = [
    metadata["title"]
    for metadata in collection_items["metadatas"]
]
collection_items_chinese_tags = [
    metadata["chinese_tags"]
    for metadata in collection_items["metadatas"]
]

In [14]:
# Sanity check: which fields came back and how many embeddings were fetched.
collection_items.keys(), len(collection_items['embeddings'])

(dict_keys(['ids', 'embeddings', 'metadatas', 'documents']), 1631)

In [41]:
# Project every stored embedding with compute_tsne's default settings.
# No seed is passed, so the layout may differ between runs.
collection_tsnes = compute_tsne(collection_items['embeddings'])
# Wrap the coordinates in a frame; its columns are the integer positions 0 and 1.
collection_tsnes_df = pd.DataFrame(collection_tsnes)

In [16]:
import matplotlib.pyplot as plt
import plotly.express as px

#### Search terms

In [67]:
from gpt_translate.prompts import key_context_prompt
import openai
import os 
# Read the API key from the environment; a missing variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Keyword arguments unpacked into the chat-completion call.
gpt4_config = dict(model="gpt-4", temperature=0)

# Alias of the same dict object; the completion call reads this name.
completion_config = gpt4_config

In [79]:
search_term = "女生择偶应该看重什么"
# Ask the chat model for the key context of the search term.
key_context_resp = openai.ChatCompletion.create(
    messages=key_context_prompt(search_term),
    **completion_config,
)
# Take the first choice's text and strip the "作者," prefix token.
# In the cells visible here, key_context is only displayed; the searches
# below still use search_term.
key_context = (
    key_context_resp
    .choices[0]["message"]["content"]
    .replace("作者,", "")
)
key_context

'女生, 择偶, 看重什么'

In [80]:
# Embed the current search_term so it can be plotted alongside the retrieved articles.
search_embedding = article_manager.get_embedding(search_term)

In [81]:
# Re-run the retrieval for the new search_term (100 nearest articles).
results_df, metadatas_df = find_relevant_articles(search_term, collection, top_k=100)

In [89]:
# Combine the retrieved embeddings with the query embedding.
# The query goes LAST: the colour and hover lists below (and `colors`) all
# append the query's entry after the articles. Putting it first shifted every
# label and colour by one point.
all_embeddings = results_df['embeddings'].tolist() + [search_embedding]

# Compute t-SNE coordinates
tsne_coordinates = compute_tsne(all_embeddings)

# Prepare data for plotting
x_coords = tsne_coordinates[:, 0]
y_coords = tsne_coordinates[:, 1]
# Articles blue, query red. Kept for reference; not passed to the plot below.
colors = ['blue'] * (len(all_embeddings) - 1) + ['red']

# Create scatter plot using Plotly.
# Points are coloured by distance to the query. The query has no distance of
# its own, so it gets a value just below the minimum to stand out at the low
# end of the colour scale.
fig = px.scatter(
    x=x_coords,
    y=y_coords,
    color=results_df['distances'].tolist() + [results_df['distances'].min() - 0.1],
    labels={'x': 't-SNE 1', 'y': 't-SNE 2'},
    hover_data=[
        metadatas_df['title'].tolist() + [search_term],
        metadatas_df['chinese_tags'].tolist() + [{}],
    ],
    width=1000,
    height=600,
)
fig.update_traces(marker=dict(size=10))
fig.show()

In [21]:
# Interactive t-SNE map of the whole collection; hovering a point shows the
# article's title and Chinese tags.
px.scatter(
    collection_tsnes_df,
    x=0,
    y=1,
    hover_data=[collection_items_titles, collection_items_chinese_tags],
    width=1000,
    height=1000,
)

In [31]:
# Display the current search results frame.
results_df

Unnamed: 0,ids,distances,metadatas,embeddings,documents
0,571,0.41679,"{'chinese_main_topic': '成功和人生态度', 'chinese_tag...","[-0.015952521935105324, -0.003797715762630105,...",\n\n李子柒，我们应该是最后一个写的，在这之前，所有号都为她叫好。\n我们昨天写的是热剧《...
1,1619,0.433275,"{'chinese_main_topic': '成功与感恩', 'chinese_tags'...","[-0.008797785267233849, 0.004995233844965696, ...",\n欢迎关注微信公众号：记忆承载，请关注阅读全文\n有读者问了一个很有意思的问题\n你说李佳...
2,201,0.439473,"{'chinese_main_topic': '通过努力和坚持获得成功', 'chinese...","[-0.014242274686694145, -0.021819759160280228,...",\n\n最近一直在聊新闻，读者的问题积累了很多。\n我看了一下，几大类。\n第一类，读书晚，...
3,518,0.448711,"{'chinese_main_topic': '教育与学习', 'chinese_tags'...","[0.0025502329226583242, 0.01888122782111168, 0...",\n\n今天大号在聊<\n某些人不仅数学是体育老师教的，连语文也是\n>\n孔子讲民可，使由...
4,409,0.449409,"{'chinese_main_topic': '成功学', 'chinese_tags': ...","[-0.013263363391160965, -0.0006487715872935951...",\n\n这个社会很流行成功学，啥叫个成功学？\n其实就九个字：\n跟对人、走对路、做对事。\...
5,347,0.450205,"{'chinese_main_topic': '中国社会与文化', 'chinese_tag...","[-0.006797545123845339, -0.012640676461160183,...",\n\n《小欢喜》最近热播，黄磊饰演爸爸方圆，海清饰演妈妈董文洁，她们的孩子在经历高考，其实...
6,315,0.45138,"{'chinese_main_topic': '时间管理', 'chinese_tags':...","[-0.01033133827149868, 0.0022077634930610657, ...",\n\n我们号的编辑一直希望我能够回复一期，回复那些不喜欢我的文章，又坚持每期必读的，特殊的...
7,480,0.452585,"{'chinese_main_topic': '克服劣势，找到立足之地', 'chinese...","[-0.02305769920349121, 0.02152341417968273, 0....",\n\n我们有个读者在后台发消息，讲述了自己的故事，故事太长了，太占篇幅，我归纳总结下，总结...
8,682,0.458323,"{'chinese_main_topic': '成功和个人努力', 'chinese_tag...","[-0.0267731212079525, -0.006828425917774439, 0...",\n昨天聊，近七成大学生认为自己毕业十年内将年薪百万。我是回答问题的，没想到引出更多问题。在...
9,325,0.45841,"{'chinese_main_topic': '人生哲学', 'chinese_tags':...","[-0.006586086004972458, -0.014691769145429134,...",\n\n有读者问我怎么看婚姻。\n为啥在文章出轨的时候，能够共患难，当事情平息多年后，反而一...


In [30]:
# Ids of the retrieved articles, in the order they appear in results_df.
results_df['ids'].tolist()

['571', '1619', '201', '518', '409', '347', '315', '480', '682', '325']