In [1]:
import json
import pandas as pd

In [ ]:
from openai import OpenAI
from pydantic import BaseModel

client = OpenAI(api_key="YOUR_API_KEY_HERE")

In [10]:
system_prompt = """
Categorize the input in either: 
1. Machine Learning
2. Consciousness
3. Meditation
4. None

Anything related to pytorch, machine learning, llms, ai, research goes in "Machine Learning"

Output only one of the 4 things from above, that's all, just the words, no number, nothing else.
"""

In [38]:
def categorize(text):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": text}]},
        ],
        response_format={
            "type": "text"
        },
        temperature=1,
        max_completion_tokens=4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

In [3]:
import numpy as np
import pandas as pd

df = pd.read_json("../links.jsonl", lines=True)
df['prompt'] = df.apply(lambda x: f"Title: {x.subject}\nLink: {x.link}", axis=1)
print(df.shape)

(1324, 4)


In [20]:
from tqdm import tqdm

In [37]:
# def categorize(text):
#     if "arxiv" in text.lower():
#         return "Arxiv"
#     elif "github" in text.lower():
#         return "Github"
#     else:
#         return "None"


In [53]:
keywors = ['rl ', 'ml ', 'ai ', 'llm', 'model', 'tech', 'sft', 'deepseek', 'cuda', 'agi', 'torch', 'training', 'agent', 'bert', 'gpu', 'llama', 'jax', 'transformer', 'reinforcement', 'gradient', 'tensor', 'neural', 'token', 'anthropic', 'machine learning', 'grpo', 'gpt', 'github', 'hugging', 'deepmind', 'attention', 'openai']

In [39]:
categories = []
for i in tqdm(df['prompt'].values):
    if "arxiv" in i.lower():
        categories.append("Arxiv")
    elif "github" in i.lower():
        categories.append("Github")
    else:
        categories.append(categorize(i))

100%|██████████| 1324/1324 [12:27<00:00,  1.77it/s] 


In [41]:
df['category'] = categories

In [51]:
df['category'].value_counts()

category
None                471
Machine Learning    424
Github              206
Arxiv               131
Consciousness        41
Meditation           32
2. Consciousness      1
Name: count, dtype: int64

In [40]:
from collections import Counter
Counter(categories)

Counter({'Machine Learning': 425,
         'Github': 213,
         'None': 480,
         'Consciousness': 42,
         'Meditation': 32,
         'Arxiv': 131,
         '2. Consciousness': 1})

In [9]:
df.sample(1).prompt.values[0]

'Title: Subframe – The best way to build UI, fast.\nLink: https://www.subframe.com/'

In [46]:
df = df[df['prompt'].apply(len) < 1000]
df.shape

(1306, 5)

In [43]:
df = df[df['prompt'].apply(len) < 1000]

In [49]:
df

Unnamed: 0,subject,date,link,prompt,category,embeddings
0,Jared Kaplan's intro to RL / ML,"Fri, 21 Feb 2025 03:12:21 +0000",https://t.co/h8NwzG7DEq?ssr=true,Title: Jared Kaplan's intro to RL / ML\nLink: ...,Machine Learning,"[0.01486137229949236, -0.02036067098379135, -0..."
1,A socratic dialogue over the utility of DNA la...,"Fri, 21 Feb 2025 00:45:00 +0000",https://www.owlposting.com/p/a-socratic-dialog...,Title: A socratic dialogue over the utility of...,Machine Learning,"[-0.0077461740002036095, -0.02742907591164112,..."
2,GitHub - SRSWTI/axis: AI eXplainable Inference...,"Fri, 21 Feb 2025 00:10:48 +0000",https://github.com/SRSWTI/axis,Title: GitHub - SRSWTI/axis: AI eXplainable In...,Github,"[0.011364064179360867, 0.022419951856136322, -..."
3,Books and Papers Libray Archive,"Thu, 20 Feb 2025 23:34:19 +0000",https://open-slum.org/,Title: Books and Papers Libray Archive\nLink: ...,,"[0.00933102983981371, 0.021284399554133415, -0..."
4,gemini-samples/examples/gemini-structured-outp...,"Thu, 20 Feb 2025 10:47:18 +0000",https://github.com/philschmid/gemini-samples/b...,Title: gemini-samples/examples/gemini-structur...,Github,"[0.006022864952683449, -0.008834462612867355, ..."
...,...,...,...,...,...,...
1319,,"Wed, 17 Jan 2024 04:02:53 +0530",https://alexanderobenauer.com/?s=09,Title: \nLink: https://alexanderobenauer.com/?...,,"[0.0042592198587954044, 0.033369679003953934, ..."
1320,,"Wed, 17 Jan 2024 03:55:20 +0530",https://blog.southparkcommons.com/what-is-nega...,Title: \nLink: https://blog.southparkcommons.c...,,"[0.006409750320017338, -0.002439069328829646, ..."
1321,,"Wed, 17 Jan 2024 03:30:02 +0530",https://notes.giorgiop.com/rob-burbea,Title: \nLink: https://notes.giorgiop.com/rob-...,,"[0.011286010965704918, -0.02334591932594776, -..."
1322,,"Mon, 15 Jan 2024 04:45:12 +0530",https://nhigham.com/2018/12/03/half-precision-...,Title: \nLink: https://nhigham.com/2018/12/03/...,Machine Learning,"[0.020562395453453064, 0.02013615518808365, -0..."


In [47]:
df['embeddings'] = [i.embedding for i in client.embeddings.create(input=list(df['prompt'].values), model="text-embedding-3-large").data]

In [48]:
df.to_pickle("../df.embedding")

In [45]:
inputs = list(df['prompt'].values)

In [50]:
embeddings = [i.embedding for i in client.embeddings.create(input=inputs, model="text-embedding-3-large").data]

In [51]:
df['embeddings'] = embeddings
df.to_pickle("../df.embedding")

In [13]:
df = pd.read_pickle("../df.embedding")
print("# of embeddings: ", df.shape)
embeddings = df['embeddings'].tolist()

# of embeddings:  (1173, 5)


In [14]:
def cosine_similarity(arr, matrix):
    arr_norm = arr / np.linalg.norm(arr)
    matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    similarity = np.dot(matrix_norm, arr_norm)
    return similarity


def search(term: str):
    # if not term:
    #     df = df[['link', 'subject', 'date']]
    #     df.columns = ['url', 'title', 'date']
    #     return df.to_dict(orient='records')
    # query_emb = self.co.embed(texts=[term], model="embed-english-v3.0", input_type="search_query").embeddings[0]
    query_emb = client.embeddings.create(input=[term], model="text-embedding-3-large").data[0].embedding
    similarity = cosine_similarity(query_emb, embeddings)
    # Replace argpartition with argsort for full sorting
    top_indices = np.argsort(similarity)[::-1]
    sdf = df.iloc[top_indices][['link', 'subject', 'date']]
    sdf.columns = ['url', 'title', 'date']
    matching_links = sdf.to_dict(orient='records')
    return matching_links


In [16]:
search("LLMs from scratch")

[{'url': 'https://github.com/patrickvonplaten/notebooks/blob/master/getting_the_most_out_of_LLMs.md',
  'title': 'notebooks/getting_the_most_out_of_LLMs.md at master · patrickvonplaten/notebooks',
  'date': 'Fri, 10 May 2024 16:23:11 -0400'},
 {'url': 'https://sakana.ai/llm-squared/',
  'title': 'Can LLMs invent better ways to train LLMs?',
  'date': 'Thu, 13 Jun 2024 04:02:09 -0400'},
 {'url': 'https://glhf.chat/landing/home',
  'title': 'run vLLM models',
  'date': 'Tue, 19 Nov 2024 18:19:54 +0000'},
 {'url': 'https://www.yitay.net/blog/training-great-llms-entirely-from-ground-zero-in-the-wilderness',
  'title': 'Training great LLMs entirely from ground up in the wilderness as a startup — Yi Tay',
  'date': 'Wed, 12 Jun 2024 00:05:45 -0400'},
 {'url': 'https://arunpatro.github.io/blog/mcts/',
  'title': 'LLM code gen',
  'date': 'Wed, 24 Jul 2024 03:34:21 +0000'},
 {'url': 'https://www.reddit.com/r/LocalLLaMA/comments/14vnfh2/my_experience_on_starting_with_fine_tuning_llms/?share_id=

In [26]:
class ProductInfo(BaseModel):
    seller_name: str
    brand_name: str
    sub_brand_name: str
    category_name: str
    product_line_name: str


extraction_prompt = """
    You will be provided with an HTML converted to Markdown format.
    Your goal will be to extract the following information following the schema provided.
    Here is a description of the parameters:
    - seller_name: name of the seller
    - brand_name: name of the brand
    - sub_brand_name: name of the sub-brand
    - category_name: name of the category
    - product_line_name: name of the product line
"""

def query_gpt4(html):
    clean_html = md(html, strip=['a']).replace("\#", "").replace("RN", "")
    try:
        response = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": clean_html}
            ],
            temperature=0,
            response_format=ProductInfo
        )
        return response.choices[0].message.parsed.json()
    except Exception as e:
        print(f"Error querying GPT-4: {e}")
        return None


  clean_html = md(html, strip=['a']).replace("\#", "").replace("RN", "")


In [None]:
df = pd.read_csv("~/Downloads/amazon_fragments_1stOct_13thOct.tsv", sep="\t")
df = df[~df['pageType'].isna()]
df['product_info'] = df['combined '].apply(query_gpt4)

In [57]:
x = df.iloc[3]
print(x['pageType'])
print(x['product_info'])

https://www.amazon.com/Drywall-Phillips-Self-Tapping-Woodworking-Projects/dp/B0C3MBNNMH/ref=sw_img_d_sspa_dk_huc_pt_thematic_1?_encoding=UTF8&pd_rd_i=B0C3MBNNMH&pd_rd_w=iVSqr&content-id=amzn1.sym.193b00ef-a68f-43e4-bf24-3a5077a0fd33&pf_rd_p=193b00ef-a68f-43e4-bf24-3a5077a0fd33&pf_rd_r=4SY1HN3C808QJTQV46C0&pd_rd_wg=W2Knz&pd_rd_r=4dbb2f9f-72c5-4f1b-aa40-15b2cddd3b74&sp_csd=d2lkZ2V0TmFtZT1zcF9odWNfdGhlbWF0aWM=
{"seller_name":"HCLX","brand_name":"LEOU","sub_brand_name":"LEOU Store","category_name":"Industrial & Scientific","product_line_name":"Wood Screws"}


In [4]:
import base64

base64.urlsafe_b64decode('ANGjdJ_0SxtNeU1gW1IDr8Hgv0OrvAt1iBCz_ulnW-RsN-G6M1Z2WYN6Ch7kAPy4t_a_Wc1jTXVRXp_I6_BXfMijeLUnBo5jGL3xbGJE-C48ICfN6tzgW9I0s3b58pJBLyHoVc8qJV5g8G5yoE-NutV4ZQvJ3IMe9UvQ-UWRZMFyaVf4ji4r3tgskwMFt4G1yP2q2TpGyiKS56dz9MLT8yQuWMPuQeqjDTnEVJdg_v_CfsEn9wzeV-c6BJN2v16BAvJTQJUqtsgW8fmvCD35Zx75UaleFxHZlOJqBYgZ3tNMSXPEtDVHa1XkWvABnhmsOmqOZ4bK-crS1VrOKBakBBS5R7q9p5kbIeo3oq3t-A7X9Uhncej6iqE4gbKondp79mmfC2cQmkHtoy6idXGR')

b'\x00\xd1\xa3t\x9f\xf4K\x1bMyM`[R\x03\xaf\xc1\xe0\xbfC\xab\xbc\x0bu\x88\x10\xb3\xfe\xe9g[\xe4l7\xe1\xba3VvY\x83z\n\x1e\xe4\x00\xfc\xb8\xb7\xf6\xbfY\xcdcMuQ^\x9f\xc8\xeb\xf0W|\xc8\xa3x\xb5\'\x06\x8ec\x18\xbd\xf1lbD\xf8.< \'\xcd\xea\xdc\xe0[\xd24\xb3v\xf9\xf2\x92A/!\xe8U\xcf*%^`\xf0nr\xa0O\x8d\xba\xd5xe\x0b\xc9\xdc\x83\x1e\xf5K\xd0\xf9E\x91d\xc1riW\xf8\x8e.+\xde\xd8,\x93\x03\x05\xb7\x81\xb5\xc8\xfd\xaa\xd9:F\xca"\x92\xe7\xa7s\xf4\xc2\xd3\xf3$.X\xc3\xeeA\xea\xa3\r9\xc4T\x97`\xfe\xff\xc2~\xc1\'\xf7\x0c\xdeW\xe7:\x04\x93v\xbf^\x81\x02\xf2S@\x95*\xb6\xc8\x16\xf1\xf9\xaf\x08=\xf9g\x1e\xf9Q\xa9^\x17\x11\xd9\x94\xe2j\x05\x88\x19\xde\xd3LIs\xc4\xb45GkU\xe4Z\xf0\x01\x9e\x19\xac:j\x8eg\x86\xca\xf9\xca\xd2\xd5Z\xce(\x16\xa4\x04\x14\xb9G\xba\xbd\xa7\x99\x1b!\xea7\xa2\xad\xed\xf8\x0e\xd7\xf5Hgq\xe8\xfa\x8a\xa18\x81\xb2\xa8\x9d\xda{\xf6i\x9f\x0bg\x10\x9aA\xed\xa3.\xa2uq\x91'

In [22]:
message_body = """<https://github.com/rasbt/LLMs-from-
scratch/blob/main/ch07/04_preference-tuning-with-dpo/dpo-from-
scratch.ipynb>

Thanks,
Shubham
"""

In [27]:
link = message_body.strip().replace('Thanks,\nShubham', '')
link = link.replace("<", "").replace(">", "")
print(link)

https://github.com/rasbt/LLMs-from-
scratch/blob/main/ch07/04_preference-tuning-with-dpo/dpo-from-
scratch.ipynb




In [30]:
print(link.replace("\n", ""))

https://github.com/rasbt/LLMs-from-scratch/blob/main/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb


In [24]:
link = link.split("\n")[0].strip()
link

'<https://github.com/rasbt/LLMs-from-'