In [1]:
import ha
jokes = ha.joke_datasets['reddit_jokes']

In [2]:
import os, config2py
os.environ["OPENAI_API_KEY"] = config2py.config_getter("OPENAI_API_KEY")

In [3]:
from typing import Iterable
from lkj import chunker
import oa

def reddit_joke_metadata_to_key_and_text(metdata: dict, *, sep='\n'):
    """Render a joke record as ``<title><sep>(<body>)``.

    Args:
        metdata: Joke record; its ``'title'`` and ``'body'`` entries are read.
        sep: Text placed between the title and the parenthesised body.

    Returns:
        The combined string used as the joke's text.
    """
    title = metdata['title']
    body = metdata['body']
    return f"{title}{sep}({body})"

# segment = next(chunker(jokes, 20))

In [None]:
# jokes_list = list(chunker(jokes, 20))
# jokes_list


In [5]:
# texts = list(map(reddit_joke_metadata_to_key_and_text, segment))
# cumul2 = oa.embeddings(texts)

In [4]:
import tiktoken
from typing import List
def tokenize(text: str, encoding_name: str) -> List[bytes]:
    """Split ``text`` into the byte strings of its tiktoken tokens.

    Args:
        text: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        One ``bytes`` object per token, in encoding order. The pieces are raw
        bytes (not ``str``) because ``decode_single_token_bytes`` returns bytes;
        a single token is not guaranteed to be valid UTF-8 on its own.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    return [
        encoding.decode_single_token_bytes(token_id)
        for token_id in encoding.encode(text)
    ]

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

In [6]:

import nltk
import ssl

# If this Python's ssl module exposes `_create_unverified_context`, install it
# as the default HTTPS context factory; when the attribute is missing, nothing
# is changed. Presumably a workaround for certificate errors during the NLTK
# download below.
# NOTE(review): this patches the `ssl` module for the whole kernel, so any later
# code that relies on `ssl._create_default_https_context` also skips certificate
# verification — confirm this is acceptable or scope it to the download.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK stopwords corpus; the code that consumes it is currently
# commented out in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Sana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
from typing import List, Tuple

# stop_words = set(stopwords.words('english'))

# def remove_stop_words(tokens: List[str]) -> List[str]:
#     return [token for token in tokens if token.lower() not in stop_words]


In [8]:
# tiktoken encoding used when counting the tokens of a joke.
encoding_name = "cl100k_base"


def process_joke(joke: dict, max_tokens: int) -> Tuple[str, dict, bool]:
    """Turn a joke record into embeddable text plus metadata, or mark it skipped.

    Args:
        joke: Joke record; ``'title'``, ``'body'``, ``'id'`` and ``'score'`` are read.
        max_tokens: Largest token count (under ``encoding_name``) that is kept.

    Returns:
        ``(text, metadata, skip)``. When the text is longer than ``max_tokens``
        the joke is skipped and ``("", {}, True)`` is returned; otherwise
        ``skip`` is ``False`` and ``metadata`` holds ``source``, ``id`` and
        ``score``.
    """
    text = reddit_joke_metadata_to_key_and_text(joke)
    token_count = num_tokens_from_string(text, encoding_name)

    if token_count <= max_tokens:
        kept_metadata = {"source": "Reddit", "id": joke['id'], "score": joke['score']}
        return text, kept_metadata, False

    # Over the limit: signal the caller to drop this joke.
    return "", {}, True

# def process_jokes(jokes: List[dict], max_tokens: int) -> List[Tuple[str, dict]]:
#     processed_jokes = map(lambda joke: process_joke(joke, max_tokens), jokes)
#     return [(joke_text, metadata)  for joke_text, metadata, skip in processed_jokes if not skip]

In [71]:
# Time one embedding request to estimate the per-batch latency.
max_tokens = 8192  # jokes whose token count exceeds this are skipped by process_joke
batch_size = 20
single_batch = next(chunker(jokes, batch_size))

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the clock is adjusted.
start_time = time.perf_counter()
texts = [
    joke_text
    for joke_text, _, skip in (process_joke(joke, max_tokens) for joke in single_batch)
    if not skip
]
embeddings = oa.embeddings(texts)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
# Report the configured batch size rather than a hard-coded "20".
print(f"Temps required pour un batch de {batch_size}:", elapsed_time, "secondes")



Temps required pour un batch de 20: 0.7049269676208496 secondes


In [72]:
## Best-case estimate for the whole dataset: assumes no joke is skipped for
## exceeding max_tokens and every batch takes as long as the one timed above.
total_batches = -(-len(jokes) // batch_size)  # ceiling division
total_time = elapsed_time * total_batches
print("Temps total pour tout le dataset:", total_time, "secondes")


Temps total pour tout le dataset: 6857.529541015625 secondes


In [None]:
def seconds_to_hours_minutes_seconds(seconds):
    """Split a duration in seconds into hours, minutes and leftover seconds.

    Args:
        seconds: Duration in seconds (``int`` or ``float``).

    Returns:
        ``(hours, minutes, remaining_seconds)``. ``hours`` and ``minutes`` are
        always ``int`` (so a float input prints as "1 heures", not
        "1.0 heures"); ``remaining_seconds`` keeps any fractional part of the input.
    """
    hours, rest = divmod(seconds, 3600)
    minutes, remaining_seconds = divmod(rest, 60)
    # divmod on floats yields float quotients; whole units are reported as ints.
    return int(hours), int(minutes), remaining_seconds

# Show the estimated total both as raw seconds and as an h/m/s breakdown.
duration_parts = seconds_to_hours_minutes_seconds(total_time)
hours, minutes, remaining_seconds = duration_parts
print(f"{total_time} secondes ==> {hours} heures, {minutes} minutes, {remaining_seconds} secondes")

In [11]:
# Start the Chroma server before running this cell, e.g.: chroma run --path /Users/Sana/chromadb

import chromadb

## To use OpenAI embeddings inside Chroma (currently disabled):

# import chromadb.utils.embedding_functions as embedding_functions
# openai_ef = embedding_functions.OpenAIEmbeddingFunction(
#                 api_key= os.environ["OPENAI_API_KEY"] ,
#                 model_name="text-embedding-3-small"
#             )

# chroma_client = chromadb.Client()
# chroma_client = chromadb.PersistentClient(path="/Users/Sana/chromadb")
# chroma_client = chromadb.HttpClient(host="localhost", port=8000)
from chromadb.config import Settings
# Read the auth token from the environment so a real credential never has to be
# committed with the notebook; "test-token" stays as the fallback used so far.
chroma_auth_token = os.environ.get("CHROMA_CLIENT_AUTH_CREDENTIALS", "test-token")
chroma_client = chromadb.HttpClient(
    settings=Settings(chroma_client_auth_provider="chromadb.auth.token.TokenAuthClientProvider",
                      chroma_client_auth_credentials=chroma_auth_token))
chroma_client.heartbeat()  # connectivity check against the server

# Maintenance helpers, kept for manual use:
# chroma_client.get_version()
# chroma_client.list_collections()
# chroma_client.delete_collection(name="jokes_rag")
# chroma_client.delete_collection(name="reddit_jokes")

# Embeddings are computed outside Chroma and passed in explicitly, so no
# embedding_function is attached to the collection.
# joke_collection = chroma_client.get_or_create_collection(name="rag_jokes", embedding_function=openai_ef)
joke_collection = chroma_client.get_or_create_collection(name="rag_jokes")
chroma_client.list_collections()

[Collection(name=rag_jokes)]

In [84]:
# Embed every joke batch by batch and store the results in the Chroma collection.
max_tokens = 8192
batch_size = 20
joke_chunker = chunker(jokes, batch_size)  # Generator for batching jokes

for single_batch in joke_chunker:
    # Process each joke in the batch and drop the ones that exceed max_tokens
    processed_batch = (process_joke(joke, max_tokens) for joke in single_batch)
    valid_jokes = [(joke_text, metadata) for joke_text, metadata, skip in processed_batch if not skip]

    if not valid_jokes:
        # Every joke in this batch was skipped; nothing to embed or store.
        continue

    # Split the pairs into parallel lists for the embedding call and for storage
    texts, metadatas = (list(column) for column in zip(*valid_jokes))
    ids = [metadata['id'] for metadata in metadatas]

    # Compute embeddings for the kept jokes (one request per batch)
    embeddings = oa.embeddings(texts)

    # Store the whole batch with a single add() call instead of one call per
    # joke, which avoids a round trip to the Chroma server for every record.
    joke_collection.add(
        embeddings=list(embeddings),
        documents=texts,
        ids=ids,
        metadatas=metadatas,
    )

In [None]:
# Read back the collection contents; no ids or filters are passed to get().
all_jokes = joke_collection.get()

In [None]:
# Number of ids held in `all_jokes`.
len(all_jokes['ids'])

In [12]:
## Embed the query with the same `oa.embeddings` call used for the stored jokes.
# NOTE(review): "funiest" is a typo for "funniest"; it is left as-is because this
# literal is the text that actually gets embedded and sent to the model.
query = "What is the funiest joke?"
embed_query = oa.embeddings(query)

In [13]:
# Length of the query embedding vector.
len(embed_query)

1536

In [32]:
## Retrieve the 10 stored jokes whose embeddings are closest to the embedded query.
# NOTE(review): the cosine setting below is commented out and the collection is
# created without an explicit `hnsw:space`, so the collection's default distance
# metric applies (presumably Chroma's default, not cosine) — confirm.
# joke_collection.modify(metadata={"hnsw:space": "cosine"})
results = joke_collection.query(
    query_embeddings=[embed_query],  # a single query, hence the one-element list
    n_results=10,
    #include = [ "documents" ]
)

In [33]:
# Join the documents returned for the (single) query into one newline-separated string.
res = "\n".join(map(str, results['documents'][0]))
res

'What is the Funniest Joke You Can Think of\n(Make it hilarious please.)\nWhats the funniest joke you know?\n(You.)\nWorld\'s Funniest Joke\n(The "world\'s funniest joke" is a term used by Richard Wiseman of the University of Hertfordshire in 2002 to summarize one of the results of his research. For his experiment, named LaughLab, he created a website where people could rate and submit jokes. Purposes of the research included discovering the joke that had the widest appeal and understanding among different cultures, demographics and countries.\n\nThe History Channel eventually hosted a special on the subject.\n\n\nThe winning joke, which was later found to be based on a 1951 Goon Show sketch by Spike Milligan,was submitted by Gurpal Gosal of Manchester:\n\n\n   *Two hunters are out in the woods when one of them collapses. He doesn\'t seem to be breathing and his eyes are glazed. The other guy whips out his phone and calls the emergency services. He gasps, "My friend is dead! What can I

In [16]:
from openai import OpenAI
# Chat client used by the response-generation cells below. No arguments are
# passed; presumably it picks up the OPENAI_API_KEY set at the top of the notebook.
openai_client = OpenAI()

In [17]:
from openai import OpenAI

# NOTE(review): second OpenAI client; the later cells use `openai_client`, and
# `client` is not referenced in the visible part of the notebook.
client = OpenAI()

def get_response(invite, instance_client, modele="gpt-3.5-turbo", *, max_tokens=50, temperature=0):
    """Send a single user prompt to a chat model and return the reply text.

    Args:
        invite: Prompt text, sent as the only (user) message.
        instance_client: Client exposing ``chat.completions.create`` (e.g. ``OpenAI()``).
        modele: Name of the chat model to call.
        max_tokens: Cap on the length of the generated reply (default 50, the
            value previously hard-coded).
        temperature: Sampling temperature (default 0, the value previously
            hard-coded).

    Returns:
        The ``content`` of the first choice's message.
    """
    messages = [{"role": "user", "content": invite}]
    reponse = instance_client.chat.completions.create(
        model=modele,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return reponse.choices[0].message.content



In [18]:
## Response generation: hand the joined retrieved jokes to the chat model.

# The retrieved text is wrapped in a fenced block, followed by the statement to react to.
prompt = f'```{res}```is the funiest joke'

system_message = {"role": "system", "content": "You answer questions about reddit jokes."}
user_message = {"role": "user", "content": prompt}
messages = [system_message, user_message]

response = openai_client.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo",
    temperature=0,
)
first_choice = response.choices[0]
response_message = first_choice.message.content

print(response_message)

It seems like you're looking for the world's funniest joke, which was determined by Richard Wiseman's LaughLab experiment. The winning joke, submitted by Gurpal Gosal of Manchester, goes like this:

"Two hunters are out in the woods when one of them collapses. He doesn't seem to be breathing and his eyes are glazed. The other guy whips out his phone and calls the emergency services. He gasps, 'My friend is dead! What can I do?' The operator says 'Calm down. I can help. First, let's make sure he's dead.' There is a silence, then a gun shot is heard. Back on the phone, the guy says 'OK, now what?'"

It's always interesting to see what jokes people find the funniest!


In [42]:
## Response generation from the raw query results (documents plus metadata)

context = list(results['documents'])
# results['metadatas'] holds one inner list per query; flatten it once and
# pull out the fields of interest.
flat_metadatas = [entry for per_query in results['metadatas'] for entry in per_query]
sources = [entry["source"] for entry in flat_metadatas]
scores = [entry["score"] for entry in flat_metadatas]
user_content = f"query: {query}, context: {context}"

# Messages with empty content are dropped before the request is sent.
role_content_pairs = (
    ("system",  "Answer the query using the context provided. Be succinct."),
    ("user", user_content),
)
messages = [dict(role=role, content=content) for role, content in role_content_pairs if content]

response = openai_client.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo",
    temperature=0,
)
response_message = response.choices[0].message.content


In [50]:
# Bundle the question, the retrieved jokes' scores and the model's answer.
result = dict(
    question=query,
    # sources=sources,
    score=scores,
    answer=response_message,
)

In [51]:
# Display the assembled question / score / answer record.
result

{'question': 'What is the funiest joke?',
 'score': [0, 0, 828, 0, 0, 0, 0, 11, 0, 0],
 'answer': 'The funniest joke, according to research by Richard Wiseman, is: "Two hunters are out in the woods when one of them collapses. He doesn\'t seem to be breathing and his eyes are glazed. The other guy whips out his phone and calls the emergency services. He gasps, \'My friend is dead! What can I do?\' The operator says \'Calm down. I can help. First, let\'s make sure he\'s dead.\' There is a silence, then a gun shot is heard. Back on the phone, the guy says \'OK, now what?\'"'}

In [52]:
# Raw retrieved documents: one inner list per query embedding.
results['documents']

[['What is the Funniest Joke You Can Think of\n(Make it hilarious please.)',
  'Whats the funniest joke you know?\n(You.)',
  'World\'s Funniest Joke\n(The "world\'s funniest joke" is a term used by Richard Wiseman of the University of Hertfordshire in 2002 to summarize one of the results of his research. For his experiment, named LaughLab, he created a website where people could rate and submit jokes. Purposes of the research included discovering the joke that had the widest appeal and understanding among different cultures, demographics and countries.\n\nThe History Channel eventually hosted a special on the subject.\n\n\nThe winning joke, which was later found to be based on a 1951 Goon Show sketch by Spike Milligan,was submitted by Gurpal Gosal of Manchester:\n\n\n   *Two hunters are out in the woods when one of them collapses. He doesn\'t seem to be breathing and his eyes are glazed. The other guy whips out his phone and calls the emergency services. He gasps, "My friend is dead! 

In [58]:
# Find the jokes whose score matches the standout value (828) seen in the
# retrieval metadata above.
target_score = 828  # single source of truth for both the filter and the message
joke_with_score_828 = [joke for joke in jokes if joke["score"] == target_score]

if joke_with_score_828:
    print("Blagues trouvée :")
    for joke in joke_with_score_828:
        print(joke)
else:
    print(f"Aucune blague trouvée avec un score de {target_score}.")


Blagues trouvée :
{'body': 'Methed Up', 'id': '4s7b2g', 'score': 828, 'title': 'What do you call Mike Tyson on drugs?'}
{'body': 'Apparently it just changes the color of the baby', 'id': '49vgvc', 'score': 828, 'title': "After my vasectomy I thought I couldn't get my wife pregenant"}
{'body': 'The "world\'s funniest joke" is a term used by Richard Wiseman of the University of Hertfordshire in 2002 to summarize one of the results of his research. For his experiment, named LaughLab, he created a website where people could rate and submit jokes. Purposes of the research included discovering the joke that had the widest appeal and understanding among different cultures, demographics and countries.\n\nThe History Channel eventually hosted a special on the subject.\n\n\nThe winning joke, which was later found to be based on a 1951 Goon Show sketch by Spike Milligan,was submitted by Gurpal Gosal of Manchester:\n\n\n   *Two hunters are out in the woods when one of them collapses. He doesn\'t s