# Get your daily Reddit!

i. Import dependencies

In [1]:
import os
from pathlib import Path
import tiktoken
import openai
from wandb.integration.openai import autolog
from getpass import getpass
from rich.markdown import Markdown
from dotenv import load_dotenv

ii. Initialize environment variables:

In [2]:
def _ensure_secret(name: str, prompt: str) -> None:
    """Ask for environment variable *name* when it is not already set.

    Args:
        name: Environment variable to populate.
        prompt: Text shown by ``getpass`` when the value has to be typed in.

    The value is read with ``getpass`` so it is not echoed, and is stored in
    ``os.environ`` for the rest of the session. Nothing happens when the
    variable already has a value (for example from a ``.env`` file).
    """
    if os.getenv(name) is not None:
        return
    # Any VSCODE* variable is taken as a sign the notebook runs inside VS Code,
    # where the message below points the user at the input prompt.
    if any("VSCODE" in key for key in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ[name] = getpass(prompt)


# Pick up variables from a local .env file before prompting for anything.
load_dotenv()

_ensure_secret("OPENAI_API_KEY", "Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

_ensure_secret("REDDIT_CLIENT_ID", "Paste your Reddit client ID from: https://old.reddit.com/prefs/apps/\n")
print("Reddit client ID configured")

_ensure_secret("REDDIT_CLIENT_SECRET", "Paste your Reddit client secret from: https://old.reddit.com/prefs/apps/\n")
print("Reddit client secret configured")

# Tracing/project settings read by the wandb integrations.
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
os.environ["WANDB_PROJECT"] = "dailyreddit"

OpenAI API key configured
Reddit client ID configured
Reddit client secret configured


iii. Initialize `wandb` logging

In [3]:
# cell_name: dev
def log():
    """Enable the wandb OpenAI autolog integration for this notebook.

    The run is configured with project ``dailyreddit`` and job type ``dev``.
    """
    run_settings = {"project": "dailyreddit", "job_type": "dev"}
    autolog.enable(run_settings)


log()

[34m[1mwandb[0m: Currently logged in as: [33msharifhsn[0m. Use [1m`wandb login --relogin`[0m to force relogin


TypeError: __name__ must be set to a string object

## 1. Download the daily Reddit

The document base for this application is the day's top Reddit posts (here, the top 25 posts in r/nba). This data is accessed through the `PRAW` API.

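Caching is implemented for each post: its comment tree is written to a Markdown file under `cache/`, and the cache is rebuilt once it is more than two hours old.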

In [10]:
from typing import Any, Dict, List, Tuple
import time
import praw
from praw import Reddit
from praw.models import Subreddit, Submission, Comment
import pickle

# Build the authenticated client: every praw.Reddit keyword is read from the
# environment variable listed next to it (a missing variable raises KeyError).
reddit: Reddit = praw.Reddit(
    **{
        keyword: os.environ[variable]
        for keyword, variable in (
            ("client_id", "REDDIT_CLIENT_ID"),
            ("client_secret", "REDDIT_CLIENT_SECRET"),
            ("password", "REDDIT_PASSWORD"),
            ("user_agent", "REDDIT_USER_AGENT"),
            ("username", "REDDIT_USERNAME"),
        )
    }
)

# Create the cache directory when it is missing. exist_ok=True replaces the
# separate exists() check, so there is no window between checking and creating.
os.makedirs("cache", exist_ok=True)

# Pickle file that stores the fetch timestamp together with the fetched posts.
submissions_cache_file: str = "cache/submissions_cache.pkl"

def load_cache(cache: str) -> Any:
    """Load a previously pickled object from the file at *cache*.

    Args:
        cache: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file does not exist, is
        empty or truncated, or does not contain a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code, so this must only be pointed
    # at cache files written by this notebook itself.
    try:
        with open(cache, "rb") as cache_file:
            return pickle.load(cache_file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # An absent or unreadable cache is a cache miss, not an error.
        return None
    
def save_cache(cache: str, data: Any) -> None:
    """Pickle *data* into the file at *cache*, replacing its previous content.

    Args:
        cache: Path of the pickle file to write.
        data: Any picklable object.

    Raises:
        Whatever ``pickle.dumps`` raises for an unpicklable object, or
        ``OSError`` if the file cannot be written. When pickling fails, the
        existing cache file is left untouched.
    """
    # Serialize first: opening with "wb" truncates the file, so a pickling
    # error after the open would wipe the cache that was already on disk.
    payload = pickle.dumps(data)
    with open(cache, "wb") as cache_file:
        cache_file.write(payload)

def format_comments(comments: List[Comment], depth: int = 0) -> str:
    """Render a comment tree as a nested Markdown bullet list.

    Args:
        comments: Comments at the current nesting level. Each item must have a
            ``body`` string and a ``replies`` collection of the same kind.
        depth: Nesting level of *comments*; each level indents by two spaces.

    Returns:
        One ``"- text\\n"`` line per comment, with replies placed directly
        below their parent and indented one level deeper. An empty input
        yields ``""``.
    """
    indent: str = "  " * depth
    lines: List[str] = []
    for comment in comments:
        # Keep every comment on a single line.
        text = comment.body.replace("\n", " ")
        # Presumably rewritten so comment text cannot be mistaken for a bullet
        # marker when the Markdown is parsed back into threads.
        text = text.replace(" - ", " _ ").replace("--", "—")
        lines.append(f"{indent}- {text}\n")
        if comment.replies:
            lines.append(format_comments(comment.replies, depth + 1))
    # Join once instead of growing a string with += inside the loop.
    return "".join(lines)

submissions_cache: Dict[str, Any] = load_cache(submissions_cache_file)

# Refresh when there is no usable cache or it is older than two hours (7200 s).
cache_is_stale: bool = (
    not submissions_cache
    or (time.time() - submissions_cache.get("timestamp", 0)) > 7200
)

if cache_is_stale:
    # clear cache
    for cached_name in os.listdir("cache"):
        os.remove(os.path.join("cache", cached_name))

    # get subreddit and top posts
    subreddit: Subreddit = reddit.subreddit("nba")
    # subreddit.top() is lazy; materialize it so the cache stores the fetched
    # posts themselves and the value really is a list.
    top_posts: List[Submission] = list(subreddit.top(time_filter="day", limit=25))

    # save posts to cache
    save_cache(submissions_cache_file, {
        "timestamp": time.time(),
        "posts": top_posts
    })

    # create a new markdown file for each post
    for post in top_posts:
        # Expand at most one "more comments" placeholder before walking the
        # tree (see the PRAW replace_more documentation for the limit argument).
        post.comments.replace_more(limit=1)
        post_md: str = format_comments(post.comments)

        # Replace anything that is not alphanumeric, space, "-" or "_" so the
        # title is usable as a file name; cap the length at 250 characters.
        sanitized_post_title = "".join(c if c.isalnum() or c in (" ", "-", "_") else "_" for c in post.title).strip("_")[:250]
        with open(f"cache/{sanitized_post_title}.md", "w", encoding = "utf-8") as f:
            f.write(post_md)
else:
    # Cache is fresh: reuse the posts stored by the previous run.
    top_posts: List[Submission] = submissions_cache.get("posts", [])


## 2. Process data

We have stored the data as Markdown files; now we want to turn each comment thread into its own document.

In [12]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import MarkdownTextSplitter
from langchain_core.documents.base import Document
from langchain_community.document_loaders.telegram import text_to_docs
import re
import glob
from copy import deepcopy

MODEL_NAME: str = "text-davinci-003"

# One match per bullet: group 1 is the newline plus indentation in front of the
# bullet (empty for the first bullet of a file), group 2 is the comment text up
# to the next bullet or the end of the file.
pattern = r"(\n\s*)?- (.*?)(?=\s*\n\s*-\s+|$)"


def split_into_threads(title: str, markdown: str) -> List[str]:
    """Split a nested Markdown bullet list into one text per comment thread.

    A thread is a path from a top-level comment down to a comment that is
    followed by a shallower or equally deep comment (or by the end of the
    file). Each thread is returned as *title* followed by the comments on that
    path, one per line.

    Args:
        title: First line of every returned thread.
        markdown: Bullet list with two spaces of indentation per reply level.

    Returns:
        The threads in file order; empty when *markdown* has no bullets.

    Raises:
        ValueError: If a bullet is nested more than one level deeper than the
            bullet before it.
    """
    matches: List[Tuple[str, str]] = re.findall(pattern, markdown, re.DOTALL)
    threads: List[str] = []
    current_thread: List[str] = [title]
    current_depth: int = 0

    for whitespace, comment_text in matches:
        # whitespace is "\n" plus two spaces per level, so the integer division
        # discards the newline and leaves the nesting level.
        depth: int = len(whitespace) // 2 if whitespace else 0

        if depth > current_depth:
            # this should be unreachable
            raise ValueError(
                f"depth exceeds current depth, malformed data. current_depth: {current_depth}, depth: {depth}, comment: {comment_text}"
            )
        if depth < current_depth:
            # The previous thread has ended: emit it, then back up to the
            # parent of the new comment (index 0 holds the title).
            threads.append("\n".join(current_thread))
            del current_thread[depth + 1:]
            current_depth = depth
        current_thread.append(comment_text)
        current_depth += 1

    # The thread that is still open when the file ends has not been emitted
    # yet; without this the last thread of every file would be lost.
    if len(current_thread) > 1:
        threads.append("\n".join(current_thread))
    return threads


documents = []

markdown_files = glob.glob(os.path.join("cache", "*.md"))
for markdown_file in markdown_files:
    with open(markdown_file, "r", encoding = "utf-8") as file:
        documents.append(Document(page_content=file.read(), metadata={"source": markdown_file}))

document_sections: List[Document] = []

for document in documents:
    # "cache/<title>.md" -> "<title>"
    thread_title: str = os.path.splitext(os.path.basename(document.metadata["source"]))[0]
    document_sections.extend(
        Document(page_content = thread_text, metadata = document.metadata)
        for thread_text in split_into_threads(thread_title, document.page_content)
    )

for document in document_sections:
    print("Thread:")
    print(document.page_content)
    print()

Thread:
Afseth_ DJJ on playing with Luka Doncic_ _I have never been on a team where someone is scoring the ball like this at such a high rate and also getting his teammates involved and making sure that everybody feels comfortable on the floor_ It_s incredib
50pts and 15 assists is wild.  Locked on Mavs said this is only the 2nd time someone has done this (Harden was the 1st)
Harden had a 53/16/17 (54/56/89 splits) game against the knicks at the end of 2016 according to stat muse
Harden was unreal in 2018-2020 seasons. That was his peak.  Luka also had a 60+21+10 game last season on 67% fg.   Thing is Luka at 24 years is not even his prime yet.
Yeah Luka took his superstar leap earlier than Harden did so I'm sure he'll have more absurd stat lines by the end of his career vs Harden
luka never even leaped he came into the league as a superstar. was obvious since day one  that’s why he’s the only guy we have who has the potential to be the greatest ever by the end of his career
He really 

## 3. Embed documents in vector store

In [13]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain_core.vectorstores import VectorStoreRetriever

embeddings: OpenAIEmbeddings = OpenAIEmbeddings()
# from_documents is a classmethod that builds and returns the store, so call it
# on the class rather than on a throwaway Chroma() instance.
db: Chroma = Chroma.from_documents(document_sections, embeddings)

# "mmr" selects LangChain's maximal-marginal-relevance search type.
retriever: VectorStoreRetriever = db.as_retriever(search_type="mmr")

query: str = "How much does it matter to role players if they get a ring?"
docs: List[Document] = retriever.get_relevant_documents(query)

# Show the retrieved threads, separated by blank lines.
for doc in docs:
    print(f"{doc.page_content}\n")


  if self._task_type is "RETRIEVAL_DOCUMENT":


Langlois_ Monty Williams finds inspiration in the way the Pistons show up the next day to work_ _The way they come back the next day is something I_m blown away by
If you pay me 7 or 8 figures, I'll suck it up and come to work.
If you pay me what their two way guys are getting paid (559,000) I’ll do more than suck it up, I’ll ride it and bounce on it too.
I know dudes who were top of their classes in HS, undergrad, grad...only to be working 90 hour weeks making significantly less than 400k. Really puts shit into perspective.
we should have been nba players.  we are fucking dumb.
I knew it! Where were you when I needed advice!!

Langlois_ Monty Williams finds inspiration in the way the Pistons show up the next day to work_ _The way they come back the next day is something I_m blown away by
Everyone let’s applaud the basketball players who are paid millions to play a sport for showing up to their jobs and fulfilling their contracts
why are we pretending like turning up to a court to lose

## 4. Instantiate prompt template

In [14]:
from langchain.prompts import PromptTemplate

# Instructions for the LLM; {context} receives the retrieved threads and
# {question} the user's query.
prompt_template: str = """Use the opinions expressed in the following Reddit comments to answer the question at the end.
Answer questions with the perspective and tone shown in the comments.
Use similar slang and vocabulary to the comments.

{context}

Question: {question}
Answer:"""

PROMPT: PromptTemplate = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# The retrieved thread texts, separated by blank lines, fill the context slot.
context: str = "\n\n".join(doc.page_content for doc in docs)
prompt = PROMPT.format(question=query, context=context)

In [15]:
from langchain.llms.openai import OpenAI

# Completion model created with its default settings.
# NOTE(review): MODEL_NAME defined earlier in the notebook is not passed here;
# confirm whether the default model is intended.
llm: OpenAI = OpenAI()
# Send the fully formatted prompt and keep the text answer.
response: str = llm.predict(prompt)

# Last expression of the cell, so the notebook displays the answer as Markdown.
Markdown(response)


In [16]:
import wandb

# End the wandb run started by the autolog integration (the recorded output
# below shows the final upload progress).
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))