In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Webvox
Get audio summaries of any website, blog or paper

[![GitHub](https://img.shields.io/badge/GitHub-View_on_GitHub-blue?logo=GitHub)](https://github.com/puravparab/webvox)

---
## Table of Contents

1. [Data](#Data-)
    - 1.1. [Blog](#Blog-)
    - 1.2. [Website](#Website-) *(TODO: this section has not been added to the notebook yet)*
    - 1.3. [Paper](#Paper-)
2. [Summarization](#Summarization-)
3. [Audio](#Audio-)

---
## Data <a id='Data-'></a>

Let's import data from blogs, websites and papers that we can summarize

In [None]:
from scraper import Content

### Blog <a id='Blog-'></a>

In [None]:
# Insert url of a blog below
url = "https://paulgraham.com/foundermode.html"

# Wrap the URL in the project's Content object (content type 'blog') and run
# its scrape step; the cell then reads the result from `blog.text`.
blog = Content(url, 'blog')
blog.scrape()

# Preview the first 300 characters. Report explicitly when no text came back,
# so a failed scrape is not mistaken for a cell that simply has no output.
if blog.text:
    print(blog.text[:300])
else:
    print(f"No text was extracted from {url}")

### Paper <a id='Paper-'></a>

In [None]:
# Insert url of a paper below
url = "https://ar5iv.labs.arxiv.org/html/1706.03762"

# Character window of the scraped text shown as a preview.
# NOTE(review): these offsets look hand-picked for this particular paper
# (presumably to skip the front matter) -- adjust them when changing the URL.
PAPER_PREVIEW_START = 1754
PAPER_PREVIEW_END = 3000

# NOTE(review): the paper is scraped with content type 'blog'. This may be
# intentional because the ar5iv page is plain HTML -- confirm which content
# types Content supports before changing it.
paper = Content(url, 'blog')
paper.scrape()

# Preview the selected window. Report explicitly when no text came back, so a
# failed scrape is not mistaken for a cell that simply has no output.
if paper.text:
    print(paper.text[PAPER_PREVIEW_START:PAPER_PREVIEW_END])
else:
    print(f"No text was extracted from {url}")

## Summarization <a id='Summarization-'></a>

Using an LLM to summarize content

### Llama 3.2 1B Instruct
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

In [None]:
import os

from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read variables from a local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Authenticate with Hugging Face using the token taken from the environment.
# A missing or empty token aborts the cell before any login is attempted.
hf_token = os.environ.get('HF_TOKEN')
if hf_token is None or hf_token == "":
    raise ValueError("HF_TOKEN not found in environment variables")
login(token=hf_token)

# Load the tokenizer and the causal language model for the chosen checkpoint.
model_name = 'meta-llama/Llama-3.2-1B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Pass the loaded tokenizer to the Content object's `_tokenize` method. As the
# last expression in the cell, any non-None return value is displayed.
# NOTE(review): `_tokenize` is underscore-prefixed (private by convention) and
# its behaviour is defined in scraper.py, not here -- presumably it tokenizes
# `blog.text`; confirm, and prefer a public method if Content exposes one.
blog._tokenize(tokenizer)

In [None]:
# %%time
# NOTE: this entire cell is commented out -- it is a disabled draft of a
# streaming summarizer and does not run. If it is re-enabled, `%%time` must
# remain the first line of the cell because cell magics have to start the cell.
# from transformers import TextIteratorStreamer
# from threading import Thread

# def summarize_blog_streaming(blog_text, max_length=4000):
#     prompt = f"Summarize the following blog post:\n\n{blog_text}\n\nSummary:"
    
#     # Tokenize the input
#     inputs = tokenizer(prompt, return_tensors="pt", max_length=6000, truncation=True)
    
#     # Create a streamer
#     # NOTE(review): skip_prompt=True is not passed, so the streamed text would
#     # presumably start with the prompt itself; the split on "Summary:" below
#     # relies on that -- confirm against the transformers streamer docs.
#     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    
#     # Generate the summary in a separate thread
#     # NOTE(review): `max_length` is forwarded as max_new_tokens, i.e. it caps
#     # the number of generated tokens rather than the total sequence length.
#     generation_kwargs = dict(
#         inputs,
#         streamer=streamer,
#         max_new_tokens=max_length,
#         num_return_sequences=1,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )
    
#     # NOTE(review): the thread is started but never joined in this draft.
#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
    
#     # Stream the output
#     print("Streaming summary:")
#     generated_text = ""
#     for new_text in streamer:
#         print(new_text, end="", flush=True)
#         generated_text += new_text
    
#     # Extract only the generated summary, removing the input prompt
#     # NOTE(review): index [1] takes the text after the FIRST "Summary:"; if the
#     # blog text itself contains "Summary:" this would return part of the prompt.
#     summary = generated_text.split("Summary:")[1].strip()
    
#     return summary

# summary = summarize_blog_streaming(blog.text)
# print("\n\nFinal Summary:")
# print(summary)