In [1]:

import os
from openai import OpenAI, OpenAIError  
import tiktoken
from pathlib import Path

# 🔧 CONFIG
INPUT_FILE = "parsinghtml.html"
OUTPUT_FILE = "cleaned_output.txt"
CHUNK_TOKEN_LIMIT = 1500
MODEL = "gpt-4o-mini"
OpenAI.api_key = os.environ["OPEN_AI_SECRET_KEY"]

In [4]:
# 📐 Tokenizer
enc = tiktoken.encoding_for_model(MODEL)

def tokenize(text):
    return enc.encode(text)

def detokenize(tokens):
    return enc.decode(tokens)

def chunk_text(text, max_tokens=CHUNK_TOKEN_LIMIT):
    tokens = tokenize(text)
    return [detokenize(tokens[i:i + max_tokens]) for i in range(0, len(tokens), max_tokens)]

def extract_text_with_llm(chunk):
    prompt = f"""You are a cleaning bot. Remove all HTML tags and keep only the human-readable text.

    HTML:
    {chunk}

    Cleaned text:"""

    client = OpenAI(api_key= os.environ["OPEN_AI_SECRET_KEY"])
    
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that cleans HTML."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
    )

    cleaned_text = response.choices[0].message.content

    return cleaned_text

def summarize_text(text):
    try:
        print("Starting summarization...")

        prompt = f"""
        Summarize the following text in clear and structured Markdown. Use appropriate headings, subheadings, and bullet points where needed. Keep it concise but informative.

        Text:
        {text}
        """

        client = OpenAI(api_key=os.environ["OPEN_AI_SECRET_KEY"])

        print("Calling OpenAI API...")
        response = client.chat.completions.create(
            model=os.environ.get("OPENAI_MODEL", "gpt-4"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes text into well-structured Markdown format using headings, bullet points, and concise language."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
        )

        print("Received response.")
        return response.choices[0].message.content

    except KeyError as e:
        print(f"Environment variable missing: {e}")
    except OpenAIError as e:
        print(f"OpenAI API error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    return None



In [5]:
from bs4 import BeautifulSoup

# Load the file
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    html = f.read()

# Strip HTML
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(separator=" ", strip=True)

cleaned_text= summarize_text(text)
print("starting cleaning")
print(cleaned_text)

starting problem
finished response
starting cleaning
# Business Ideas from OpenAI's GPT-OSS Models

OpenAI has introduced numerous opportunities with their new GPT-OSS models. Below are 27 innovative business ideas that leverage these models.

## 1. Password Hygiene Auditor (Offline)
- **What**: Reviews exported vault data to flag weak/duplicate passwords.
- **How it works**:
  - Parses local CSV/JSON export.
  - Produces a prioritized fix list.
- **Why now**: Enhances security without exposing user passwords.

## 2. Internal Phishing Simulator & Coach (Security)
- **What**: Generates realistic internal phishing tests and teaches safe responses.
- **How it works**:
  - Crafts scenario emails and landing pages.
  - Scores responses and provides instant feedback.
- **Why now**: Utilizes open weights for effective training.

## 3. First-Party Creative Studio (Marketing)
- **What**: Generates tailored ad content without sending data externally.
- **How it works**:
  - Reads CRM/exported an

In [39]:
    raw_html = Path(INPUT_FILE).read_text(encoding="utf-8")
    chunks = chunk_text(raw_html)

    print(f"Processing {len(chunks)} chunks...")

    cleaned_parts = []
    
    MAX_ITERATIONS = 1

    for i, chunk in enumerate(chunks):
        if i >= MAX_ITERATIONS:
            break
        print(f"Chunk {i+1}/{len(chunks)}")
        print(chunk)
        cleaned = extract_text_with_llm(chunk)
        cleaned_parts.append(cleaned)

    Path(OUTPUT_FILE).write_text("\n\n".join(cleaned_parts), encoding="utf-8")
    print(f"Done. Output written to {OUTPUT_FILE}")

Processing 136 chunks...
Chunk 1/136
<div style="position: relative; min-height: 19134.2px;"><div class="css-175oi2r" data-testid="cellInnerDiv" style="transform: translateY(0px); position: absolute; width: 100%;"><div class="css-175oi2r r-j5o65s r-qklmqi r-1adg3ll r-1ny4l3l"><div class="css-175oi2r"><article aria-labelledby="id__p8hntqmrok id__jsh4hfkfq1s id__1u2wwu23xnp id__8gsm1qzmvcc id__7k8bd35aygd id__xcdojrzpydh id__mzlhetjdzcj id__1r9lr3l34bw id__f2ckz3syn9 id__1rxupbv8qlw id__fl0a8hk1pm id__22y4j9x156h id__o6g4lcchqil id__pdrv8msmjx id__3yj99ydts9f id__3cvb887vly9 id__qd15zzi3hal id__h189snwoghu id__y14esi9an id__k1qb8n6pg7q" role="article" tabindex="-1" class="css-175oi2r r-18u37iz r-1udh08x r-1c4vpko r-1c7gwzm r-1ny4l3l" data-testid="tweet"><div class="css-175oi2r r-eqz5dr r-16y2uox r-1wbh5a2"><div class="css-175oi2r r-16y2uox r-1wbh5a2 r-1ny4l3l"><div class="css-175oi2r"><div class="css-175oi2r r-18u37iz"><div class="css-175oi2r r-1iusvr4 r-16y2uox r-ttdzmv"><div class="css