In [12]:
import os
import time
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# Keep an OPENAI_API_KEY that is already set; otherwise install the placeholder.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
# Shared client used by the summarization helpers below.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment — confirm.
openai = OpenAI()

In [None]:
class Website:
    """Snapshot of a web page fetched with ``requests`` and parsed with BeautifulSoup.

    Attributes:
        url: Address that was fetched.
        title: String content of the page's ``<title>``, or "No title found".
        text: Page text with script, style, img and input elements removed.
    """
    url: str
    title: str
    text: str

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title and text.

        Args:
            url: Page to fetch.
            timeout: Seconds passed to ``requests.get`` so that an unresponsive
                server cannot block the notebook indefinitely.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # `.string` is None when <title> is empty or holds nested markup, so
        # check it as well as the tag itself.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        # Some responses (fragments, error pages) have no <body>; fall back to
        # the whole document instead of failing on None.
        root = soup.body if soup.body is not None else soup
        for irrelevant in root(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = root.get_text(separator="\n", strip=True)

In [None]:
# System message sent with every summarization request.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [None]:
def user_prompt_for(website):
    """Build the user message asking the model to summarize ``website``.

    Args:
        website: Object exposing ``title`` and ``text`` string attributes.

    Returns:
        The prompt: a line naming the page title, the instructions, a blank
        line, then the page text.
    """
    # The newline after the title keeps it from running straight into the
    # instructions ("...titled CNNThe contents...").
    user_prompt = f"You are looking at a website titled {website.title}\n"
    user_prompt += (
        "The contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    user_prompt += website.text
    return user_prompt

In [None]:
def messages_for(website):
    """Return the two-message chat (system, then user) for summarizing ``website``."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [None]:
def summarize(url, model="gpt-4o-mini"):
    """Fetch ``url`` and return the model's markdown summary of it.

    Args:
        url: Page to download and summarize.
        model: Chat model name passed to the OpenAI client. Defaults to the
            model this notebook originally hard-coded.

    Returns:
        The content of the first choice in the chat completion response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website)
    )
    return response.choices[0].message.content

In [None]:
def display_summary(url):
    """Summarize ``url`` and render the result as Markdown in the notebook."""
    display(Markdown(summarize(url)))

In [None]:
# Summarize the CNN home page via the requests-based Website scraper.
display_summary("https://cnn.com")

In [None]:
# Summarize the OpenAI home page via the requests-based Website scraper.
display_summary("https://openai.com")

In [None]:
# Summarize the Anthropic home page via the requests-based Website scraper.
display_summary("https://anthropic.com")

### Updated code to handle JavaScript-enabled websites

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import platform
import random
import undetected_chromedriver as uc

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

In [None]:
class WebsiteScrapper:
    """Scrape a JavaScript-rendered page through an undetected-chromedriver session.

    Call ``parse()`` once, then read the outcome with ``get_title()`` and
    ``get_text()``. Exceptions raised during an attempt are printed and the
    attempt is retried; after ``max_retries`` failed attempts a
    "Failed to load" placeholder is stored instead.

    Args:
        url: Page to load.
        max_retries: Number of attempts before giving up.
        headless: Whether Chrome runs without a visible window.
        wait_selector: CSS selector that must be present before the page
            source is read.
        wait_timeout: Seconds to wait for ``wait_selector``.
    """

    # Raw page source is dumped here during a scrape and removed again after a
    # successful parse, so the file survives only when something went wrong.
    _DEBUG_HTML_FILE = "last_scraped.html"
    # Lower-case phrases treated as evidence of an anti-bot interstitial.
    _BOT_MARKERS = ("enable javascript", "checking your browser")

    def __init__(self, url, max_retries=2, headless=True, wait_selector="body", wait_timeout=10):
        self.url = url
        self.__text = ""
        self.__title = ""
        self.headless = headless
        self.max_retries = max_retries
        self.wait_selector = wait_selector
        self.wait_timeout = wait_timeout

    def __log_html(self, html, filename=_DEBUG_HTML_FILE):
        """Best-effort dump of ``html`` to ``filename``; failures are only printed."""
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)
            print(f"[✅] Saved page HTML to {filename} for debugging.")
        except Exception as e:
            print(f"[⚠️] Could not save page HTML: {e}")

    def __fetch_page_source(self):
        """Open the URL in a fresh browser and return the rendered page source."""
        options = uc.ChromeOptions()
        options.headless = self.headless  # Set to False if you want to see the browser
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")

        driver = uc.Chrome(options=options)
        try:
            driver.get(self.url)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1, 3))
            WebDriverWait(driver, self.wait_timeout).until(
                ec.presence_of_element_located((By.CSS_SELECTOR, self.wait_selector))
            )
            time.sleep(1)  # Give JS a moment more if needed
            return driver.page_source
        finally:
            # Quit explicitly so every attempt, failed or not, releases its
            # browser process.
            # NOTE(review): undetected_chromedriver's `with` support appears to
            # restart the session on exit rather than quit it — confirm against
            # the installed version.
            driver.quit()

    def parse(self):
        """Load the page and store its title and text, retrying on failure.

        Results are read back with ``get_title()`` / ``get_text()``. When a
        bot-protection marker is found, a "Blocked by Bot Protection"
        placeholder is stored and the debug HTML file is left on disk.
        """
        for attempt in range(self.max_retries):
            try:
                page_source = self.__fetch_page_source()
                self.__log_html(page_source)

                # Detect bot protection
                lowered_source = page_source.lower()
                if any(marker in lowered_source for marker in self._BOT_MARKERS):
                    self.__title = "Blocked by Bot Protection"
                    self.__text = "This website uses advanced protection (e.g., Cloudflare). Content not accessible."
                    return

                soup = BeautifulSoup(page_source, 'html.parser')
                # `.string` is None for an empty <title> or one with nested
                # markup; use the placeholder in that case as well.
                title_tag = soup.title
                if title_tag is not None and title_tag.string:
                    self.__title = title_tag.string
                else:
                    self.__title = "No title found"

                for irrelevant in soup(["script", "style", "img", "input"]):
                    irrelevant.decompose()

                # Fall back to the whole document when there is no <body>, so a
                # page that did load is not reported as a failed attempt.
                root = soup.body if soup.body is not None else soup
                self.__text = root.get_text(separator="\n", strip=True)
                try:
                    os.remove(self._DEBUG_HTML_FILE)
                    print("Cleaned up debug HTML file.")
                except Exception as e:
                    print(f"Could not delete debug HTML file: {e}")
                return  # Success

            except Exception as e:
                print(f"[❌] Attempt {attempt + 1} failed: {e}")
                time.sleep(2)

        # All retries failed
        self.__title = "Failed to load"
        self.__text = "Website could not be scraped after several attempts."

    def get_text(self):
        """Return the text stored by the last ``parse()`` call ("" before any call)."""
        return self.__text

    def get_title(self):
        """Return the title stored by the last ``parse()`` call ("" before any call)."""
        return self.__title

In [None]:
class JSWebsiteSummarizer:
    """Summarize a JavaScript-rendered website with an OpenAI chat model.

    The page is scraped with ``WebsiteScrapper`` and the resulting title and
    text are sent to the model together with a fixed system prompt.

    Args:
        url: Page to summarize.
        headless: Forwarded to ``WebsiteScrapper``; False shows the browser.
        model: Chat model name passed to the OpenAI client.
    """

    # Built with implicit concatenation: a backslash continuation inside an
    # indented string literal would embed the indentation in the prompt.
    _SYSTEM_PROMPT = (
        "You are an assistant that analyzes the contents of a website "
        "and provides a short summary, ignoring text that might be navigation related. "
        "Respond in markdown."
    )

    def __init__(self, url, headless=True, model="gpt-4o-mini"):
        self.url = url
        self.model = model
        self.website_scrapper = WebsiteScrapper(url, headless=headless)
        self.system_prompt = self._SYSTEM_PROMPT

    @staticmethod
    def __user_prompt_for(title, content):
        """Return the user message: title line, instructions, blank line, page text."""
        # The newline after the title keeps it from running into the instructions.
        user_prompt = f"You are looking at a website titled {title}\n"
        user_prompt += (
            "The contents of this website is as follows; "
            "please provide a short summary of this website in markdown. "
            "If it includes news or announcements, then summarize that too.\n\n"
        )
        user_prompt += content
        return user_prompt

    def __messages_for(self, title, content):
        """Return the chat message list (system, then user) for one page."""
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.__user_prompt_for(title, content)},
        ]

    def __summarize(self):
        """Scrape the page and return the content of the model's first choice."""
        self.website_scrapper.parse()
        chat_config = self.__messages_for(self.website_scrapper.get_title(), self.website_scrapper.get_text())
        response = openai.chat.completions.create(model=self.model, messages=chat_config)
        return response.choices[0].message.content

    def display_summary(self):
        """Scrape, summarize, and render the summary as Markdown in the notebook."""
        summary = self.__summarize()
        display(Markdown(summary))

In [None]:
# Pages summarized with JSWebsiteSummarizer in the cells that follow.
url1 = "https://cnn.com"
url2 = "https://openai.com"
url3 = "https://anthropic.com"

In [None]:
# Summarize url1 with the browser in headless mode (the constructor default).
web_summariser = JSWebsiteSummarizer(url=url1)
web_summariser.display_summary()

In [None]:
# Summarize url3 with a visible browser window (headless=False).
web_summariser = JSWebsiteSummarizer(url=url3, headless=False)
web_summariser.display_summary()

In [None]:
# Summarize url2 with a visible browser window (headless=False).
web_summariser = JSWebsiteSummarizer(url=url2, headless=False)
web_summariser.display_summary()

In [None]:
# NOTE(review): same call as the previous cell (url2, headless=False) — looks
# like a leftover re-run; confirm whether this cell is still needed.
web_summariser = JSWebsiteSummarizer(url=url2, headless=False)
web_summariser.display_summary()

In [13]:
# Chat endpoint on localhost:11434, used for the raw HTTP request below.
OLLAMA_API = "http://localhost:11434/api/chat"
# Headers sent with the raw HTTP request.
HEADERS = {"Content-Type": "application/json"}
# Model name used in the request payload and in the ollama.chat call.
# NOTE: MODEL is rebound to a different model in a later cell, so cells that
# read it depend on execution order.
MODEL = "llama3.1:8b"

In [None]:
# Single-turn conversation sent to the model: one user message, no system prompt.
messages = [
    {"role": "user", "content": "Describe some of the business applications of Generative AI"}
]

In [None]:
# JSON body for the chat request: model, conversation, and streaming disabled.
payload = dict(model=MODEL, messages=messages, stream=False)

In [None]:
# Send the chat request to the endpoint configured in OLLAMA_API.
response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)
# Raise here on an HTTP error status (e.g. server rejected the request) rather
# than failing later with a KeyError when the reply body is indexed.
response.raise_for_status()

In [None]:
# Print the reply text found under message -> content in the JSON response.
print(response.json()['message']['content'])

In [14]:
import ollama

In [None]:
# Same conversation, sent through the ollama package instead of raw HTTP.
# Note that `response` is rebound here to the value returned by ollama.chat.
response = ollama.chat(model=MODEL, messages=messages)
print(response['message']['content'])

In [15]:
# Switch to a different model and ask a new single-turn question.
# NOTE: this rebinds the module-level MODEL and messages, so re-running earlier
# cells afterwards will use these values instead of the original ones.
MODEL = "qwen2.5:14b"
messages=[{"role": "user", "content": "Please give definitions of some core concepts behind LLMs: a neural network, attention and the transformer"}]
response = ollama.chat(model=MODEL, messages=messages)
print(response['message']['content'])

Certainly! Understanding the foundational concepts in large language models (LLMs) is crucial to appreciate how they work and achieve remarkable performance in various natural language processing tasks.

### Neural Network

A **neural network** is an interconnected group of nodes or artificial neurons that mimic biological neural networks. In machine learning, these networks are used to solve complex problems by learning from data without being explicitly programmed for a specific task. A typical neural network consists of:

- **Layers**: These include the input layer (where data enters), hidden layers (which process information using weights and biases), and an output layer (which produces predictions or classifications).
- **Nodes (Neurons)**: Each node performs computations on its inputs (weighted sum) followed by a non-linear activation function to introduce complexity.
- **Connections**: Weights assigned to each connection between nodes, which are adjusted during training to minim