# Purpose:
This code uses gpt-4o-mini to summarize the latest crypto news from 9 of the best crypto websites and emails the result to my mailbox.

In [None]:
# imports

import os
import requests
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI

# import web scraping libraries
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# import email libraries
from email.message import EmailMessage
import ssl
import smtplib

### Define a website scraper class to collect data from any website

In [None]:
class WebsiteCrawler:
    """Scrape the text of a page's ``<main>`` element with a real Chrome browser.

    All the work happens in the constructor: the page is loaded, its ``<main>``
    element is extracted and reduced to plain text, and the browser is closed.

    Attributes set by the constructor:
        url: The URL that was requested.
        wait_time: Seconds to wait for the ``<main>`` element to appear.
        driver: The undetected-chromedriver instance (already quit on return).
        title: The page title, "No title found" when the title is empty, or
            "Error occurred" when loading or parsing failed.
        text: The newline-separated text of ``<main>``, or "" on failure.
    """

    def __init__(self, url, wait_time=20, chrome_binary_path=None):
        """
        Initialize the WebsiteCrawler using Selenium to scrape JavaScript-rendered content.

        Args:
            url: Address of the page to scrape.
            wait_time: Maximum number of seconds to wait for ``<main>`` to be present.
            chrome_binary_path: Optional path to the Chrome executable to drive.
        """
        self.url = url
        self.wait_time = wait_time

        options = uc.ChromeOptions()
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("start-maximized")
        options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        )
        if chrome_binary_path:
            options.binary_location = chrome_binary_path

        self.driver = uc.Chrome(options=options)

        try:
            # Load the URL
            self.driver.get(url)

            # Wait for Cloudflare or similar checks
            time.sleep(10)

            # Ensure the main content is loaded
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "main"))
            )

            # Extract the main content
            main_content = self.driver.find_element(By.CSS_SELECTOR, "main").get_attribute("outerHTML")

            # Parse with BeautifulSoup. The parser name must be exactly
            # "html.parser"; a misspelled name makes BeautifulSoup raise, which
            # the handler below would turn into an empty result for every page.
            soup = BeautifulSoup(main_content, "html.parser")
            self.title = self.driver.title if self.driver.title else "No title found"
            self.text = soup.get_text(separator="\n", strip=True)

        except Exception as e:
            # Best-effort scraping: report the problem and leave an empty result
            # so callers can carry on with the remaining websites.
            print(f"Error occurred: {e}")
            self.title = "Error occurred"
            self.text = ""

        finally:
            # Always close the browser, whether or not the scrape succeeded
            self.driver.quit()

In [None]:
# run the website scraper class on a single site

# Path of the Chrome executable, handed to the crawler as chrome_binary_path
chrome_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"

# Positional arguments are: url, wait_time (30 seconds here), chrome_binary_path
web = WebsiteCrawler("https://thedefiant.io/", 30, chrome_path)

In [None]:
# show the title captured by the crawler ("Error occurred" when the scrape failed)

web.title

In [None]:
# show the text extracted from the page's <main> element (empty string when the scrape failed)
web.text

### Prompt to summarize news from a single website for the model

In [None]:
# define a system prompt
# Adjacent string literals are joined by the parser, so this is one single-line prompt.
system_prompt = (
    "You are a crypto investor that searches website to extract and summarize daily latest "
    "crypto news that will impact the price of crypto. Ignore text that are not crypto related or that are not current news. "
    "do nothing if you are unable to access the website or there is no content available. "
    "Respond in markdown."
)

# define a user prompt
def user_prompt_for(website):
    """Build the user prompt asking for the top 3 breaking news items of one website.

    Args:
        website: Object exposing ``title`` and ``text`` string attributes
            (the title and scraped text of the page).

    Returns:
        The prompt string: the page title, the instructions, a blank line,
        then the scraped page text.
    """
    instructions = (
        f"You are looking at a website titled {website.title}"
        "\nThe contents of this website is as follows; please highlight top 3 breaking news related to cryptocurrencies.\n"
        "Provide a link to access this breaking news"
    )
    # A blank line separates the instructions from the page text; without it the
    # first scraped word is glued onto the last word of the instructions.
    return instructions + "\n\n" + website.text

def messages_for(website):
    """Return the two-message chat payload (system, then user) for one website."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [None]:
# Load environment variables in a file called .env

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Check the key
# The whitespace check runs before the prefix check: a key with a leading space
# also fails startswith("sk-proj-"), and would otherwise get the wrong diagnosis.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print(
        "An API key was found, but it looks like it might have space or tab characters at the start or end - "
        "please remove them - see troubleshooting notebook"
    )
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")

# The client is created with no arguments, exactly as before
openai = OpenAI()

### Get latest crypto news from individual website

In [None]:
# Get the latest news that influences cryptocurrency price trajectory using gpt

def new_summary(url, chrome_path):
    """Scrape one website and return gpt-4o-mini's summary of its news.

    Args:
        url: Address of the website to scrape.
        chrome_path: Path of the Chrome executable passed to WebsiteCrawler.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    page = WebsiteCrawler(url, 30, chrome_path)
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages_for(page),
    )
    return completion.choices[0].message.content

### Get top crypto news from 9 crypto websites

In [None]:
# test the model on a single scraped website
new_summary("https://thedefiant.io", chrome_path)

In [None]:
# Websites to scrape; each one is summarized separately
website_list = [
    "https://www.coindesk.com",
    "https://www.cointelegraph.com/",
    "https://beincrypto.com/",
    "https://www.decrypt.co/",
    "https://thedefiant.io/",
    "https://www.coinbureau.com",
    "https://www.blockworks.co",
    "https://www.reddit.com/r/CryptoCurrency/?rdt=65535",
    "https://www.cryptonews.com/",
]

# Maps each website URL to its summary (an empty list when the site failed)
news_stack = {}
for website in website_list:
    try:
        news_stack[website] = new_summary(website, chrome_path)
    except Exception as error:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # SystemExit still stop the run, and report why the site was skipped.
        print(f"Unable to access {website}: {error}")
        news_stack[website] = []

In [None]:
# show the per-website summaries, keyed by website URL
news_stack

### Prompt to summarize the news highlights from multiple websites

In [None]:
# define a system prompt
# Kept under its own name: assigning to `system_prompt` here would overwrite the
# per-website prompt that messages_for() reads, so re-running a single-site
# summary after this cell would silently use the wrong system prompt.
summary_system_prompt = (
    "You are a crypto news analyst named Amy, that reports crypto news. "
    "Prepare the news in a format that can be emailed to crypto Enthusiast. "
    "Add the source to each news"
)


# define a user prompt
def user_prompt(all_news):
    """Build the user prompt that asks for one combined digest of all sources.

    Args:
        all_news: Dictionary mapping each source website to its news summary.

    Returns:
        The instructions followed by one ``source=..., news = ...`` line per
        dictionary entry.
    """
    instructions = (
        "You are looking at the content of a Python dictionary where the key is the source website and "
        "the values are the top 3 latest news from that website."
        "\nThe contents of this dictionary are as follows; Please combine and summarize "
        "all the news avoiding duplicating the information. Highlight the top 12 latest news that "
        "have a high impact on cryptocurrencies price. "
        "When a news appears in multiple sources, identify all the website sources it appeared in and the date of the article "
        "for example: BTC experience a price surge because of drop in interest rate (Coindesk, CoinTelegrph Jan 26th) "
        "Return nothing when there is no content\n"
    )
    # One entry per line, so the end of one summary cannot run into the next source
    entries = "\n".join(
        f"source={source}, news = {news}" for source, news in all_news.items()
    )
    return instructions + entries


def messages(all_news):
    """Return the two-message chat payload (system, then user) for the digest."""
    return [
        {"role": "system", "content": summary_system_prompt},
        {"role": "user", "content": user_prompt(all_news)},
    ]

### Summarize all highlighted news from all the websites

In [None]:
def final_summary(news):
    """Ask gpt-4o-mini for one combined digest, render it, and return it.

    Args:
        news: Dictionary of per-website summaries, passed to messages().

    Returns:
        The content of the first choice returned by the chat completion.
    """
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages(news),
    )
    digest = completion.choices[0].message.content
    # Show the digest as rendered markdown before handing it back
    display(Markdown(digest))
    return digest

In [None]:
# run final_summary() on the per-website summaries; the returned text is stored in 'email'
email = final_summary(news_stack)

### Email latest news

In [None]:
# show the summarized news as a raw (unrendered) string
email

In [None]:
# identify the email sender and receiver
sender = "chiamy694@gmail.com"
# read from the EMAIL_PASSWORD environment variable; os.getenv gives None when it is not set
password = os.getenv('EMAIL_PASSWORD')
receiver = "priscacare20@gmail.com"

In [None]:
# extract the subject of the email
# The first line of the model's reply is used as the subject. Its exact shape is
# not guaranteed (e.g. "Subject: ...", "**Subject:** ...", "# ..."), so strip
# markdown markers and an optional "Subject:" label instead of blindly cutting
# off the first 9 characters.
subject = email.split('\n')[0].strip('*# \t\r')
if subject.lower().startswith('subject:'):
    subject = subject[len('subject:'):].strip('* \t')
# Fall back to a fixed subject when the first line is empty
subject = subject or "Latest crypto news"

In [None]:
# extract the body of the email: everything after the first line break
body = email.partition('\n')[2]

In [None]:
# preview the email body as rendered markdown
Markdown(body)

In [None]:
# sending the email using emailmessage package
if not password:
    # os.getenv gives None when EMAIL_PASSWORD is unset; stop here with a clear
    # message instead of failing inside smtp.login
    raise RuntimeError("EMAIL_PASSWORD is not set - add it to the .env file before sending the email")

email_message = EmailMessage()
email_message["From"] = sender
email_message["To"] = receiver
email_message["Subject"] = subject
email_message.set_content(body)

# to add a layer of security
context = ssl.create_default_context()
# send email with smtp
with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as smtp:
    smtp.login(sender, password)
    # send_message takes the EmailMessage object directly and serializes it
    # itself, so no manual as_string() conversion is needed
    smtp.send_message(email_message, from_addr=sender, to_addrs=receiver)

### Convert notebook to pdf and script

In [None]:
!jupyter nbconvert --to script Day1_challenge.ipynb

In [None]:
import schedule
import time

def job():
    """Run the whole pipeline once: summarize every site, build the digest, email it.

    Relies on names defined earlier in this file: website_list, chrome_path,
    new_summary, final_summary, sender, password and receiver. The previous
    body called helpers (crypto_sites, scrape_crypto_news, summarize_article,
    send_email) that are not defined in this file.
    """
    # Summarize each website; a site that fails keeps an empty placeholder
    news = {}
    for site in website_list:
        try:
            news[site] = new_summary(site, chrome_path)
        except Exception as error:
            print(f"Unable to access {site}: {error}")
            news[site] = []

    # Combine the per-website summaries into one digest
    digest = final_summary(news)

    # First line becomes the subject (markdown markers and an optional
    # "Subject:" label removed); the remainder is the body
    first_line, _, digest_body = digest.partition('\n')
    digest_subject = first_line.strip('*# \t\r')
    if digest_subject.lower().startswith('subject:'):
        digest_subject = digest_subject[len('subject:'):].strip('* \t')

    message = EmailMessage()
    message["From"] = sender
    message["To"] = receiver
    message["Subject"] = digest_subject or "Latest crypto news"
    message.set_content(digest_body)

    # Send email
    context = ssl.create_default_context()
    with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as smtp:
        smtp.login(sender, password)
        smtp.send_message(message, from_addr=sender, to_addrs=receiver)

# Register job() with the scheduler to run every day at "08:00"
schedule.every().day.at("08:00").do(job)

# Keep the process alive: once per second, run whatever jobs are due.
# There is no exit condition, so this runs until the process is interrupted.
while True:
    schedule.run_pending()
    time.sleep(1)