# Yahoo news Data fetch and save

This notebook downloads all the data from the reuters about a specific topic and saves it to the database

In [1]:
import bs4
from bs4 import BeautifulSoup
import requests
import ray
from math import ceil
import time
from datetime import datetime
from http.client import HTTPSConnection
from urllib.parse import urljoin
import re
from time import sleep

In [2]:
num_cpus = 4

ray.init(ignore_reinit_error=True, num_cpus=num_cpus)


2024-06-15 17:44:16,047	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.18
Ray version:,2.9.3
Dashboard:,http://127.0.0.1:8265


In [3]:
import duckdb

class NewsMetaRepository:
    def __init__(self, csv_file=None):
        self.connection = duckdb.connect(database=':memory:', read_only=False)
        if csv_file is None:
            self.connection.execute("CREATE TABLE news_meta (id VARCHAR, title VARCHAR UNIQUE, url VARCHAR, timestamp VARCHAR UNIQUE,source VARCHAR)")
            self.connection.execute("CREATE SEQUENCE id_sequence START 1 INCREMENT BY 1;")
        else:
            self.connection.execute(f"IMPORT DATABASE '{csv_file}';")

    def insert(self, news_meta):
        self.connection.execute("PREPARE insert_meta AS "
                                "INSERT INTO news_meta VALUES (nextval('id_sequence'), ?, ?, ?, ?) ON CONFLICT DO NOTHING;")
        self.connection.execute(f"EXECUTE insert_meta('{news_meta['title']}', '{news_meta['url']}', '{news_meta['timestamp']}', '{news_meta['source']}');")

    def select_all(self):
        return self.connection.execute("SELECT * FROM news_meta").fetchdf()

    def select_by_id(self, id):
        return self.connection.execute("SELECT * FROM news_meta WHERE id = ?", id).fetchdf()

    def select_by_title(self, title):
        return self.connection.execute("SELECT * FROM news_meta WHERE title = ?", title).fetchdf()

    def select_by_url(self, url):
        return self.connection.execute("SELECT * FROM news_meta WHERE url = ?", url).fetchdf()

    def select_by_date(self, date_from, date_to):
        return self.connection.execute(f"SELECT * FROM news_meta WHERE strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') BETWEEN strptime('{date_from}','%Y-%m-%d') AND strptime('{date_to}','%Y-%m-%d')").fetchdf()

    def delete_all(self):
        self.connection.execute("DELETE FROM news_meta")

    def export(self, csv_file):
        self.connection.execute(f"EXPORT DATABASE '{csv_file}';")

    def close(self):
        self.connection.close()
        


In [4]:
def try_request(url, headers, params):

    # resp_proxy = requests.get('https://free-proxy-list.net/')
    # df = pd.read_html(resp_proxy.text)[0]
    # df_http = df[df['Https']=='no']
    # df_https = df[df['Https']=='yes']
    # proxy_http = f'https://{df_http["IP Address"].values[0]}:{df_http["Port"].values[0]}'
    # proxy_https = f'https://{df_https["IP Address"].values[0]}:{df_https["Port"].values[0]}'

    proxies={
        "http": 'socks5://193.35.18.30:30808',
        "https": 'socks5://193.35.18.30:30808'
    }

    response_page = requests.request("GET", url, headers=headers, params=params, timeout=5)
    try:
        response = response_page.text

        if response_page.status_code > 300:
            print("Status code error: " + str(response_page.status_code))
            return False, params

        return True, response
    except requests.exceptions.JSONDecodeError as e:
        print(f"Bad Request: GET {url} \n Status Code: {response_page.status_code} | Error : {e}")
        return False, params
    except requests.exceptions.Timeout:
        print("Timed out")
        return False, params
    except requests.exceptions.RequestException as e:
        print(f"Bad Request: GET {url} \n")
        return False, params

def get(host, url, payload, headers):
    connection = HTTPSConnection(host)
    connection.request('GET', url, payload, headers)

    response = connection.getresponse()
    location_header = response.getheader('location')

    if response.status < 400:
        if location_header is None:
            return 200, response.read()
        else:
            location = urljoin(url, location_header)
            return get(host, location, payload, headers)
    else: return response.status, None

In [11]:
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

def get_article(card):
    """Extract article information from the raw html"""
    headline = card.find('h4', 's-title').text
    source = card.find("span", 's-source').text
    posted = card.find('span', 's-time').text.replace('·', '').strip()
    description = card.find('p', 's-desc').text.strip()
    raw_link = card.find('a').get('href')
    unquoted_link = requests.utils.unquote(raw_link)
    pattern = re.compile(r'RU=(.+)\/RK')
    clean_link = re.search(pattern, unquoted_link).group(1)

    article = (headline, source, posted, description, clean_link)
    return article

def get_news_meta(search_term):
    template = 'https://news.search.yahoo.com/search?p={}'
    url = template.format(search_term)
    articles_titles = set()
    articles = []
    
    page = 0

    while True:
        page += 1
        print(f"Page {page}")
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'NewsArticle')

        # extract articles from page
        for card in cards:
            try:
                article_element: bs4.PageElement = card.find('li')
                article_title = ''
                if article_element.find('a', 'thmb ') is None:
                    article_title = article_element.find('h4').text
                else :
                    article_title = article_element.find('a')['title']
                                
                if article_title not in articles_titles:
                    articles_titles.add(article_title)
                    article = get_article(card)
                    news_meta = {
                        'title': article_title,
                        'source': article[1],
                        'timestamp': article[2],
                        'description': article[3],
                        'url': article_element.find('a')['href']
                    }
                    print(f"{article_title}: {news_meta['url']}")
                
            except Exception as e:
                print('Error at this:', card)
                break
        try:
            url = soup.find('a', 'next').get('href')
            sleep(1)
        except AttributeError:
            break

articles = get_news_meta('Tesla')

Page 1
Tesla Cybertruck Deliveries Reportedly Halted Due To Safety Issue: https://r.search.yahoo.com/_ylt=AwrEbpVoum1mJX0Q2wHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzEEdnRpZAMEc2VjA3Ny/RV=2/RE=1718495976/RO=10/RU=https%3a%2f%2finsideevs.com%2fnews%2f723427%2ftesla-cybertruck-wiper-recall-june%2f/RK=2/RS=BJyCXyx3OprFYBiLsYu55fXrpWo-
How Elon Musk's $44.9B Tesla pay package compares with the most generous plans for other U.S. CEOs: https://r.search.yahoo.com/_ylt=AwrEbpVoum1mJX0Q3QHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzIEdnRpZAMEc2VjA3Ny/RV=2/RE=1718495976/RO=10/RU=https%3a%2f%2fwww.kiro7.com%2fnews%2fhow-elon-musks-449b%2fVXO6GRNLXYS4B4GKG2TBPEAMRM%2f/RK=2/RS=7Y3kQ7pi3_u69wwZEno1h6XQkI0-
Tesla, Inc. (NASDAQ:TSLA) Shares Sold by WNY Asset Management LLC: https://r.search.yahoo.com/_ylt=AwrEbpVoum1mJX0Q3gHQtDMD;_ylu=Y29sbwNiZjEEcG9zAzMEdnRpZAMEc2VjA3Ny/RV=2/RE=1718495976/RO=10/RU=https%3a%2f%2fwww.etfdailynews.com%2f2024%2f06%2f15%2ftesla-inc-nasdaqtsla-shares-sold-by-wny-asset-management-llc%2f/RK=2/RS=JdG2l

KeyboardInterrupt: 