# BBC news Data fetch and save

This notebook downloads all the data from the reuters about a specific topic and saves it to the database

In [24]:
import bs4
from bs4 import BeautifulSoup
import requests
# import ray
from math import ceil
import time
from datetime import datetime
from http.client import HTTPSConnection
from urllib.parse import urljoin, unquote
import re
from time import sleep
import datetime
import asyncio
import aiohttp
import os.path
from dateutil import parser

In [4]:
import duckdb

class NewsMetaRepository:
    def __init__(self, csv_file=None):
        self.connection = duckdb.connect(database=':memory:', read_only=False)
        if os.path.isfile(f'{csv_file}/news_meta.csv'):
            try:
                self.connection.execute(f"IMPORT DATABASE '{csv_file}';")
            except Exception as e:
                self._create_table()
                self.connection.execute(f"EXPORT DATABASE '{csv_file}';")
        else:
            self._create_table()

    def _create_table(self):
        self.connection.execute("CREATE TABLE news_meta (id VARCHAR, title VARCHAR UNIQUE, url VARCHAR, timestamp VARCHAR UNIQUE,term VARCHAR)")
        self.connection.execute("CREATE SEQUENCE id_sequence START 1 INCREMENT BY 1;")

    def insert(self, news_meta):
        self.connection.execute("PREPARE insert_meta AS "
                                "INSERT INTO news_meta VALUES (nextval('id_sequence'), ?, ?, ?, ?) ON CONFLICT DO NOTHING;")
        self.connection.execute(f"EXECUTE insert_meta('{news_meta['title']}', '{news_meta['url']}', '{news_meta['timestamp']}', '{news_meta['term']}');")
        
    def update_timestamp(self, news_meta_id, date):
        self.connection.execute("PREPARE update_timestamp AS "
                                "UPDATE news_meta SET timestamp = ? WHERE id = ?")
        self.connection.execute(f"EXECUTE update_timestamp('{news_meta_id}', '{date}');")

    def select_all(self):
        return self.connection.execute("SELECT * FROM news_meta").fetchdf()

    def select_by_id(self, id):
        return self.connection.execute("SELECT * FROM news_meta WHERE id = ?", id).fetchdf()

    def select_by_title(self, title):
        return self.connection.execute("SELECT * FROM news_meta WHERE title = ?", title).fetchdf()

    def select_by_url(self, url):
        return self.connection.execute("SELECT * FROM news_meta WHERE url = ?", url).fetchdf()

    def select_by_term(self, source):
        self.connection.execute("PREPARE select_by_source AS "
                                "SELECT * FROM news_meta WHERE term = ?")
        return self.connection.execute(f"EXECUTE select_by_source('{source}');").fetchdf()

    def select_by_date(self, date_from, date_to):
        return self.connection.execute(f"SELECT * FROM news_meta WHERE strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z') BETWEEN strptime('{date_from}','%Y-%m-%d') AND strptime('{date_to}','%Y-%m-%d')").fetchdf()

    def delete_all(self):
        self.connection.execute("DELETE FROM news_meta")

    def export(self, csv_file):
        self.connection.execute(f"EXPORT DATABASE '{csv_file}';")

    def close(self):
        self.connection.close()
        


In [5]:
def try_request(url, headers, params, isJson=True):

    # resp_proxy = requests.get('https://free-proxy-list.net/')
    # df = pd.read_html(resp_proxy.text)[0]
    # df_http = df[df['Https']=='no']
    # df_https = df[df['Https']=='yes']
    # proxy_http = f'https://{df_http["IP Address"].values[0]}:{df_http["Port"].values[0]}'
    # proxy_https = f'https://{df_https["IP Address"].values[0]}:{df_https["Port"].values[0]}'

    proxies={
        "http": 'socks5://193.35.18.30:30808',
        "https": 'socks5://193.35.18.30:30808'
    }

    response_page = requests.request("GET", url, headers=headers, params=params, timeout=5)
    try:
        response = response_page.json() if isJson else response_page.text

        if response_page.status_code > 300:
            print("Status code error: " + str(response_page.status_code))
            return False, params

        return True, response
    except requests.exceptions.JSONDecodeError as e:
        print(f"Bad Request: GET {url} \n Status Code: {response_page.status_code} | Error : {e}")
        return False, params
    except requests.exceptions.Timeout:
        print("Timed out")
        return False, params
    except requests.exceptions.RequestException as e:
        print(f"Bad Request: GET {url} \n")
        return False, params

In [66]:
newsMetaRepo = NewsMetaRepository("meta/bbc")

def create_request_url(term, page, page_size):
    template = 'https://web-cdn.api.bbci.co.uk/xd/search?terms={}?page={}?pageSize={}'
    url = template.format(term, page, page_size)
    return url

bbc_url = 'https://web-cdn.api.bbci.co.uk/xd/search'

def get_news_meta(search_term):
    hasData = True
    page = 0
    
    while hasData:
        print(f'Loading page {page}')

        response_ok, response = try_request(
            url = bbc_url,
            headers={},
            params = {
                'terms': search_term,
                'page': page,
                'pageSize': 100
            }
        )
                
        if response_ok:            
            if len(response["data"]) > 0:
                for data in response["data"]:
                    news_meta = {
                        'title': data['title'].replace("'","''"),
                        'term': search_term,
                        'timestamp': data['firstPublishedAt'], 
                        'url': data['path']
                    }
                    print(news_meta)
                    # newsMetaRepo.insert(news_meta) 
            else: 
                print("Done loading.")
                hasData = False
        
        # newsMetaRepo.export('meta/bbc')
        page += 1
        hasData = False

get_news_meta("Tesla")

Loading page 0
{'title': 'Tesla faces recall over alerts for unlatched bonnet', 'term': 'Tesla', 'timestamp': '2024-07-30T16:22:35.884Z', 'url': '/news/articles/cv2gyzn7jwro'}
{'title': 'Uber strikes EV deal with Chinese Tesla rival BYD', 'term': 'Tesla', 'timestamp': '2024-08-01T04:30:56.741Z', 'url': '/news/articles/cley0xn8e88o'}
{'title': 'Musk says Tesla to use humanoid robots next year', 'term': 'Tesla', 'timestamp': '2024-07-23T04:01:04.916Z', 'url': '/news/articles/cz5reve8476o'}
{'title': "China firm claims world''s fastest-charging EV battery", 'term': 'Tesla', 'timestamp': '2024-08-14T04:36:22.512Z', 'url': '/news/articles/cgl20971wxpo'}
{'title': 'Mosques raise £100,000 for hospital scanner', 'term': 'Tesla', 'timestamp': '2024-08-07T12:09:11.686Z', 'url': '/news/articles/cgm7g1kkwjpo'}
{'title': 'Watch: Cars crash off road in near-miss with cafe', 'term': 'Tesla', 'timestamp': '2024-08-20T14:04:50.427Z', 'url': '/news/videos/c5y8k19xl9qo'}
{'title': "Dashcam captures ''exc

In [61]:
async def download_article(news_meta_url, news_meta_id, news_meta_filename):
    file_name = f'articles/bbc/{news_meta_filename}.txt'
    #if not os.path.isfile(file_name):
    async with aiohttp.ClientSession() as session:
        print(f'Downloading {news_meta_id}, https://bbc.com{news_meta_url}')

        async with session.get(f'https://bbc.com{news_meta_url}') as response: 
            soup = BeautifulSoup(await response.text(), 'html.parser')
    
            complete_text = ''
            
            date = ""
    
            if soup.find('article') is not None:
                text_blocks = soup.find('article').find_all('div', {'data-component':"text-block"})
                try:
                    headline = soup.find('article').find_all('div', {'data-component':"byline-block"})
                    # date = headline.find_all('time').text
                    date_tag = headline[0]
                    
                    print(date_tag)
                    print("This is the article: " + news_meta_url)
                except Exception as e:
                    print(e)
                    print("This is the article: " + news_meta_url)
                    
                    
                for text_block in text_blocks:
                    text_paragraphs = text_block.find_all('p')
                    complete_text += ' '.join([paragraph.text for paragraph in text_paragraphs])
            else:
                video_type = soup.find('div',{'data-testid':'video-page-video-section'})
                date = video_type.find('span').text
                if video_type is not None:
                    paragraphs = video_type.find_all('p')
                    complete_text += ' '.join([paragraph.text for paragraph in paragraphs])
            
            if date != "":
                update_article_date(news_meta_id, date)
            
            with open(f'articles/bbc/{news_meta_filename}.txt', 'w') as f:
                f.write(complete_text)

def update_article_date(news_meta_id, date):
    # Input date string
    date_str = date
    
    # Parse the date string into a datetime object
    date_obj = datetime.strptime(date_str, "%d %b %Y")
    
    # Convert the datetime object to the desired format
    formatted_date = date_obj.strftime("%Y-%m-%dT%H:%M:%S%z")
    
    print(formatted_date)
    
    # newsMetaRepo.update_timestamp(news_meta_id, formatted_date)
    pass

async def get_news_articles(source):
    news_meta_list = newsMetaRepo.select_by_term(source)
    # for i in range(0, 200): #len(news_meta_list)
    #     print(f'Downloading {news_meta_list["id"][i]} , https://bbc.com{news_meta_list["url"][i]}')
    #     download_article(news_meta_list['url'][i],news_meta_list['id'][i])
    #     sleep(0.35)

    def get_file_name(id):
        title = news_meta_list['title'][id].replace(' ','-').replace("/","-")
        return f"{title}-{parser.parse(news_meta_list['timestamp'][id]).timestamp()}"

    for i in range(0, len(news_meta_list))[::10]:
        print(i)
        try:
            task_list = [
                asyncio.create_task(
                    download_article(
                        news_meta_list['url'][i+idx],
                        news_meta_list['id'][i+idx],
                        get_file_name(i+idx)
                    )
                ) for idx in range(0,10 if len(news_meta_list) > i + 10 else len(news_meta_list) % 10)
            ]
            await asyncio.gather(*task_list)
        except Exception as e:
            print(f"ERROR DOWNLOADING ... RETRYING...")
            failed_tasks = [
                asyncio.create_task(
                    download_article(
                        news_meta_list['url'][i+idx],
                        news_meta_list['id'][i+idx],
                        get_file_name(i+idx)
                    )
                ) for idx in range(0,10 if len(news_meta_list) > i + 10 else len(news_meta_list) % 10)
            ]
            await asyncio.gather(*failed_tasks)

        if i % 50 == 0:
            sleep(3)

        sleep(0.300)
    # 
    # if len(failed_tasks) > 0:
    #     print(f'\n\nLength of the failed task:  {len(failed_tasks)}')
    #     # load failed tasks
    #     for i in range(0,len(failed_tasks))[::10]:
    #         print("Loading")
    #         await asyncio.gather(*(failed_tasks[i:i+10]))
    #         sleep(2)


In [62]:
company = "Tesla"

async def get_news(search_term):
    # get_news_meta(search_term)
    await get_news_articles(company)
    
await get_news(company)

0
Downloading 1, https://bbc.com/news/articles/cleezyxjv4jo
Downloading 2, https://bbc.com/news/articles/crggk5rvqd1o
Downloading 3, https://bbc.com/news/articles/c9rre6nwnpwo
Downloading 4, https://bbc.com/news/articles/cw008xgn49po
Downloading 5, https://bbc.com/news/articles/ckr5rmg0d58o
Downloading 6, https://bbc.com/news/articles/cpww6w412n7o
Downloading 7, https://bbc.com/news/articles/cp00jj7ze3qo
Downloading 8, https://bbc.com/news/articles/c888kmxez8xo
Downloading 9, https://bbc.com/news/articles/cv2j0v237vko
Downloading 10, https://bbc.com/news/world-europe-68993225
<div class="sc-18fde0d6-0 dlWCEZ" data-component="byline-block"><!--$!--><!--/$--></div>
This is the article: /news/articles/ckr5rmg0d58o
<div class="sc-18fde0d6-0 dlWCEZ" data-component="byline-block"><!--$!--><!--/$--></div>
This is the article: /news/articles/crggk5rvqd1o
<div class="sc-18fde0d6-0 dlWCEZ" data-component="byline-block"><!--$!--><!--/$--></div>
This is the article: /news/articles/cp00jj7ze3qo
ERR

AttributeError: module 'datetime' has no attribute 'strptime'