### JCU MA5851
<p style="line-height: 1.5; font-size:14pt">
    Student: Sacha Schwab <br>
    Location: Zurich, Switzerland
</p>


# Assessment 3 - Code for Part Two (WebCrawling)

In [13]:
from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
import numpy as np
from datetime import date

In [14]:
# Statics
# Directory prefix (relative to the notebook) used for the data files
dir_path = 'data/'
# File name of the raw articles data; combined with dir_path to build the path
raw_file_name = 'raw_data.csv'
# Yahoo Finance cryptocurrencies page requested by get_yahoo_crypto_news_only_url
yahoo_url = "https://finance.yahoo.com/cryptocurrencies/"

In [26]:
def yahoo_crypto_crawler_pipeline(file_path):
    ''' Run the crawling pipeline on a raw data file
        Input: Path to the raw data csv file holding the article urls
        Output: Dataframe with the crawled article data; the same data is
                written back to file_path (without the index column)
    '''
    # NOTE: Collecting new urls is currently disabled. To re-enable it, call
    # get_yahoo_crypto_news_only_url(file_path) and save its result to
    # file_path before crawling.
    # Get the content
    df = crawl_new_articles(file_path)
    # Write back to the file that was read. Writing to a fixed path instead
    # would overwrite the main raw data file whenever the pipeline is run on
    # another file (e.g. mock data).
    df.to_csv(file_path, index = False)
    return(df)

In [27]:
# Run the pipeline on the raw data file and keep the returned dataframe.
# NOTE: the functions the pipeline relies on are defined in cells further
# down in this notebook, so those cells must be executed before this one.
df = yahoo_crypto_crawler_pipeline(dir_path + raw_file_name)

Opening raw data file
Now crawling: https://finance.yahoo.com/news/nfts-go-mainstream-at-art-basel-153543008.html
Getting url: https://finance.yahoo.com/news/nfts-go-mainstream-at-art-basel-153543008.html
Now crawling: https://finance.yahoo.com/news/hodl-wave-show-coin-maturation-144137273.html
Getting url: https://finance.yahoo.com/news/hodl-wave-show-coin-maturation-144137273.html
Now crawling: https://finance.yahoo.com/news/hodl-wave-show-coin-maturation-144137273.html
Getting url: https://finance.yahoo.com/news/hodl-wave-show-coin-maturation-144137273.html
Now crawling: https://finance.yahoo.com/news/cook-finance-launches-defi-index-143137080.html
Getting url: https://finance.yahoo.com/news/cook-finance-launches-defi-index-143137080.html
Now crawling: https://finance.yahoo.com/news/grayscale-launches-trust-dedicated-solana-140055819.html
Getting url: https://finance.yahoo.com/news/grayscale-launches-trust-dedicated-solana-140055819.html
Now crawling: https://finance.yahoo.com/news/

In [15]:
def get_page_content(url, timeout=30):
    ''' Request a webpage and return the status code together with the response
        Input: the url to be crawled
               timeout: seconds to wait for the server before giving up
        Output: The status code of the request and the response object
                (the page html is available via its .content attribute)
        Raises: Exceptions raised by requests (e.g. on timeout or connection
                failure) are not caught here
        Prints: The url loaded at the moment, for monitoring purpose
    '''
    print("Getting url: " + url)
    # Without a timeout, requests waits indefinitely on an unresponsive server
    page = requests.get(url, timeout=timeout)
    return(page.status_code, page)

def get_soup(page):
    ''' Parse the html of a response into a BeautifulSoup soup
        Input: The response whose .content holds the page html
        Output: The soup
    '''
    html = page.content
    return BeautifulSoup(html, 'html.parser')

In [16]:
def get_title(soup):
    ''' Read the headline of a Yahoo article page
        Input: Soup
        Output: The title text with surrounding whitespace removed,
                or '' if the page has no title header
    '''
    header = soup.find('header', class_='caas-title-wrapper')
    # No title header on the page: report an empty title
    if not header:
        return ''
    return header.text.strip()

def get_date_time(soup):
    ''' Read the date stamp of a Yahoo article page
        Input: Soup
        Output: The text of the time element up to the first "·",
                or '' if the page has no time element
    '''
    time_div = soup.find('div', class_='caas-attr-time-style')
    # No time element on the page: report an empty date
    if not time_div:
        return ''
    # Keep only what precedes the first "·" separator
    stamp, _, _ = time_div.text.partition("·")
    return stamp

def get_text(soup):
    ''' Extract the body articles text
        Input: Soup
        Output: The article body text, or '' if the page has no article body
    '''
    # Extract the article text
    body = soup.find('div', class_='caas-body')
    # Pages without an article body would otherwise raise an AttributeError
    # and abort the whole crawl; an empty text lets the caller skip the page.
    if body is None:
        return('')
    return(body.text)

In [17]:
def get_yahoo_crypto_news_only_url(file_path):
    ''' Extract the urls currently featured on Yahoo cryptocurrency news
        Input: Path to the raw data csv file (needs a 'url' column)
        Output: Dataframe with the rows of the file plus one row per url
                that is not yet in it (title set to ''). The dataframe is
                returned only; saving it is left to the caller.
        Prints: The url requested and each url added
    '''
    # Get the soup of the overview page
    df = pd.read_csv(file_path)
    _, page = get_page_content(yahoo_url)
    soup = get_soup(page)
    # Compare against the url VALUES: `url in df['url']` tests the index
    # labels of the Series and therefore never finds an existing url.
    known_urls = set(df['url'])
    new_rows = []
    # Loop through html items and extract the data
    for title_tag in soup.find_all("a", class_="js-content-viewer", href=True):
        href = title_tag['href']
        # Only relative links need the site prefix
        url = href if href.startswith('http') else 'https://yahoo.com' + href
        # Proceed only if the url does not yet exist (in the file or on this page)
        if url in known_urls:
            continue
        known_urls.add(url)
        new_rows.append({'url': url, 'title': ''})
        print('Added: ' + url)
    # Append all new rows at once (DataFrame.append was removed in pandas 2.0)
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return(df)

In [18]:
# TEST the cell above (uncomment lines here)
# df = get_yahoo_crypto_news_only_url(dir_path + raw_file_name)
# df.tail()
# df.to_csv(dir_path + 'mock_data.csv')
# df = pd.read_csv(dir_path + 'mock_data.csv')
# df.tail()

In [19]:
def crawl_new_articles(file_path):
    ''' Crawl Yahoo articles newly obtained
        Input: Path to the file containing the new urls
        Output: Dataframe with title, text and date_time filled in for each
                newly crawled url. Rows whose page did not answer with
                status 200, or had no title or no body text, are removed.
                Rows whose request failed with a network error are kept
                un-crawled so that a later run can retry them.
        Prints: The url crawled at the moment
    '''
    # Read the raw articles data
    print('Opening raw data file')
    df = pd.read_csv(file_path)
    # Backup just in case (no index column, so the backup has the same
    # layout as the raw data file)
    df.to_csv(dir_path + 'raw_data_backup' + str(date.today()) + '.csv', index = False)
    # GOVERNANCE: Clean backups from time to time

    # Erase NaNs
    df = df.fillna('')
    # Filter the urls that have not yet been crawled
    df_todo = df[df['text'] == '']

    # Index labels of the rows to remove; dropped in one go after the loop
    to_drop = []
    # Loop through urls to crawl and get the data
    for index, row in df_todo.iterrows():
        # Print 'status'
        print('Now crawling: ' + row['url'])
        try:
            response_code, page = get_page_content(row['url'])
        except requests.RequestException as error:
            # One unreachable url must not abort the crawl and lose the
            # articles already fetched
            print('Request failed, keeping row for a later run: ' + str(error))
            continue
        if response_code != 200:
            print('dropping')
            to_drop.append(index)
            continue
        # Get the soup
        soup = get_soup(page)
        title = get_title(soup)
        if len(title) == 0:
            to_drop.append(index)
            continue
        df.loc[index, 'title'] = title
        text = get_text(soup)
        if len(text) == 0:
            print('dropping row')
            to_drop.append(index)
            continue
        df.loc[index, 'text'] = text
        df.loc[index, 'date_time'] = get_date_time(soup)
    return df.drop(index = to_drop)

In [20]:
# TEST the cell above (uncomment lines here)
# df = crawl_new_articles(dir_path + 'mock_data.csv')
# df.to_csv(dir_path + 'mock_data.csv')

In [21]:
def get_yahoo_crypto_news(max_items=1):
    ''' Crawl the Yahoo crypto topic page and the articles it links to
        Input: max_items: maximum number of stream items to process
               (default 1, i.e. only the first item; None for all items)
        Output: List with one dict per processed item, holding 'title',
                'url' and 'text' where they could be extracted
        Prints: Each sample extracted, or 'None' if no items were found
    '''
    # Yahoo url
    yahoo_url = "https://finance.yahoo.com/topic/crypto/"
    # The class for the titles we are interested in
    title_class = 'mega-item-header-link'
    # Yahoo url prefix (no trailing slash; it is added when joining)
    prefix = 'https://finance.yahoo.com'

    samples = []
    # get_page_content returns (status code, response); the soup has to be
    # built from the response
    status, page = get_page_content(yahoo_url)
    if status != 200:
        print('None')
        return samples
    soup = get_soup(page)

    # Loop through html items and extract the data
    article_items = soup.find_all('li', class_='js-stream-content Pos(r)')
    if not article_items:
        print('None')
        return samples
    for item in article_items:
        if max_items is not None and len(samples) >= max_items:
            break
        sample = {}
        item_title = item.find("a", class_=title_class)
        if item_title:
            sample['title'] = item_title.text.strip()
        link = item.find("a", class_="js-content-viewer", href=True)
        if link:
            href = link['href']
            # Only relative links need the prefix; avoid a double slash
            url = href if href.startswith('http') else prefix + '/' + href.lstrip('/')
            sample['url'] = url
            # Get the full article text from url
            art_status, art_page = get_page_content(url)
            art_text = ''
            if art_status == 200:
                art_soup = get_soup(art_page)
                art_text = ''.join(p.text for p in art_soup.find_all('p'))
            sample['text'] = art_text
        print(sample)
        samples.append(sample)
    return samples
        

## Cryptocurrencies list crawler (under construction)

In [159]:
# Yahoo cryptocurrencies list page; count=25 and offset=375 are sent as query
# parameters (presumably page size and start position - TODO confirm)
url = 'https://finance.yahoo.com/cryptocurrencies/?count=25&offset=375'

In [164]:
# Start from an empty placeholder (presumably to clear a soup left over from
# an earlier run of this cell)
soup = ''
# Request the list page; timeout=1 makes requests give up after one second
# without a response from the server
page = requests.get(url, timeout=1) 
# Parse the response into a soup
soup = get_soup(page)

In [165]:
# Print each element labelled "Symbol" followed by the text of its next sibling
symbol_cells = soup.find_all(attrs={"aria-label" : "Symbol"})
for symbol_cell in symbol_cells:
    print(symbol_cell.text)
    name_cell = symbol_cell.find_next_sibling()
    print(name_cell.text)

BTC-USD
Bitcoin USD
ETH-USD
Ethereum USD
BNB-USD
BinanceCoin USD
USDT-USD
Tether USD
SOL1-USD
Solana USD
ADA-USD
Cardano USD
USDC-USD
USDCoin USD
XRP-USD
XRP USD
HEX-USD
HEX USD
DOT1-USD
Polkadot USD
AVAX-USD
Avalanche USD
DOGE-USD
Dogecoin USD
LUNA1-USD
Terra USD
SHIB-USD
SHIBA INU USD
CRO-USD
CryptocomCoin USD
MATIC-USD
MaticNetwork USD
LTC-USD
Litecoin USD
UNI3-USD
Uniswap USD
ALGO-USD
Algorand USD
LINK-USD
Chainlink USD
BCH-USD
BitcoinCash USD
TRX-USD
TRON USD
AXS-USD
AxieInfinity USD
XLM-USD
Stellar USD
DAI1-USD
Dai USD
