In [None]:
import ast
import concurrent.futures
import json
import os
import time

import bs4 as bs
import matplotlib.pyplot as plt
import pandas as pd
import requests

# Section 1: Requests Library

### We use the Python 'requests' library to make calls to the internet

In [None]:
# makes a 'GET' request to Yahoo Finance; a successful call prints '<Response [200]>'
# the User-Agent header and the timeout match the later scraping functions in this
# notebook, so the request identifies itself the same way and cannot hang forever
response = requests.get(
    "https://finance.yahoo.com/quote/TSLA?p=TSLA",
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
)
print(response)

### Response header (metadata about response)

In [None]:
# the response headers behave like a dictionary of header name -> value
header = response.headers
# number every header and print it next to its value
for position, (name, value) in enumerate(header.items()):
    print(f"{position} {name} -- {value}")


### The '.text' attribute gives us the page source

In [None]:
# page source
page_text = response.text
# notice that requests returns the page as a 'str'
print(type(page_text))
print(page_text)

### The 'beautifulsoup' module allows us to search the webpage by tag/selector

In [None]:
# create BeautifulSoup object
# the parser is named explicitly: without it bs4 uses whichever parser happens to be
# installed (and warns), so the same page could be parsed differently on another machine
source = bs.BeautifulSoup(page_text, 'html.parser')
# notice that 'source' is a BeautifulSoup object
print(type(source))
print(source)

In [None]:
# finds all elements with the "a" tag
links = source.find_all('a')
for link in links:
    print(link)
    # note each element in the list is a BeautifulSoup 'Tag' object
    print(type(link))
    # .get() gives None for an anchor that has no 'href' attribute;
    # link['href'] would raise KeyError there and stop the loop
    print('--', link.get('href'))
    print("–"*100)

In [None]:
# finds all elements with the "a" tag, prints the visible text of each link
links = source.find_all('a')
for link in links:
    print(link.text)
    print("–"*90)

# Section 2: Get Company Data

In [None]:
# takes any ticker symbol and returns financial stats as a dict
def get_company_data(ticker: str) -> dict:
    '''
    Scrapes the Yahoo Finance "key statistics" page of one ticker.

    Parameters: A ticker symbol (str)
    Returns: A dict of financial data. It always holds "ticker" and "!status".
        "!status" is "good" when the statistics were read; in that case the dict
        also holds "!!company_name" (None if the name header is missing) and one
        entry per statistics row (first cell -> second cell). Otherwise "!status"
        says what went wrong (request failure, HTTP status code, or missing
        statistics section) and no statistics are included.
    '''
    # base url for yahoo financial stats
    url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'
    headers = {'User-Agent': "Mozilla/5.0"}

    # makes request; a timeout or connection error is reported through "!status"
    # like every other failure, so one bad ticker cannot abort a whole batch
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as error:
        return {"ticker": ticker, "!status": f'request failed: {type(error).__name__}'}

    # handles for bad url
    if response.status_code != 200:
        return {"ticker": ticker, "!status": f'code {response.status_code}'}

    # main bs page object (explicit parser, so the result does not depend on what is installed)
    source = bs.BeautifulSoup(response.text, 'html.parser')
    stats_section = source.find('section', {"data-test":"qsp-statistics"})

    # handles for invalid ticker symbol, ".find()" returns "None"
    if stats_section is None:
        return {"ticker": ticker, "!status": "data == None"}

    # finds the company name by id and h1 tag; either lookup may come back empty
    header_info = source.find('div', {'id':'quote-header-info'})
    name_tag = header_info.find('h1') if header_info is not None else None
    company_name = name_tag.text if name_tag is not None else None

    info_dict = {"ticker": ticker, "!status": "good", '!!company_name': company_name}
    for row in stats_section.find_all('tr'):
        cells = row.find_all('td')
        # a row without a label cell and a value cell carries no statistic
        if len(cells) < 2:
            continue
        info_dict[cells[0].text] = cells[1].text

    return info_dict
    

In [None]:
get_company_data.__annotations__

In [None]:
# scrape three sample tickers and index the resulting table by ticker symbol
ticker_list = ['tsla', 'goog', 'gs']
df = pd.DataFrame([get_company_data(symbol) for symbol in ticker_list]).set_index('ticker')

In [None]:
print(f"{len(df.index)} Rows, {len(df.columns)} Columns")
df

# Section 3: Get List of S&P 500 Tickers

In [None]:
# creates a list of S&P 500 tickers from Wikipedia
def get_tickers() -> list:
    '''
    Parameters: None
    Returns: A list of ticker symbols (str), one per data row of the Wikipedia
        "constituents" table, in table order
    Raises: requests.HTTPError if Wikipedia answers with an error status,
        ValueError if the page holds no table with id "constituents"
    '''
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    # fail here with the HTTP status instead of later while parsing an error page
    response.raise_for_status()

    # bs object, searchable by dom elements
    source = bs.BeautifulSoup(response.text, 'html.parser')
    main_table = source.find('table', {"id": "constituents"})
    if main_table is None:
        raise ValueError(f'no table with id "constituents" found at {url}')

    # rows are looked up in the <tbody> when there is one, else in the table itself
    table_body = main_table.find('tbody')
    if table_body is None:
        table_body = main_table

    company_list = []
    for row in table_body.find_all('tr'):
        # row contains list of td (table data) elements
        row_cells = row.find_all('td')
        # skips any row missing essential data (first and second column)
        if len(row_cells) <= 1:
            continue
        # first column in table is the ticker symbol
        company_list.append(row_cells[0].text.strip())

    return company_list
              

In [None]:
ticker_list = get_tickers()
print(f"{len(ticker_list)} tickers in list")
print(ticker_list[:5], ticker_list[-5:])

In [None]:
# sequential baseline: fetch the first 10 tickers one after another and time it
t0 = time.time()
company_data_list = []
for ticker in ticker_list[:10]:
    company_data_list += [get_company_data(ticker)]
t1 = time.time()

print(f"{t1-t0:.4} seconds")
print(f"{len(company_data_list)} tickers in list")


In [None]:
# one row per scraped company, indexed by its ticker symbol
df = pd.DataFrame(company_data_list).set_index('ticker')
print(f"{len(df.index)} Rows, {len(df.columns)} Columns")
df

# Section 4: Saving Data

In [None]:
print(f"company_data_list length: {len(company_data_list)}")

In [None]:
# current working directory
print(os.getcwd())

In [None]:
for index, f in enumerate(os.listdir()):
    print(index, f)

In [None]:
# save as csv
df.to_csv('financial_data.csv')

In [None]:
for index, f in enumerate(os.listdir()):
    print(index, f)

In [None]:
# creates a csv by hand: one header row, then one row per ticker
# commas inside a cell are replaced by spaces so they cannot be read as separators
csv_lines = []

# header row: index name first, then every column name
header_cells = [df.index.name] + list(df.columns)
csv_lines.append(",".join(str(cell).replace(",", " ") for cell in header_cells))

# body rows
# a ticker whose scrape failed has NaN in the statistics columns; NaN is a float and
# has no '.replace', so missing values are written as empty cells instead of crashing
for row_label, row in df.iterrows():
    cells = [row_label] + ["" if pd.isna(value) else value for value in row]
    csv_lines.append(",".join(str(cell).replace(",", " ") for cell in cells))

main_str = "\n".join(csv_lines) + "\n"

with open('made_csv2.csv', 'w') as file:
    file.write(main_str)

# Multithread Requests (Thread pool executor)

In [None]:
print(f"{len(ticker_list)} tickers in list")
print(ticker_list[:5], ticker_list[-5:])

In [None]:
# thread pool worker: fetches one ticker's data and adds it to the shared list
def thread_function(num: int, ticker_list: list=ticker_list, get_company_data:'function'=get_company_data) -> None:
    '''
    Parameters: num is the position in ticker_list of the ticker to fetch;
        ticker_list and get_company_data default to the notebook objects of
        those names as they are when this cell is run
    Returns: None, the fetched dict is appended to the global company_data_list
    '''
    result = get_company_data(ticker_list[num])
    company_data_list.append(result)
    

In [None]:
t0 = time.time()
company_data_list = []
# never ask for a position the ticker list does not have
indexes = range(min(30, len(ticker_list)))
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map() only re-raises a worker's exception when its results are read,
    # and they never were, so failed tickers disappeared without a trace.
    # submit() + as_completed() lets every failure be reported while the rest continue.
    futures = {executor.submit(thread_function, num): num for num in indexes}
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"ticker #{futures[future]} failed: {error!r}")
t1 = time.time()

print("{:.4} seconds".format(t1-t0))
print(f"{len(company_data_list)} tickers in list")

In [None]:
df = pd.DataFrame(company_data_list)
df = df.set_index('ticker')
print(f"{len(df.index)} Rows, {len(df.columns)} Columns")
df

In [None]:
# save as csv
df.to_csv('financial_data.csv')

# Visualize Data

In [None]:

# "Profit Margin " cells are text ending in '%'; strip the sign (and any thousands
# separator) and convert to float.
# cells that are not a number (NaN for a ticker whose scrape failed, or any other
# placeholder text) become NaN and are dropped, so one bad row no longer aborts the cell
profit_margin = pd.to_numeric(
    df["Profit Margin "].astype(str)
        .str.replace('%', '', regex=False)
        .str.replace(',', '', regex=False),
    errors='coerce',
).dropna()

# x-axis labels are the tickers (the index), bar heights the margins in percent
labels = profit_margin.index.tolist()
values = profit_margin.tolist()
    

In [None]:

# one wide figure with a single axes holding a bar per ticker
fig, ax = plt.subplots(figsize=(18,8))
ax.bar(labels, values)

# tilt the ticker labels so they do not overlap
plt.xticks(rotation=45)
ax.set_ylabel('Profit margin (%)')
ax.set_title('Profit margin by company')

ax.grid()

# NOTE: Following sections will not work with Anaconda alone

# Using An In Memory Data Base (Redis)

In [None]:
import redis

In [None]:
# must start redis server in terminal first
# connects to database 0 of the redis server on localhost
# NOTE(review): redis listens on port 6379 unless configured otherwise; 6377 only
# works if the server was started on that port — confirm this is intentional
r_db = redis.Redis(port=6377, db=0)

In [None]:
r_db.mset({"name": "Stefan"})

In [None]:
# key value from db
r_db.mget('name')[0].decode('UTF-8')

In [None]:
# clears the db — flushall() deletes the keys of EVERY database on the redis server,
# not only db 0
r_db.flushall()

In [None]:
print(f"{len(ticker_list)} tickers in list")
print(ticker_list[:5], ticker_list[-5:])

In [None]:
# thread pool worker: fetches one ticker's data and stores it in redis
def thread_map(num, ticker_list=ticker_list, get_company_data=get_company_data):
    '''
    Parameters: num is the position in ticker_list of the ticker to fetch;
        ticker_list and get_company_data default to the notebook objects of
        those names as they are when this cell is run
    Returns: None, the dict is written to redis under the ticker as its str() text
    '''
    ticker = ticker_list[num]
    serialized = str(get_company_data(ticker))
    r_db.mset({ticker: serialized})
    

In [None]:
t0 = time.time()
# never ask for a position the ticker list does not have
indexes = range(min(40, len(ticker_list)))
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map() only re-raises a worker's exception when its results are read,
    # and they never were, so failed tickers (or a redis error) went unnoticed.
    # submit() + as_completed() lets every failure be reported while the rest continue.
    futures = {executor.submit(thread_map, num): num for num in indexes}
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"ticker #{futures[future]} failed: {error!r}")

t1 = time.time()
print("{:.4} seconds".format(t1-t0))


In [None]:
# create dataframe from data in redis db
# thread_map stored every dict with str(), i.e. as a Python literal, not as JSON.
# ast.literal_eval reads that text back exactly; swapping ' for " and calling
# json.loads failed on any value that itself contains an apostrophe or a double quote
redis_rows = [ast.literal_eval(r_db.get(key).decode('UTF-8')) for key in r_db.keys()]
df = pd.DataFrame(redis_rows).set_index('ticker')

print(f"{len(df.index)} Rows, {len(df.columns)} Columns")
df

# Common issues

In [None]:
response = requests.get('https://www.wsj.com/news/archive/20021001', timeout=10)

In [None]:
# parse what requests received (explicit parser, so the result does not depend on
# which parsers are installed) and list every <article> element found in it
source = bs.BeautifulSoup(response.text, 'html.parser')
articles = source.find_all('article')
print(len(articles))
for article in articles:
    print(article.text)
    print('-'*30)

# Selenium (render full web page before extracting data)

In [None]:
from selenium import webdriver
# for setting request headers
from selenium.webdriver.firefox.options import Options
# selecting items from dropdown list
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import datetime

In [None]:
# define webdriver use firefox browser
options = Options()
# run without browser window
#options.add_argument('--headless')

driver = webdriver.Firefox(options=options)

In [None]:
driver.get('https://www.wsj.com/')

In [None]:
driver.get('https://www.wsj.com/news/archive/20040608')

# Get newspaper title and article summary from WSJ archives

In [None]:
# creates a list of dates for wsj archive url
def create_date(start_date: datetime.date = datetime.date(1996, 4, 6), end_date: datetime.date = None) -> list:
    '''
    Parameters:
        start_date: the day BEFORE the first date returned (default 1996-04-06,
            so the default list starts at 1996-04-07)
        end_date: the last date returned, inclusive (default: today)
    Returns: A list of 'YYYYMMDD' strings, one per day, oldest first;
        empty when end_date is not after start_date
    '''
    if end_date is None:
        end_date = datetime.date.today()

    # counting the days up front bounds the loop even when end_date <= start_date
    total_days = (end_date - start_date).days
    return [
        (start_date + datetime.timedelta(days=offset)).isoformat().replace('-', '')
        for offset in range(1, total_days + 1)
    ]


In [None]:
# list of possible dates for the url
dates = create_date()
print(f"{len(dates)} total dates")
print(dates[1:3],dates[-3:-1])


In [None]:
# current directory
print(os.listdir())

In [None]:
# makes directory for each days csvs
if not os.path.exists('wsj_csvs'):
    os.mkdir('wsj_csvs')
print(os.listdir())

In [None]:
# makes errors directory for the dates whose archive page failed to load
if not os.path.exists('errors'):
    os.mkdir('errors')
print(os.listdir())

In [None]:
r_db = redis.Redis(port=6377, db=1)

In [None]:
# takes a formatted date, appends to wsj archive url, returns df of days articles
def get_days_news(date, max_wait=10):
    '''
    Loads one day of the WSJ archive in the selenium browser and collects its articles.

    Parameters:
        date: the day as a 'YYYYMMDD' string (an element of create_date())
        max_wait: how many times the page is re-read, one second apart, while
            no article has shown up yet
    Returns: A DataFrame with the columns 'section', 'headline', 'summary' and
        'date' (one row per article), or None when no article appeared in time
    Side effects: uses the notebook's `driver` and `r_db`; on success writes
        str(list of article dicts) to redis under the date and saves
        wsj_csvs/<date>.csv; on failure writes errors/<date>.txt
    '''
    article_selector = "article[class*='WSJTheme--story']"
    # the three sections of each article row: column name -> css selector
    part_selectors = {'section': "div[class*='WSJTheme--flashline']",
                      'headline': "h3[class*='WSJTheme--headline']",
                      'summary': "p[class*='WSJTheme--summary']"}

    def read_articles():
        # parses whatever the browser currently shows and returns its article elements
        return bs.BeautifulSoup(driver.page_source, 'html.parser').select(article_selector)

    driver.get(f'https://www.wsj.com/news/archive/{date}')
    articles = read_articles()

    # the page may still be loading: re-read it once per second, up to max_wait times.
    # every re-read counts, including the last one (it used to be thrown away)
    for _ in range(max_wait):
        if articles:
            break
        time.sleep(1)
        articles = read_articles()

    # if page does not load, date is added to error file
    if not articles:
        error_dir = os.path.join(os.getcwd(), 'errors')
        os.makedirs(error_dir, exist_ok=True)
        with open(os.path.join(error_dir, f"{date}.txt"), 'w') as f:
            f.write(date)
        return None

    # NOTE(review): the page is not read again after this pause, so it presumably
    # only spaces out successive page loads — confirm
    time.sleep(1)

    dict_list = []
    for article in articles:
        days_articles = {}
        for column, selector in part_selectors.items():
            matches = article.select(selector)
            # a section missing from this article is stored as the text 'None'
            days_articles[column] = matches[0].text if matches else 'None'

        # for date columns
        days_articles['date'] = date
        dict_list.append(days_articles)

    # writes to redis db
    r_db.mset({str(date): str(dict_list)})

    # creates pandas df from list of article dicts
    df = pd.DataFrame(dict_list)

    # add to csv
    csv_dir = os.path.join(os.getcwd(), 'wsj_csvs')
    os.makedirs(csv_dir, exist_ok=True)
    df.to_csv(os.path.join(csv_dir, f"{date}.csv"))

    return df
 

In [None]:
# gets archive at specific day
df = get_days_news(dates[1005])

In [None]:
df

In [None]:
for date in dates[4000:4005]:
    get_days_news(date)
    

In [None]:
pd.set_option('max_colwidth', 70)
df

In [None]:
# closes the browser and ends the webdriver session
# NOTE(review): the execute_script cells further down need a live session, yet on a
# top-to-bottom run they execute after this quit() — presumably this cell is meant
# to be run last; confirm
driver.quit()

In [None]:
driver.execute_script('alert("warning")')

In [None]:
driver.execute_script('console.log("selenium")')

In [None]:
# execute_script() only hands a value back to Python when the script 'return's it;
# the previous script just declared a local variable, so 'divs' was always None
divs = driver.execute_script('return document.querySelectorAll("div")')