In [None]:
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
from datetime import datetime
from io import StringIO
from random import choice
from time import sleep

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import requests
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import ProxyError, SSLError, ConnectTimeout
from tqdm.auto import tqdm

In [None]:
# Load the NASDAQ listing. thread_scrape reads the 'Name', 'Symbol' and
# 'Industry' columns of this frame by row label.
nasdaq_df = pd.read_csv('./data/nasdaq_list.csv')

In [None]:
def get_proxy_list(only_https = False):
    """Scrape free-proxy-list.net and return its proxies as ``"ip:port"`` strings.

    Args:
        only_https: When true, keep only the rows whose 'Https' column is
            ``"yes"``; otherwise return every listed proxy.

    Returns:
        A list of ``"ip:port"`` strings in the order they appear on the page.

    Raises:
        requests.exceptions.RequestException: The page could not be fetched
            (connection error, timeout, or non-2xx status).
        ValueError: The proxy table is missing from the page.
    """
    url = "https://free-proxy-list.net/"
    # Without a timeout a stalled connection would hang the notebook forever.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")

    html_table = soup.find('table', attrs={'class': 'table table-striped table-bordered'})
    if html_table is None:
        # str(None) would otherwise be handed to read_html and fail obscurely.
        raise ValueError(f"Proxy table not found at {url}")

    # read_html expects a file-like object; passing literal HTML is deprecated.
    df_table = pd.read_html(StringIO(str(html_table)))[0]

    if only_https:
        df_table = df_table[df_table['Https'] == "yes"]

    return [
        f"{ip}:{port}"
        for ip, port in zip(df_table['IP Address'], df_table['Port'])
    ]


# Fetched once when this cell runs (performs a network request). Only proxies
# marked as HTTPS-capable are kept; scrape_marketwatch works on a copy of this
# list when use_proxy is set.
PROXY_SERVER_LIST = get_proxy_list(only_https=True)


In [None]:
MAX_ATTEMPTS = 5  # maximum fetch-and-parse attempts per symbol


def _fetch_via_proxy(url, headers, symbol):
    """Return the first response obtained through a randomly chosen proxy, or None."""
    # Work on a copy so failed proxies are only discarded for this call.
    candidates = PROXY_SERVER_LIST[:]

    while candidates:
        proxy_server = choice(candidates)
        proxies = {"http": proxy_server, "https": proxy_server}

        try:
            # Returning here is what ends the loop on success.
            return requests.get(
                url,
                headers=headers,
                proxies=proxies,
                timeout=10,
            )
        except requests.exceptions.RequestException:
            # RequestException covers ProxyError, SSLError and ConnectTimeout
            # as well as read timeouts and dropped connections.
            candidates.remove(proxy_server)
            print(f'{symbol}: Retry on another proxy, {len(candidates)}')

    return None


def _parse_income_table(html):
    """Extract the income-statement table from page HTML; raise if it is absent."""
    # "lxml" is named explicitly so the parser does not depend on what bs4
    # happens to auto-detect in the environment.
    soup = BeautifulSoup(html, "lxml")
    financial_table = soup.find(
        "table", attrs={"class": "table table--overflow align--right"}
    )
    if financial_table is None:
        raise ValueError("income table not found in page")

    # read_html expects a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(financial_table)))[0]
    return df.drop(df.columns[[-1]], axis=1)  # drop 5-year trend column


def scrape_marketwatch(symbol: str, use_proxy = False):
    """Download the MarketWatch income statement for one ticker.

    Each attempt fetches the page again and tries to parse the financial
    table out of it, up to MAX_ATTEMPTS times.

    Args:
        symbol: Ticker symbol; it is lower-cased to build the URL.
        use_proxy: When true, route the request through a randomly chosen
            entry of PROXY_SERVER_LIST, dropping proxies that fail.

    Returns:
        The parsed table as a DataFrame with its last column removed, or
        None when the page could not be fetched or never yielded a table.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    URL = f"https://www.marketwatch.com/investing/stock/{symbol.lower()}/financials/income"

    for attempt in range(MAX_ATTEMPTS):
        if use_proxy:
            resp = _fetch_via_proxy(URL, headers, symbol)
            if resp is None:
                print(f'{symbol}: Connection failed. No working proxy left.')
                return None
        else:
            try:
                resp = requests.get(
                    URL,
                    headers=headers,
                    timeout=10,
                )
            except requests.exceptions.RequestException as e:
                print(f'{symbol}: Connection failed. {e}')
                return None

        if resp.status_code == 404:
            # A missing page will not appear on a retry.
            print(f'{symbol}: Page not found.')
            return None

        try:
            return _parse_income_table(resp.text)
        except Exception:
            # Best effort: any parse problem triggers a fresh fetch, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"{symbol}: Parsing failed. Retry {attempt + 1}")

    return None


In [None]:
def thread_scrape(start: int, end: int, use_proxy = False):
    """Scrape the rows ``start`` (inclusive) to ``end`` (exclusive) of nasdaq_df.

    Args:
        start: First row label of nasdaq_df to process.
        end: Row label at which to stop (not processed).
        use_proxy: Forwarded to scrape_marketwatch.

    Returns:
        One DataFrame holding every successfully scraped table, with
        'Name', 'Symbol' and 'Industry' prepended as outer index levels.
        An empty DataFrame is returned when nothing could be scraped.
    """
    frames = []
    for i in tqdm(range(start, end)):
        name = nasdaq_df.loc[i, 'Name']
        symbol = nasdaq_df.loc[i, 'Symbol']
        industry = nasdaq_df.loc[i, 'Industry']

        # read_csv yields NaN (a float) for blank cells; scrape_marketwatch
        # calls symbol.lower() and would crash on it.
        if not isinstance(symbol, str):
            continue

        df = scrape_marketwatch(symbol, use_proxy)
        if df is None:
            continue

        # add multiindex levels, innermost first so 'Name' ends up outermost
        for key, level in ((industry, 'Industry'), (symbol, 'Symbol'), (name, 'Name')):
            df = pd.concat([df], keys=[key], names=[level])

        frames.append(df)

    # Concatenate once instead of growing a frame inside the loop, which
    # copies all accumulated rows on every iteration.
    return pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Split nasdaq_df into fixed-size row ranges and scrape them in parallel.
CHUNK_SIZE = 500   # rows handled by one thread_scrape call
MAX_WORKERS = 8    # number of concurrent scraping threads

work_list = [
    (i, min(i + CHUNK_SIZE, len(nasdaq_df)))
    for i in range(0, len(nasdaq_df), CHUNK_SIZE)
]
print(work_list)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(thread_scrape, start, end) for start, end in work_list]

    # Collect in submission order: the chunks still run concurrently, but the
    # rows of the output no longer depend on which thread finishes first.
    results = [future.result() for future in futures]

# Concatenate once, skipping chunks that produced nothing.
non_empty = [result for result in results if not result.empty]
df = pd.concat(non_empty) if non_empty else pd.DataFrame()

df.to_csv('./data/nasdaq_full2.csv')
df

In [22]:
import re

def _conv_to_float(s):
    if s == '-' or not isinstance(s, str):
        return None

    if s[0] == '(' or s[-1] == ')':
        s = s.replace('(', '')
        s = s.replace(')', '')
    if s[-1] == '%':
        s = s.replace('%', '')
    if s[-1] in list('BMK'):
        powers = {'B': 10 ** 9, 'M': 10 ** 6, 'K': 10 ** 3, '': 1}
        m = re.search("([0-9\.]+)(M|B|K|)", s)
        if m:
            val, mag = m.group(1), m.group(2)
            return float(val) * powers[mag]
    try:
        result = float(s)
    except:
        result = None
    return result

In [25]:
# Turn the textual year columns of the raw scrape into floats and save the
# processed table.
# NOTE(review): this reads 'nasdaq_full_raw2.csv', while the scraping cell
# writes 'nasdaq_full2.csv' - confirm the file is renamed by hand, otherwise
# a fresh Run All will not pick up the new scrape.
df = pd.read_csv('./data/nasdaq_full_raw2.csv')

# Column labels '2016' .. '2022' (the upper bound of range is exclusive).
conv_list = list(map(str, range(2016, 2023)))
for col in conv_list:
    converted = df[col].apply(_conv_to_float)
    df[col] = converted
df.to_csv('./data/nasdaq_full_proc.csv')