In [6]:
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm
from datetime import datetime
from random import choice

from bs4 import BeautifulSoup
import requests
from lxml import etree
from time import sleep

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

In [3]:
# Load the NASDAQ listing; thread_scrape reads its 'Name', 'Symbol' and
# 'Industry' columns by row label.
nasdaq_df = pd.read_csv('./data/nasdaq_list.csv')

In [None]:
def get_proxies():
    """Scrape free-proxy-list.net and return the HTTPS-capable proxies.

    Returns:
        list[str]: ``"ip:port"`` strings, one per table row whose seventh
        cell reads ``"yes"``. The list is empty when no row matches.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request exceeds the timeout.
    """
    url = "https://free-proxy-list.net/"
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    resp = requests.get(url, timeout=10)
    soup = BeautifulSoup(resp.content, "html.parser")
    dom = etree.HTML(str(soup))

    def first_text(row, cell_index):
        # lxml's xpath() returns a plain Python list of matches; it has no
        # Scrapy-style .extract_first(), so take the first item by hand.
        texts = row.xpath(f"td[{cell_index}]/text()")
        return texts[0].strip() if texts else None

    proxy_server_list = []
    for tr in dom.xpath('//*[@id="proxylisttable"]/tbody/tr'):
        ip = first_text(tr, 1)
        port = first_text(tr, 2)
        https = first_text(tr, 7)

        # Skip rows with a missing address so no "None:None" entry is emitted.
        if ip and port and https == "yes":
            proxy_server_list.append(f"{ip}:{port}")

    return proxy_server_list


# Build the proxy pool once, when this cell runs; scrape_marketwatch picks a
# random entry from it for every request.
PROXY_SERVER_LIST = get_proxies()


In [4]:
def scrape_marketwatch(symbol: str):
    """Download and parse the MarketWatch income-statement table for a ticker.

    Args:
        symbol: Ticker symbol; it is lower-cased before being put in the URL.

    Returns:
        pandas.DataFrame | None: the first table parsed from the page's
        financial table with its last column (the 5-year trend) removed, or
        ``None`` when the request or the parsing fails. A short message is
        printed in either failure case.
    """
    from io import StringIO  # local import keeps this cell self-contained

    headers = {"User-Agent": "Mozilla/5.0"}

    # choice() raises IndexError on an empty pool; fall back to a direct
    # connection instead of crashing the whole worker.
    if PROXY_SERVER_LIST:
        proxy_server = choice(PROXY_SERVER_LIST)
        proxies = {"http": proxy_server, "https": proxy_server}
    else:
        proxies = None

    # `except Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate, so a long scrape can still be interrupted.
    try:
        # The URL is built inside the try so a non-string symbol (e.g. a NaN
        # cell) is reported as a failed request rather than raising.
        resp = requests.get(
            f"https://www.marketwatch.com/investing/stock/{symbol.lower()}/financials/income",
            headers=headers,
            proxies=proxies,
            timeout=5,
        )
        # Name the parser explicitly instead of letting BeautifulSoup guess.
        soup = BeautifulSoup(resp.text, "lxml")
    except Exception:
        print(symbol, "request failed.")
        return None

    try:
        financial_table = soup.find(
            "table", attrs={"class": "table table--overflow align--right"}
        )
        if financial_table is None:
            # Explicit check rather than relying on read_html("None") raising.
            print(symbol, "parsing failed.")
            return None
        # Passing literal HTML to read_html is deprecated in newer pandas;
        # a file-like wrapper works on old and new versions alike.
        df = pd.read_html(StringIO(str(financial_table)))[0]
        df = df.drop(df.columns[[-1]], axis=1)  # drop 5-year trend column
    except Exception:
        print(symbol, "parsing failed.")
        return None

    return df


In [5]:
def thread_scrape(start: int, end: int):
    """Scrape the financial tables for rows ``start`` .. ``end - 1`` of ``nasdaq_df``.

    Args:
        start: First row label of ``nasdaq_df`` to process (inclusive).
        end: Row label at which to stop (exclusive).

    Returns:
        pandas.DataFrame: all successfully scraped tables stacked vertically,
        each prefixed with ``Name`` / ``Symbol`` / ``Industry`` index levels.
        An empty DataFrame is returned when no symbol could be scraped.
    """
    frames = []
    for i in tqdm(range(start, end)):
        name = nasdaq_df.loc[i, 'Name']
        symbol = nasdaq_df.loc[i, 'Symbol']
        industry = nasdaq_df.loc[i, 'Industry']

        df = scrape_marketwatch(symbol)
        if df is None:
            continue

        # add multiindex levels (innermost first, so Name ends up outermost)
        df = pd.concat([df], keys=[industry], names=['Industry'])
        df = pd.concat([df], keys=[symbol], names=['Symbol'])
        df = pd.concat([df], keys=[name], names=['Name'])

        frames.append(df)

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # every previously collected row on each iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [1]:
# Tunables for the parallel scrape: rows handled per task and thread count.
CHUNK_SIZE = 500
MAX_WORKERS = 8

# Half-open (start, end) row ranges covering every row of nasdaq_df.
work_list = [
    (start, min(start + CHUNK_SIZE, len(nasdaq_df)))
    for start in range(0, len(nasdaq_df), CHUNK_SIZE)
]
print(work_list)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(thread_scrape, start, end) for start, end in work_list]

    # Gather the chunks as they finish and concatenate once afterwards,
    # instead of re-copying the accumulated frame for every chunk.
    results = []
    for future in as_completed(futures):
        result = future.result()
        # One-line progress note rather than dumping the whole chunk.
        print(f"chunk finished: {len(result)} rows")
        results.append(result)

# pd.concat raises on an empty list, which happens when nasdaq_df has no rows.
df = pd.concat(results) if results else pd.DataFrame()

df.to_csv('./data/nasdaq_full2.csv')
df

NameError: name 'nasdaq_df' is not defined

In [None]:
import re

def _conv_to_float(s):
    if s == '-':
        return None

    if s[-1] == '%':
        s = s.replace('%', '')
    if s[-1] in list('BMK'):
        powers = {'B': 10 ** 9, 'M': 10 ** 6, 'K': 10 ** 3, '': 1}
        m = re.search("([0-9\.]+)(M|B|K|)", s)
        if m:
            val, mag = m.group(1), m.group(2)
            return float(val) * powers[mag]
    try:
        result = float(s)
    except:
        result = None
    return result

In [None]:
# Read the exported market-cap table and discard its first column, which is
# selected by position rather than by name.
df = pd.read_csv('./data/nasdaq_marketcap_full.csv')
df = df.drop(columns=df.columns[[0]])

# Columns holding formatted text that has to be turned into floats.
conv_list = [
    'MarketCap',
    'Income',
    'Sales',
    'GrossMargin',
    'OperatingMargin',
    'ProfitMargin',
]
for col in conv_list:
    converted = df[col].apply(_conv_to_float)
    df[col] = converted

# Keep only the rows whose market cap is present after conversion.
nasdaq_df_proc = df.dropna(subset=['MarketCap'])

nasdaq_df_proc