In [29]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm

from bs4 import BeautifulSoup
import requests
from lxml import etree
from time import sleep

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

In [30]:
nasdaq_df = pd.read_csv('./data/nasdaq_list.csv')

In [47]:
def scrape_cnbc(symbol: str):
    MAX_ATTEMPT = 5
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
                (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
        # "Accept-Language": "en-US, en;q=0.5",
    }
    # URL = f"https://www.cnbc.com/quotes/{symbol}?tab=financials"
    URL = f"https://www.cnbc.com/quotes/{symbol}"

    for attempt in range(MAX_ATTEMPT):
        try:
            resp = requests.get(url=URL, headers=HEADERS, timeout=15)
            soup = BeautifulSoup(resp.content, "html.parser")
            dom = etree.HTML(str(soup))
        except:
            print(symbol, 'soup failed. Retry', attempt)
            continue
        else:
            try:
                market_cap = soup.select_one(
                    "#MainContentContainer > div > div.QuotePageBuilder-row > div.QuotePageBuilder-mainContent.QuotePageBuilder-col > div.QuotePageTabs > div:nth-child(2) > section > div:nth-child(1) > ul > li:nth-child(9) > span.Summary-value"
                ).get_text()
                # revenue = soup.select_one(
                #     '#MainContentContainer > div > div.QuotePageBuilder-row > div.QuotePageBuilder-mainContent.QuotePageBuilder-col > div.QuotePageTabs > div:nth-child(2) > section > div:nth-child(3) > ul > li.Summary-stat.Summary-revenue > span.Summary-value'
                # ).get_text()
                # gross_margin = soup.select_one(
                #     '#MainContentContainer > div > div.QuotePageBuilder-row > div.QuotePageBuilder-mainContent.QuotePageBuilder-col > div.QuotePageTabs > div:nth-child(2) > section > div:nth-child(3) > ul > li:nth-child(7) > span.Summary-value'
                # ).get_text()
                # net_margin = soup.select_one(
                #     '#MainContentContainer > div > div.QuotePageBuilder-row > div.QuotePageBuilder-mainContent.QuotePageBuilder-col > div.QuotePageTabs > div:nth-child(2) > section > div:nth-child(3) > ul > li:nth-child(8) > span.Summary-value'
                # ).get_text()

            except:
                print(symbol, 'html element failed. Retry', attempt)
                continue
            else:
                # succeed
                return [market_cap]
                # return [market_cap, revenue, gross_margin, net_margin]
    else:
        # totaly failed
        print(symbol, 'Skip')
        return None


In [50]:
def thread_scrape(start: int, end: int):
    result = []
    for i in tqdm(range(start, end)):
        name = nasdaq_df.loc[i, 'Name']
        symbol = nasdaq_df.loc[i, 'Symbol']
        industry = nasdaq_df.loc[i, 'Industry']
        
        data = [name, symbol, industry]
        scraped = scrape_cnbc(symbol)
        if data is not None:
            data.extend(scraped)
            result.append(data)
    
    return result

In [54]:
work_list = []
for i in range(0, len(nasdaq_df), 500):
    work_list.append((i, min(i + 500, len(nasdaq_df) + 1)))
print(work_list)

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(thread_scrape, work[0], work[1]) for work in work_list]

    df = []
    for future in as_completed(futures):
        df.extend(future.result())

df = pd.DataFrame(df)
df.to_csv('./data/nasdaq_marketcap_full.csv')
df

[(0, 500), (500, 1000), (1000, 1500), (1500, 2000), (2000, 2500), (2500, 3000), (3000, 3500), (3500, 4000), (4000, 4500), (4500, 4635)]


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

BOWXU html element failed. Retry 0
BOWXU html element failed. Retry 1
BOWXU html element failed. Retry 2
BOWXU html element failed. Retry 3
BOWXU html element failed. Retry 4
BOWXU Skip


  0%|          | 0/500 [00:00<?, ?it/s]

SVOK html element failed. Retry 0
SVOK html element failed. Retry 1
SVOK html element failed. Retry 2


KeyboardInterrupt: 

SVOK html element failed. Retry 3
SVOK html element failed. Retry 4
SVOK Skip


  0%|          | 0/500 [00:00<?, ?it/s]

AHAC html element failed. Retry 0
AHAC html element failed. Retry 1
AHAC html element failed. Retry 2
AHAC html element failed. Retry 3
AHAC html element failed. Retry 4
AHAC Skip


  0%|          | 0/500 [00:00<?, ?it/s]

GWACU html element failed. Retry 0
GWACU html element failed. Retry 1
GWACU html element failed. Retry 2
GWACU html element failed. Retry 3
GWACU html element failed. Retry 4
GWACU Skip


  0%|          | 0/500 [00:00<?, ?it/s]

MCADU html element failed. Retry 0
MCADU html element failed. Retry 1
MCADU html element failed. Retry 2
MCADU html element failed. Retry 3
MCADU html element failed. Retry 4
MCADU Skip


  0%|          | 0/135 [00:00<?, ?it/s]

AHAC html element failed. Retry 0
AHAC html element failed. Retry 1
AHAC html element failed. Retry 2
AHAC html element failed. Retry 3
AHAC html element failed. Retry 4
AHAC Skip


  0%|          | 0/500 [00:00<?, ?it/s]

ALTA html element failed. Retry 0
ALTA html element failed. Retry 1
ALTA html element failed. Retry 2
ALTA html element failed. Retry 3
ALTA html element failed. Retry 4
ALTA Skip
MAXNV html element failed. Retry 0
MAXNV html element failed. Retry 1
MAXNV html element failed. Retry 2
MAXNV html element failed. Retry 3
MAXNV html element failed. Retry 4
MAXNV Skip
MCADU html element failed. Retry 0
MCADU html element failed. Retry 1
MCADU html element failed. Retry 2
MCADU html element failed. Retry 3
MCADU html element failed. Retry 4
MCADU Skip


  0%|          | 0/135 [00:00<?, ?it/s]

LANDP html element failed. Retry 0
LANDP html element failed. Retry 1
LANDP html element failed. Retry 2
LANDP html element failed. Retry 3
LANDP html element failed. Retry 4
LANDP Skip
ALTA html element failed. Retry 0
ALTA html element failed. Retry 1
ALTA html element failed. Retry 2
ALTA html element failed. Retry 3
ALTA html element failed. Retry 4
ALTA Skip
MAXNV html element failed. Retry 0
SRNG html element failed. Retry 0
MAXNV html element failed. Retry 1
SRNG html element failed. Retry 1
SRNG html element failed. Retry 2
MAXNV html element failed. Retry 2
SRNG html element failed. Retry 3
MAXNV html element failed. Retry 3
SRNG html element failed. Retry 4
SRNG Skip
MAXNV html element failed. Retry 4
MAXNV Skip
LANDP html element failed. Retry 0
LANDP html element failed. Retry 1
LANDP html element failed. Retry 2
LANDP html element failed. Retry 3
LANDP html element failed. Retry 4
LANDP Skip
PPD html element failed. Retry 0
PPD html element failed. Retry 1
PPD html element

In [45]:
a = [1,2, 3]
a.extend([5,1, 3])
print(a)

[1, 2, 3, 5, 1, 3]
