In [None]:
import calendar
import datetime
import json
import os
import re
import ssl
import time
from urllib.request import Request, urlopen

import tqdm
from bs4 import BeautifulSoup

In [None]:
# SSL context handed to urlopen() by the scraping helpers.
# NOTE(review): hostname checking and certificate verification are both
# switched off here, so HTTPS responses are not authenticated. Kept as-is to
# avoid changing scraping behaviour; re-enable verification if possible.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Month name -> month number (1-12), e.g. {"January": 1, ..., "December": 12}.
# Values are plain ints because datetime.date() only accepts integer months.
# Built without any helper so this cell runs on a fresh kernel, and the empty
# placeholder at index 0 of calendar.month_name is skipped.
month_dict = {name: number for number, name in enumerate(calendar.month_name) if name}

In [None]:
# Helper functions

def get_soup(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    The request is sent with a ``Mozilla/5.0`` User-Agent header and uses the
    module-level SSL context ``ctx``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success status codes).
    """
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the HTTP response (and its socket) even when
    # read() raises, instead of leaving it open until garbage collection.
    with urlopen(request, context=ctx) as response:
        html = response.read()
    return BeautifulSoup(html, "html.parser")

def get_companies(url):
    """Collect one name per table row from the first ``<tbody>`` at ``url``.

    For every ``<tr>`` in the table body, the text of the row's *second*
    ``<a>`` element is taken. The results are later interpolated into symbol
    URLs, so this is presumably the ticker column -- verify against the page.

    Args:
        url: Page containing the table to read.

    Returns:
        List of link texts, in table order. Rows with fewer than two links
        (for example header or advertisement rows) are skipped.

    Raises:
        ValueError: If the page contains no ``<tbody>`` element.
    """
    table_body = get_soup(url).tbody
    if table_body is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError(f"no <tbody> found at {url}; the page layout may have changed")

    companies = []
    for row in table_body("tr"):
        links = row.find_all("a")
        # Index 1 is the second link in the row; skip rows that lack one
        # instead of raising IndexError and discarding the whole list.
        if len(links) > 1:
            companies.append(links[1].text)
    return companies

def prepend_zero(i):
    """Return ``i`` as a string, left-padded with zeros to at least two characters.

    The result is always a ``str`` (``3 -> "03"``, ``12 -> "12"``); the
    previous version returned the unmodified int for values of 10 and above.

    Args:
        i: Value to format, typically a small non-negative integer.

    Returns:
        The zero-padded string form of ``i``.
    """
    return str(i).zfill(2)

def get_dt(date):
    """Parse a transcript header into the call date and a market-hours flag.

    Example input::

        "Q2 2019 Earnings Conference Call July 24, 2019  6:30 PM ET"

    Args:
        date: Header text containing ``<Month> <day>, <year> <h:mm> <AM|PM>``.
            When the text contains " Call ", only the part after its first
            occurrence is searched.

    Returns:
        Tuple ``(call_date, flag)`` where ``call_date`` is a ``datetime.date``
        and ``flag`` is ``1`` if the call starts at or after the 16:00 close
        and ``0`` otherwise. The time is compared as written in the header;
        no time-zone conversion is done.

    Raises:
        ValueError: If no date/time can be found or parsed in ``date``.
    """
    # Search only the text after " Call " when that marker is present.
    tail = date.split(" Call ", 1)[-1]
    match = re.search(
        r"([A-Za-z]+)\.?\s+(\d{1,2}),?\s+(\d{4}),?\s+(\d{1,2}:\d{2})\s*([AaPp])\.?[Mm]",
        tail,
    )
    if match is None:
        raise ValueError(f"could not find a call date/time in {date!r}")

    month, day, year, clock, meridiem = match.groups()
    stamp_text = f"{month} {day} {year} {clock} {meridiem.upper()}M"

    call_start = None
    # Accept full ("July") as well as abbreviated ("Jul") month names.
    for month_format in ("%B", "%b"):
        try:
            call_start = datetime.datetime.strptime(
                stamp_text, f"{month_format} %d %Y %I:%M %p"
            )
            break
        except ValueError:
            continue
    if call_start is None:
        raise ValueError(f"could not parse call date/time {stamp_text!r}")

    mkt_close = datetime.time(16, 0, 0)

    # NOTE(review): calls that start during the trading session are grouped
    # with pre-market calls (flag 0). The earlier code returned None for them,
    # which broke tuple unpacking in the caller -- confirm this grouping is the
    # intended classification.
    flag = 1 if call_start.time() >= mkt_close else 0
    return call_start.date(), flag

In [None]:
# Helper functions (continued)

def _polite_get(url, delay):
    """Fetch ``url`` with get_soup() and always pause ``delay`` seconds afterwards."""
    try:
        return get_soup(url)
    finally:
        time.sleep(delay)


def _dollar_amounts(text):
    """Return every ``$`` figure found in ``text`` as a float, in order of appearance."""
    amounts = []
    for raw in re.findall(r"-?\$\s*-?\d[\d,]*(?:\.\d+)?", text):
        magnitude = float(re.sub(r"[^\d.]", "", raw))
        amounts.append(-magnitude if "-" in raw else magnitude)
    return amounts


def _extract_eps(soup):
    """Return ``(reported, estimated)`` EPS floats from a transcript page (either may be None)."""
    data_lines = soup.find_all("div", class_="data-line")
    if not data_lines:
        return None, None
    eps = data_lines[0]

    reported_values = _dollar_amounts(eps.text)
    if not reported_values:
        return None, None
    reported = reported_values[0]

    surprise_text = eps.span.text if eps.span is not None else ""
    surprise_values = _dollar_amounts(surprise_text)
    if not surprise_values:
        return reported, None
    surprise = abs(surprise_values[0])

    # NOTE(review): assumes the <span> reads like "beats by $0.05" or
    # "misses by $0.05" -- confirm against the live page. A beat means the
    # estimate was below the reported figure, a miss means it was above; any
    # other wording leaves the estimate undetermined.
    wording = surprise_text.lower()
    if "miss" in wording:
        estimated = reported + surprise
    elif "beat" in wording:
        estimated = reported - surprise
    else:
        return reported, None
    # Round away binary floating-point noise such as 1.1800000000000002.
    return reported, round(estimated, 4)


def _parse_transcript_page(soup):
    """Build the per-company record from a single-page transcript."""
    reported, estimated = _extract_eps(soup)

    # Concatenate all transcript paragraphs, then keep what follows the first
    # "Operator" marker when present (otherwise keep the full text).
    paragraphs = soup.find_all('p', class_=re.compile("p p.+"))
    text = "".join(f"{p.text} " for p in paragraphs)
    _, marker, after_marker = text.partition("Operator")
    if marker:
        text = after_marker

    header = soup.find('p', class_="p p1")
    if header is None:
        raise ValueError("transcript page has no call date header")
    call_date, post_time = get_dt(header.text)

    return {"text": text, "date": call_date,
            "estimated": estimated, "reported": reported, "hours": post_time}


def _latest_transcript_info(company, delay):
    """Return the record for the first listed transcript of ``company``, or None if none is listed."""
    listing_url = f"http://www.seekingalpha.com/symbol/{company}/earnings/transcripts"
    listing = _polite_get(listing_url, delay)

    for div in listing.find_all("div", class_="symbol_article"):
        link = div.a
        if link is None or "Transcript" not in link.text:
            continue
        href = link.get("href")
        if not href:
            continue
        transcript_url = "http://www.seekingalpha.com" + href + "?part=single"
        # Only the first matching article is used.
        return _parse_transcript_page(_polite_get(transcript_url, delay))
    return None


def get_transcripts(companies, delay=5):
    """Scrape the first listed earnings-call transcript for each company.

    Args:
        companies: Iterable of symbols used to build the SeekingAlpha URLs.
        delay: Seconds to sleep after every HTTP request (default 5, the
            pause that was previously hard-coded).

    Returns:
        Dict mapping company -> record with keys ``"text"`` (transcript text),
        ``"date"`` (``datetime.date`` of the call), ``"reported"`` and
        ``"estimated"`` (EPS as floats, or ``None`` when they could not be
        read) and ``"hours"`` (flag returned by ``get_dt``). Companies with no
        transcript link, or whose pages could not be fetched or parsed, are
        left out.
    """
    script_dict = {}
    for company in tqdm.tqdm_notebook(companies):
        try:
            company_info = _latest_transcript_info(company, delay)
        except (OSError, ValueError) as err:
            # Network failures (URLError/HTTPError are OSError subclasses) and
            # pages that do not parse are reported and skipped, so one bad
            # company does not throw away everything scraped so far.
            print(f"Skipping {company}: {err}")
            continue
        if company_info is not None:
            script_dict[company] = company_info

    return script_dict

In [None]:
# Names scraped from the Slickcharts S&P 500 table (used as symbols below).
companies = get_companies(url="http://www.slickcharts.com/sp500")

In [None]:
# Scrape transcript text, call date and earnings figures from SeekingAlpha,
# one record per company.

script_dict = get_transcripts(companies=companies)

In [None]:
# Append stock info from the trading days before/after each earnings call.
#
# A single API token fails out when it is used for the entire company list,
# so several tokens can be supplied; each one is used for a consecutive slice
# of COMPANIES_PER_TOKEN companies (200/200/100 for ~500 companies).
#
# Tokens are read from the environment instead of being written into the
# notebook, e.g.  export WTD_API_TOKENS="tokenA,tokenB,tokenC"
# NOTE(review): the tokens that used to be hard-coded in this cell should be
# treated as leaked and rotated.

API_BASE = "https://api.worldtradingdata.com/api/v1/history?"
TOKEN_ENV_VAR = "WTD_API_TOKENS"
COMPANIES_PER_TOKEN = 200
MAX_DAY_OFFSET = 3  # how many calendar days either side of the call to query/search
DATE_FMT = "%Y-%m-%d"
ONE_DAY = datetime.timedelta(days=1)


def find_trading_day(history, start, direction, first_offset):
    """Return the key of the closest day present in ``history``, or None.

    Days ``start + direction * k`` are tried for ``k`` from ``first_offset``
    up to ``MAX_DAY_OFFSET``. ``direction`` is +1 (later days) or -1 (earlier
    days); ``first_offset`` is 0 to allow ``start`` itself, 1 to exclude it.
    """
    for offset in range(first_offset, MAX_DAY_OFFSET + 1):
        key = (start + direction * offset * ONE_DAY).strftime(DATE_FMT)
        if key in history:
            return key
    return None


api_tokens = [tok.strip() for tok in os.environ.get(TOKEN_ENV_VAR, "").split(",") if tok.strip()]
if not api_tokens:
    raise RuntimeError(
        f"Set the {TOKEN_ENV_VAR} environment variable to one or more "
        "comma-separated World Trading Data API tokens."
    )

for position, company in enumerate(companies):
    company_info = script_dict.get(company)
    if company_info is None:
        # No transcript was collected for this company.
        continue

    # Move on to the next token every COMPANIES_PER_TOKEN companies; the last
    # token is reused if there are fewer tokens than slices.
    token = api_tokens[min(position // COMPANIES_PER_TOKEN, len(api_tokens) - 1)]
    symbol = company

    script_date = company_info["date"]
    date_from = (script_date - MAX_DAY_OFFSET * ONE_DAY).strftime(DATE_FMT)
    date_to = (script_date + MAX_DAY_OFFSET * ONE_DAY).strftime(DATE_FMT)

    url = f"{API_BASE}symbol={symbol}&date_from={date_from}&date_to={date_to}&api_token={token}"
    with urlopen(url) as response:
        # Decode the JSON body; the raw string cannot be indexed by key.
        payload = json.loads(response.read().decode('utf-8'))

    history = payload.get('history') if isinstance(payload, dict) else None
    if not history:
        print(f"{company}: no price history returned, skipping")
        continue

    if company_info["hours"] == 0:
        # Flag 0: the call date itself is the "next" (reaction) day, and the
        # closest earlier day with data is the "previous" day.
        prev_day = find_trading_day(history, script_date, -1, 1)
        next_day = find_trading_day(history, script_date, +1, 0)
    else:
        # Script was reported after market hours: the call date is the
        # "previous" day and the closest later day with data is the "next" day.
        prev_day = find_trading_day(history, script_date, -1, 0)
        next_day = find_trading_day(history, script_date, +1, 1)

    if prev_day is None or next_day is None:
        print(f"{company}: no price data within {MAX_DAY_OFFSET} days of {script_date}, skipping")
        continue

    company_info['pre_script_high'] = history[prev_day]['high']
    company_info['pre_script_low'] = history[prev_day]['low']

    company_info['post_script_high'] = history[next_day]['high']
    company_info['post_script_low'] = history[next_day]['low']

In [None]:
# Display the final per-company records. Each record embeds the full
# transcript under "text", so this output can be very large.
script_dict