In [35]:
import pandas as pd
import math
from datetime import datetime, timezone, timedelta
import pytz
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
import requests

### Define dictionaries for converting month and quarter labels to dates in DD-MM-YYYY format

In [2]:
# Month abbreviation -> 'DD-MM' of the first day of that month ('Jan' -> '01-01').
month_dict = {
    abbreviation: f'01-{number:02d}'
    for number, abbreviation in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
}
# Quarter label -> 'DD-MM' of the first day of that quarter (Q1 -> January, Q2 -> April, ...).
qtr_dict = {f'Q{quarter}': f'01-{3 * quarter - 2:02d}' for quarter in range(1, 5)}

### Get the reference date for this run (previous day, in UTC)

In [3]:
# Reference date for this run: the current UTC time shifted back by one day,
# rendered as DD-MM-YYYY. (A conversion to a US timezone was tried earlier and
# is intentionally not applied; the date is purely UTC-based.)
utc_now = datetime.now(tz=pytz.utc) - timedelta(days=1)
current_date = utc_now.strftime('%d-%m-%Y')

### Initialize a Chrome Driver instance

In [4]:
# Resolve the chromedriver executable once via webdriver_manager; the resulting
# path is reused for every browser session started later in the notebook.
chrome_driver_path = ChromeDriverManager().install()

### Getting the User Agent and updating Chrome Driver Options parameter

In [5]:
# Chrome options shared by every scraping session started later on.
options = Options()
options.add_argument("--incognito")
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Start a throwaway browser only to read its user-agent string, and always shut
# it down afterwards so no Chrome process is left running in the background.
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
try:
    agent = driver.execute_script("return navigator.userAgent")
finally:
    driver.quit()

# Pin the detected user agent on the shared options.
# NOTE(review): the value is read from a headless session, so it is presumably
# the headless browser's own user agent -- confirm this is the intended string.
options.add_argument(f"user-agent={agent}")

### Function for scraping website data

In [6]:
def scrape_data(url, chrome_driver_path, options):
    """Start a new Chrome session and load ``url`` in it.

    Args:
        url: Address of the page to open.
        chrome_driver_path: Path to the chromedriver executable.
        options: Selenium Chrome ``Options`` used to start the browser.

    Returns:
        The live WebDriver with the page loaded. The caller owns the driver
        and is responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while starting the browser or loading the page.
        If the page load fails, the browser is shut down before the exception
        is re-raised so that no Chrome process is leaked.
    """
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives this driver, so it must be closed here.
        driver.quit()
        raise
    return driver

### Function for getting date in 'DD-MM-YYYY' format

In [7]:
def get_date(date_string):
    """Convert a scraped period label into a 'DD-MM-YYYY' string.

    The label must start with a three-letter month abbreviation found in
    ``month_dict`` or, failing that, a two-character quarter code found in
    ``qtr_dict``. The year is taken from the four characters that precede the
    final character of the label (presumably labels look like 'Jan 2024:' --
    verify against the scraped pages).

    Raises:
        KeyError: if the label starts with neither a known month nor a known
            quarter code.
    """
    year = date_string[-5:-1]
    month_key = date_string[:3]
    if month_key in month_dict:
        day_and_month = month_dict[month_key]
    else:
        day_and_month = qtr_dict[date_string[:2]]
    return f'{day_and_month}-{year}'

### Function for getting feature data directly from a URL with text

In [43]:
def get_data_from_url(url, timeout=30):
    """Download a whitespace-delimited text series and return its last week of rows.

    Each non-blank line is expected to start with the observation date and end
    with the observation value; anything in between is ignored.

    Args:
        url: Address of the plain-text data file.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        A ``(values, dates)`` tuple of two equally long lists of strings holding
        the value and date of (at most) the last seven non-blank lines, oldest
        first.

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts,
            or an HTTP error status (so an error page is never parsed as data).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # splitlines() copes with both '\r\n' and '\n' line endings; blank lines
    # (including a trailing one) are dropped before taking the last seven rows.
    rows = [line for line in response.text.splitlines() if line.strip()][-7:]
    values = []
    dates = []
    for row in rows:
        fields = row.split()
        dates.append(fields[0])
        values.append(fields[-1])
    return values, dates

### Load Feature data with URLs and HTML Elements to Dataframe

In [38]:
# One row per feature to collect. get_attributes() reads the columns
# Feature, URL, div_element, sub_element_type, sub_element_value and
# sub_element_max_date from this frame.
feature_df = pd.read_csv('Feature_list.csv')

### Extract website data along with error URLs if any

In [44]:
def _is_blank(cell):
    """Return True for an empty CSV cell, which pandas loads as a float NaN."""
    return not isinstance(cell, str) and math.isnan(cell)


def _element_text(driver, div_class, inner_xpath, wait_seconds):
    """Read text from the first ``div`` whose class attribute equals ``div_class``.

    When ``inner_xpath`` is None the div's own text is returned; otherwise the
    text of the first element matching ``inner_xpath`` inside that div.

    If the page re-renders while it is being read (stale element), the page is
    refreshed once and the lookup is repeated after waiting up to
    ``wait_seconds`` for the div to be present again.
    """
    div_locator = (By.XPATH, f'//div[@class="{div_class}"]')

    def read(div):
        if inner_xpath is None:
            return div.text
        return div.find_element(By.XPATH, inner_xpath).text

    try:
        return read(driver.find_elements(*div_locator)[0])
    except StaleElementReferenceException:
        driver.refresh()
        # until() returns the located elements; the expected-condition object
        # itself is only a callable and must not be indexed.
        divs = WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_all_elements_located(div_locator)
        )
        return read(divs[0])


def get_attributes(df, chrome_driver_path, options):
    """Collect the latest value and date for every feature described in ``df``.

    Rows with an empty ``div_element`` are fetched as plain-text series through
    ``get_data_from_url``. All other rows are scraped with a headless browser:
    the value is read from the div named by ``div_element`` (optionally from a
    ``bdo``/``span`` child named by ``sub_element_value``), with '$' and ','
    stripped. The date comes from the child named by ``sub_element_max_date``
    or, when that is empty, defaults to ``current_date``.

    Args:
        df: Feature table with the columns Feature, URL, div_element,
            sub_element_type, sub_element_value and sub_element_max_date.
        chrome_driver_path: Path to the chromedriver executable.
        options: Selenium Chrome ``Options`` used for each browser session.

    Returns:
        ``(values, dates, error_urls)``: two dicts keyed by feature name and
        the list of URLs whose processing raised an exception. A failing row
        never aborts the run; its error is printed and its URL recorded.
    """
    values = {}
    dates = {}
    error_urls = []
    for _, row in df.iterrows():
        if _is_blank(row['div_element']):
            # No HTML selector configured: the URL serves a plain-text series.
            try:
                feature = row['Feature']
                print(feature)
                values[feature], dates[feature] = get_data_from_url(row['URL'])
            except Exception as e:
                print(e)
                error_urls.append(row['URL'])
        else:
            driver = None
            try:
                feature = row['Feature']
                print(feature)
                sub_ele = row['sub_element_type']
                # Anything other than an explicit 'bdo' is looked up as a span.
                child_tag = 'bdo' if sub_ele == 'bdo' else 'span'
                driver = scrape_data(row['URL'], chrome_driver_path, options)

                if _is_blank(sub_ele):
                    value_xpath = None
                else:
                    value_xpath = f'.//{child_tag}[@class="{row["sub_element_value"]}"]'
                raw_value = _element_text(driver, row['div_element'], value_xpath, 10)
                values[feature] = raw_value.replace('$', '').replace(',', '')

                max_date_class = row['sub_element_max_date']
                if _is_blank(max_date_class):
                    dates[feature] = current_date
                else:
                    date_xpath = f'.//{child_tag}[@class="{max_date_class}"]'
                    date_string = _element_text(driver, row['div_element'], date_xpath, 30)
                    dates[feature] = get_date(date_string)
            except Exception as e:
                print(e)
                error_urls.append(row['URL'])
            finally:
                # One browser is started per row; always close it so Chrome
                # processes do not pile up over the course of the run.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception as quit_error:
                        print(quit_error)
    return values, dates, error_urls

In [45]:
# Collect every feature listed in feature_df. res_values and res_dates are
# keyed by feature name; error_urls lists the URLs that could not be processed.
res_values, res_dates, error_urls = get_attributes(feature_df, chrome_driver_path, options)

CRUDE_PRICE
S&P500
UNRATE
CPI
GDP
FED_GRANTS
GOLD_PRICE


In [55]:
data_df = pd.read_csv('exog_variables.csv', dayfirst=True, index_col=0)


def _as_timestamp(raw):
    """Parse a date string (or a column of them) day-first, allowing mixed formats."""
    return pd.to_datetime(raw, dayfirst=True, format='mixed')


# Open a new row for the run date when the stored history stops before it.
if _as_timestamp(data_df['DATE'].iloc[-1]) < _as_timestamp(current_date):
    data_df.loc[len(data_df), 'DATE'] = current_date

for feature, observed in res_dates.items():
    # A feature carries either a list of dates with a parallel list of values,
    # or a single date with a single value; handle both as (date, value) pairs.
    if isinstance(observed, list):
        pairs = [(when, res_values[feature][position]) for position, when in enumerate(observed)]
    else:
        pairs = [(observed, res_values[feature])]

    for when, reading in pairs:
        stamp = _as_timestamp(when)
        if stamp < _as_timestamp(data_df['DATE'].iloc[-1]):
            # Observation predates the newest row: it applies from its own date onward.
            on_or_after = _as_timestamp(data_df['DATE']) >= stamp
            data_df.loc[on_or_after, feature] = reading
        else:
            # Observation is at or after the newest row: store it on that row only.
            data_df.loc[len(data_df) - 1, feature] = reading

### Create the daily output dataframe, forward-filling missing values for each feature

In [58]:
# Re-index the history onto a gap-free daily calendar; days without a record
# become rows of missing values.
data_df['DATE'] = pd.to_datetime(data_df['DATE'], dayfirst=True, format='mixed')
df_daily = data_df.set_index('DATE').resample('D').asfreq().reset_index()

# Text columns hold numbers stored as strings. A lone '.' is treated as a
# placeholder for "no value": it has to become NaN *before* the float cast,
# otherwise the cast raises on it.
text_columns = df_daily.select_dtypes(['object']).columns
df_daily[text_columns] = df_daily[text_columns].replace('.', float('nan')).astype(float)

# Carry the last known value forward for every column except
# COVID_STRINGENCY_INDEX, whose gaps are deliberately left empty. The column
# selection is derived from df_daily itself so it always lines up with the
# frame being filled.
ffill_columns = df_daily.columns[df_daily.columns != 'COVID_STRINGENCY_INDEX']
df_daily[ffill_columns] = df_daily[ffill_columns].ffill()

### Export the data

In [63]:
# Overwrite the file that data_df was loaded from. The index is written as the
# first column, which the load step reads back with index_col=0.
df_daily.to_csv('exog_variables.csv')

In [64]:
# Summary of this run: the reference date used and the URLs that failed.
json_dict = {
    'Update_date': current_date,
    'Error_URLs': error_urls,
}

In [65]:
# Persist the run summary as indented JSON, replacing any previous summary file.
with open('run_summary.json', 'w') as summary_file:
    summary_file.write(json.dumps(json_dict, indent=4))