# Create sqlite database from CSV files  
#### Use at your own risk

In [None]:
# Use as is, you need to know what you are doing to prevent overwriting important data - you have been warned
import sqlite3
import pandas as pd

# Change db_name to what you need.
# db_name is also read by the prices-loading cell that follows.
db_name = "z_newdb.sqlite"
symbols_table = "symbols"

# Read the CSV before touching the database so a missing/invalid file does not
# leave an open connection behind.
symbols_df_from_csv = pd.read_csv("symbols.csv")
# The exported symbols.csv may carry the DataFrame index as an "Unnamed: 0"
# column; drop it when present and tolerate its absence.
symbols_df_from_csv = symbols_df_from_csv.drop(columns=["Unnamed: 0"], errors="ignore")

connection = sqlite3.connect(db_name)
try:
    # Default if_exists="fail": an existing table raises instead of being overwritten.
    symbols_df_from_csv.to_sql(symbols_table, connection)
finally:
    # Always release the database handle, even when the write fails.
    connection.close()


In [None]:
import sqlite3
import pandas as pd

# Make sure the first run has no prices_daily table, or the rows will be appended to it.
# Repeat for each year you want to load.
csv_file = "prices_2014.csv"
prices_table = "prices_daily"

# Read the CSV first so a bad file name never leaves a connection open.
prices_df_from_csv = pd.read_csv(csv_file)

connection = sqlite3.connect(db_name)  # db_name is initialized in previous cell
try:
    prices_df_from_csv.to_sql(prices_table, connection, if_exists="append")
finally:
    # Always release the database handle, even when the write fails.
    connection.close()

# Database export to CSV files

In [1]:
# Export SQLite3 Symbols to CSV file for github push
import pandas as pd
import sqlite3

con100 = sqlite3.connect("TSX_Prices.sqlite")
try:
    sql = "SELECT * FROM 'symbols' ORDER BY ticker ASC"
    data = pd.read_sql_query(sql, con100)
finally:
    # Release the database handle even if the query fails.
    con100.close()

# Drop the "index" column stored in the table (presumably added by to_sql when
# the table was written).
data = data.drop(columns="index")
# The DataFrame index is written on purpose (it becomes the first, unnamed CSV
# column): the CSV import cell drops that column as "Unnamed: 0". This was
# previously spelled index="False", a non-empty string that pandas treats as
# true. Switching to index=False requires updating the import cell as well.
data.to_csv("symbols.csv", index=True)

In [17]:
# Export SQLite3 Prices to CSV file for github push using yearly export
import pandas as pd
import sqlite3

year = "2013"
conn = sqlite3.connect("TSX_Prices.sqlite")
try:
    # Bind the year prefix as a query parameter instead of formatting it into the SQL text.
    sql = "SELECT * FROM 'prices_daily' WHERE Date LIKE ? ORDER BY ticker ASC, Date DESC"
    data = pd.read_sql_query(sql, conn, params=(f"{year}%",))
finally:
    # Release the database handle even if the query fails.
    conn.close()

data = data.drop(columns="index")
# Keep only the calendar date (no time-of-day) in the exported CSV.
# infer_datetime_format is deprecated in recent pandas and no longer needed.
data["Date"] = pd.to_datetime(data["Date"]).dt.date
data.to_csv(f"prices_{year}.csv", index=False)


# WEB Scraping functions

#### Chromedriver : Open a chrome browser to use for scraping

In [None]:
# Open a Chrome Browser that will be controlled by Selenium
# No page loaded in driver
import time
import warnings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Setup Selenium browser
CHROME_DRIVER_LOCATION = "chromedriver.exe"
OPTIONS = webdriver.ChromeOptions()
# Spelling matters: a misspelled switch is not recognised by Chrome, so the
# certificate-error override would never take effect.
OPTIONS.add_argument('--ignore-certificate-errors')
OPTIONS.add_argument('--incognito')
#OPTIONS.add_argument('--headless')
OPTIONS.add_experimental_option('excludeSwitches', ['enable-logging'])
# NOTE(review): executable_path is the older Selenium way to point at the driver;
# newer Selenium releases expect a Service object instead - confirm the installed version.
driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=OPTIONS)
#driver.implicitly_wait(10)
# Shared explicit wait (10 s timeout) bound to the driver created above.
wait = WebDriverWait(driver, 10)


#### Function to open the TSX listings page, select a letter and exchange to extract data from

In [None]:
def tickers_currenlty_listed(letter, exchange):
    """Scrape the TSX listed-company directory for one search term and exchange.

    The directory is rendered by JavaScript, so the page is driven through the
    module-level Selenium ``driver`` rather than fetched as static HTML.

    Parameters:
    - letter   : text typed in the directory search field (followed by ENTER)
    - exchange : "tsx" or "tsxv"; the exchange toggle is clicked when the page
                 is currently showing the other one

    Returns a list of dicts, one per result row that has a ticker symbol, with
    the keys "ticker", "company", "exchange", "url" and "yahoo" ("yahoo" is
    always "-"). When a page element cannot be located an error is printed and
    an empty dict is returned (historical behaviour, kept for callers).
    """
    tsx_url = "https://www.tsx.com/listings/listing-with-us/listed-company-directory"
    driver.get(tsx_url)

    # Set exchange for page
    try:
        btn_switch = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, 'exchange-toggle')))
        toggle_class = btn_switch.get_attribute("class")
        # The toggle's class is "btn-switch invert" or "btn-switch"; click only
        # when the page is not already on the requested exchange.
        if exchange == "tsx" and toggle_class == "btn-switch invert":
            btn_switch.click()
        elif exchange == "tsxv" and toggle_class == "btn-switch":
            btn_switch.click()
    except Exception as e:
        print("Unable to locate exchange-toggle button, ", e)
        return {}

    # Push letter+ENTER in search field
    try:
        search = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, 'query')))
        search.send_keys(letter + Keys.ENTER)
    except Exception as e:
        print(f"Unable to send_keys in inut field [{letter} ], ", e)
        return {}

    # Wait for the results table body; the wait itself returns the element, so
    # no second lookup (and no removed find_element_by_* helper) is needed.
    try:
        result_data_xpath = '//*[@id="tresults"]/tbody'
        datagrid = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.XPATH, result_data_xpath)))
    except Exception as e:
        print("Unable to locate HTML Table containing listing data, ", e)
        return {}

    data = []
    for row in datagrid.find_elements(By.TAG_NAME, "tr"):
        cells = row.find_elements(By.TAG_NAME, "td")
        # Rows without a name cell and a ticker cell cannot be parsed.
        if len(cells) < 2:
            continue
        ticker = cells[1].text
        # Skip the line with no ticker symbol
        if ticker == "":
            continue
        name = cells[0].text
        url = f"https://money.tmx.com/en/quote/{ticker}"
        data.append({"ticker": ticker, "company": name.strip(), "exchange": exchange, "url": url, "yahoo": "-"})

    return data


#### Function to open the TSX Trade history page for a specified TICKER symbol and extract data to a CSV file

In [None]:
def extract_trading_history(symbol, pages_to_read, msg1):
    """Extract the trading history of a ticker from the TMX web site into a CSV file.

    Parameters:
    - symbol        : ticker to extract
    - pages_to_read : number of historical pages to read (the "next" button is
                      clicked automatically after each page); 45 pages is about
                      2 years of data
    - msg1          : prefix for the progress line, useful when looping through
                      a bunch of tickers

    Each page's first HTML table is appended to CSV/<symbol>.csv; the header is
    written only for the first page read by this call. Nothing is extracted
    when the ad at the bottom of the page cannot be closed, because it covers
    the "next" button. Uses the module-level Selenium ``driver``.
    """
    # Local imports keep this cell self-contained.
    import io
    import os

    tmx_url = f"https://money.tmx.com/en/quote/{symbol}/trade-history?selectedTab=price-history"
    driver.get(tmx_url)
    header_flag = True
    # Fixed pause to let the page load before looking for the ad.
    time.sleep(5)

    # Close the ad at the bottom of the screen to expose the next button
    ad_closed = False
    try:
        close_ad = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, 'ssrt-close-anchor-button')))
        close_ad.click()
        ad_closed = True
    except Exception as e:
        print(f"Unable to clase_ads : {e}")

    if ad_closed:
        # os.path.join avoids the invalid "\{" escape of a hand-written Windows path.
        filename = os.path.join("CSV", f"{symbol}.csv")
        prev_date = "None"
        for i in range(1, pages_to_read + 1):
            try:
                # Wrap the markup in StringIO: read_html expects a file-like
                # object or a path/URL, and this form works on old and new pandas.
                tables = pd.read_html(io.StringIO(driver.page_source))
                prices_df = tables[0]
                print(f"{msg1} => {symbol} [{i}/{pages_to_read}] :Extracted date: {prices_df['Date'].iloc[-1]}, Previous Date Extracted: {prev_date}")
                prices_df.to_csv(filename, mode='a', header=header_flag)
                header_flag = False
                prev_date = prices_df['Date'].iloc[-1]
                # Find the next button and click it
                btn_next = driver.find_element(By.XPATH, "//button[@data-testid='next-button']")
                btn_next.click()
            except Exception as e:
                # Stop at the first failure: reassigning the loop variable does
                # not end a for loop, and carrying on would append the same
                # page again on every remaining iteration.
                print(f"Stopping extraction for {symbol} at page {i}: {e}")
                break

    print("End of extraction")

#### Function to load a CSV file to a pandas DataFrame

In [None]:
import pandas as pd

def load_prices_from_csv(path, file, symbol_to_load):
    """Load a CSV file into a DataFrame holding at most one row per ``Date``.

    Parameters:
    - path           : folder prefix, concatenated as-is with ``file`` (so it
                       must already end with a path separator)
    - file           : name of the CSV file inside ``path``
    - symbol_to_load : ticker the file belongs to (not used by this function)

    Returns the loaded DataFrame, keeping only the first row for each repeated
    ``Date``. On any error while reading or de-duplicating, the error is
    printed and None is returned. No column is renamed, dropped or reordered.
    """
    csv_location = path + file
    try:
        frame = pd.read_csv(csv_location)
        if frame.Date.is_unique:
            # Nothing to clean: hand back the frame exactly as loaded.
            data_cleaned = frame
        else:
            # Keep the first occurrence of every date, discard the repeats.
            repeated = frame.Date.duplicated()
            data_cleaned = frame[~repeated]
    except Exception as e:
        print(f"Error opening file : {e}")
        return None

    return data_cleaned

#### Function to save a DataFrame (loaded from CSV file) to database table : prices_daily  
Date from CSV must be formatted to yyyy-mm-dd  
Data will be appended only for dates greater than existing Dates

In [None]:
import pandas as pd
import sqlite3

def update_prices_daily(ticker, price_data, db_path="TSX_Prices.sqlite"):
    """Append to the ``prices_daily`` table the rows newer than those already stored.

    Parameters:
    - ticker     : symbol written to the "Ticker" column of every appended row
    - price_data : DataFrame as scraped from the trade-history page, with a
                   "Date" column and "Open ($)", "High ($)", "Low ($)",
                   "Close ($)" and "Volume" columns; it is not modified
    - db_path    : SQLite database file (defaults to the project database)

    Only rows whose Date is greater than the latest Date stored for ``ticker``
    are appended; when nothing is stored for the ticker (or the table does not
    exist yet) every row is appended. Dates are compared as stored, so the CSV
    dates must be formatted yyyy-mm-dd. A summary is printed at the end.
    """
    table_columns = ["Date", "Ticker", "Open", "High", "Low", "Close", "Volume"]
    renames = {'Open ($)': 'Open', 'High ($)': 'High', 'Low ($)': 'Low', 'Close ($)': 'Close'}

    conn = sqlite3.connect(db_path)
    try:
        # A fresh database has no prices_daily table yet; treat that as "no existing rows".
        table_exists = conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'prices_daily'"
        ).fetchone() is not None
        if table_exists:
            # The ticker is bound as a parameter rather than formatted into the SQL text.
            existing_prices_df = pd.read_sql(
                "SELECT * FROM 'prices_daily' WHERE Ticker = ? ORDER BY Date DESC LIMIT 1",
                conn,
                params=(ticker,),
            )
        else:
            existing_prices_df = pd.DataFrame()

        if existing_prices_df.empty:
            last_date = None
            new_prices_df = price_data
        else:
            # Only keep dataframe dates that are greater than the existing ones in prices_daily
            last_date = existing_prices_df.iloc[0]["Date"]
            new_prices_df = price_data.loc[price_data["Date"] > last_date]

        # Structure the data according to the database table structure. Every
        # step returns a new frame, so the caller's price_data is left intact;
        # reindex keeps only the table columns, which also discards the extras
        # (VWAP, Change, Trade Value, # Trades, ...).
        new_prices_df = (
            new_prices_df.rename(columns=renames)
            .assign(Ticker=ticker)
            .reindex(columns=table_columns)
            .drop_duplicates(subset="Date")
        )
        new_prices_df.to_sql("prices_daily", conn, if_exists="append")
    finally:
        # Always release the database handle, even when a query or the write fails.
        conn.close()

    print(f"\n\nUpdating 'prices_daily': {ticker} \n{price_data} \nExisting Prices\n{existing_prices_df} \nLast date: {last_date} \nNew Prices:\n{new_prices_df}")


#### Manual Extraction functions

In [None]:
# List symbols with no price data for a specific exchange (tsx or tsxv)
import pandas as pd
import sqlite3

exchange = "tsxv"
conn = sqlite3.connect("TSX_Prices.sqlite")
try:
    # Tickers of the chosen exchange that have no row at all in prices_daily.
    # The exchange is bound as a parameter rather than formatted into the SQL text.
    sql = "SELECT ticker FROM 'symbols' WHERE exchange = ? AND ticker NOT IN (  SELECT DISTINCT Ticker FROM 'prices_daily' )"
    data = pd.read_sql(sql, conn, params=(exchange,))
finally:
    # Release the database handle even if the query fails.
    conn.close()
symbols_list = data["ticker"].to_list()
print(symbols_list)

In [None]:
# Extract trading history and store to csv file (in CSV folder)
# symbols_list is built by the previous cell (tickers with no rows in prices_daily).
# One history page is read per ticker (pages_to_read=1); the ticker itself is
# passed as the progress-message prefix.
for symbol in symbols_list:
    extract_trading_history(symbol, 1, symbol)

In [None]:
import os
from os import listdir
from os.path import isfile, join
# Folder holding one <ticker>.csv file per symbol. The empty last component
# makes os.path.join end the path with a separator, which is required because
# load_prices_from_csv builds the file name as path + file.
path = os.path.join("CSV", "TSXV", "")
files = [x for x in listdir(path) if isfile(join(path, x))]
tickers = [os.path.splitext(x)[0] for x in files]

for file in files:
    # The file name without its extension is the ticker symbol.
    ticker, _ = os.path.splitext(file)
    data = load_prices_from_csv(path, file, ticker)
    if data is None:
        # The loader already printed the error; skip this file instead of
        # crashing the whole run inside update_prices_daily.
        continue
    update_prices_daily(ticker, data)

