### Data Quality Analysis

In [None]:
import sqlite3
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import datetime
import plotly.graph_objects as go

In [None]:
def get_all_symbols():
    """Get all symbols in the database as a DataFrame.

    Reads every row of the ``symbols`` table from ``TSX_Prices.sqlite``
    (resolved relative to the current working directory), ordered by
    ``ticker``.

    Returns:
        A DataFrame indexed by ``ticker`` with the ``index``, ``url`` and
        ``yahoo`` columns removed.

    Raises:
        KeyError: if any of the columns to drop is missing from the table.
    """
    conn = sqlite3.connect("TSX_Prices.sqlite")
    try:
        data = pd.read_sql_query(
            "SELECT * FROM symbols ORDER BY ticker", conn, index_col="ticker"
        )
    finally:
        # Close explicitly: an sqlite3 connection is not released until
        # close() is called, and this function is the only owner of it.
        conn.close()

    return data.drop(columns=["index", "url", "yahoo"])

In [None]:
def yahoo_to_csv(ticker, exchange):
    """Download a ticker's price history from Yahoo and save it as CSV.

    The Yahoo symbol is built from ``ticker`` by replacing dots with dashes
    and appending ``.TO`` when ``exchange`` is ``"tsx"``, or ``.V`` for any
    other exchange value. Data is requested from 2014-01-01 up to today and
    written to ``CSV/<ticker>.csv`` with an added ``Ticker`` column.

    Returns:
        The downloaded DataFrame, or ``None`` if anything in the download or
        CSV-writing step raised (the error is printed, not re-raised).
    """
    start_date = "2014-01-01"
    end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    suffix = ".TO" if exchange == "tsx" else ".V"
    yahoo_symbol = ticker.replace(".", "-") + suffix

    try:
        data = pdr.DataReader(yahoo_symbol, "yahoo", start_date, end_date)
        data["Ticker"] = ticker
        data.index = pd.to_datetime(data.index)
        data.to_csv(
            f"CSV/{ticker}.csv",
            index_label="Date",
            mode="w",
            date_format="%Y-%m-%d %H:%M:%S",
        )
    except Exception as e:
        # Best effort: the caller treats None as "not available on Yahoo".
        print(f"Unable to read Data from Yahoo : {e}")
        return None

    return data

# Example: yahoo_to_csv("SHOP", "tsx")

In [None]:
# Try to download every symbol in the database from Yahoo, flag the ones
# that succeeded, and record the ones that did not in notfound.csv.
notfound = []
df = get_all_symbols()
df["YahooExists"] = False
# df_t selects the rows to process; for a quick trial run use a slice
# instead, e.g. df[40:70].
df_t = df

for symbol, exchange in df_t["exchange"].items():
    if yahoo_to_csv(symbol, exchange) is None:
        notfound.append(symbol)
    else:
        df.at[symbol, "YahooExists"] = True

print(notfound)
notfound_df = pd.DataFrame(notfound)
notfound_df.to_csv("notfound.csv", mode="w", index=False, header=False)
# 3634 elements from yahoo in 1:11 hours


In [None]:
# Preview the first 10 symbols, including the YahooExists flag set above.
df.head(10)

In [None]:
def get_all_prices():
    """Get all daily prices in the database as a DataFrame.

    Reads every row of the ``prices_daily`` table from ``TSX_Prices.sqlite``
    (resolved relative to the current working directory), ordered by
    upper-cased ``Ticker`` and then by ``Date``.

    Returns:
        A DataFrame indexed by ``Date`` with the ``index`` column removed.

    Raises:
        KeyError: if the table has no ``index`` column to drop.
    """
    conn = sqlite3.connect("TSX_Prices.sqlite")
    try:
        prices = pd.read_sql_query(
            "SELECT * FROM prices_daily ORDER BY UPPER(Ticker) ASC, Date ASC",
            conn,
            index_col="Date",
        )
    finally:
        # Close explicitly: an sqlite3 connection is not released until
        # close() is called, and this function is the only owner of it.
        conn.close()

    return prices.drop(columns="index")

In [None]:
# Load every daily price row into `prices`, then display it.
# The commented-out lines below are an inline variant of get_all_prices()
# (apparently an earlier version); note that it sorts by Ticker directly,
# while get_all_prices() sorts by UPPER(Ticker).
# conn = sqlite3.connect("TSX_Prices.sqlite")
# sql = f"SELECT * FROM prices_daily ORDER BY Ticker ASC, Date ASC"
# prices = pd.read_sql_query(sql, conn, index_col="Date")
# prices.drop("index", axis=1, inplace=True)
prices = get_all_prices()
prices

In [None]:
# Add some indicators for Data Quality Analysis

# Flag the first row of every run of identical tickers (rows are expected to
# be sorted by ticker, then date), so the gap analysis never compares a
# ticker's first date against the previous ticker's last date.
prices["new_ticker"] = np.where(prices["Ticker"] != prices["Ticker"].shift(1), "New", "")
prices["cur_date"] = pd.to_datetime(prices.index, format="%Y-%m-%d", errors="coerce")

# Date of the previous row, blanked to NaT on the first row of each ticker.
# Series.where keeps the datetime64 dtype; routing the values through
# np.where(..., None) would produce an object array that then has to be
# re-parsed. The mask is passed as a plain array so no index alignment is
# attempted on the (non-unique) Date index.
prices["prev_date"] = prices["cur_date"].shift(1).where(
    (prices["new_ticker"] != "New").to_numpy()
)

# Calculate data gaps in prices using successive dates for tickers in database.
# Rows whose prev_date is NaT get a NaT gap, which compares as not missing.
prices["GAP"] = prices["cur_date"] - prices["prev_date"]
prices["missing"] = prices["GAP"] > datetime.timedelta(days=5)

#filter = prices["new_ticker"] == "New"
#prices.loc[filter]
# df["Trend"] = np.where(df["Close"] > df["SMA200"], "Up", "Down")

In [None]:
# Show the SHOP rows that follow a gap of more than five days.
# (The name `filter` shadows the builtin; it is kept so the notebook's
# global names stay unchanged.)
filter = prices["missing"] & prices["Ticker"].eq("SHOP")
prices.loc[filter]