### Data Quality Analysis  
##### All features are wrapped in Python functions (use Run All to define every function, then launch each one individually)

In [7]:
import os
import sqlite3
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import datetime
import plotly.graph_objects as go
import talib as ta 

#### RUN this feature only once if you do NOT have historical data  
All price data will be stored as CSV files (one file per ticker) in the CSV folder

In [8]:
# Function to get historical data from Yahoo finance and store it as CSV/<ticker>.csv
# Returns None when the symbol cannot be read, so the caller can collect it for further investigation
def yahoo_to_csv(ticker, exchange, start_date="2014-01-01", end_date=None):
    """Download price history for one ticker from Yahoo Finance and save it as a CSV file.

    Args:
        ticker: Symbol as used locally; any "." is replaced by "-" for Yahoo.
        exchange: "tsx" selects the ".TO" Yahoo suffix, any other value selects ".V".
        start_date: First date to request, formatted "YYYY-MM-DD" (default "2014-01-01").
        end_date: Last date to request, formatted "YYYY-MM-DD" (default: today).

    Returns:
        The downloaded DataFrame with an extra "Ticker" column, or None when
        the download or the CSV export raised an exception.
    """
    if end_date is None:
        end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    # Yahoo symbols use "-" instead of "." and carry an exchange suffix
    suffix = ".TO" if exchange == "tsx" else ".V"
    yahoo_symbol = ticker.replace(".", "-") + suffix

    try:
        data = pdr.DataReader(yahoo_symbol, "yahoo", start_date, end_date)
        data["Ticker"] = ticker
        data.index = pd.to_datetime(data.index)
        # Create the target folder if needed so a missing directory is not reported as a Yahoo failure
        os.makedirs("CSV", exist_ok=True)
        data.to_csv(f"CSV/{ticker}.csv", index_label="Date", mode="w", date_format="%Y-%m-%d %H:%M:%S")
        return data
    except Exception as e:
        print(f"Unable to read Data from Yahoo : {e}")
        return None

# yahoo_to_csv("SHOP", "tsx")

In [None]:
# Export SQLite3 Prices to CSV file for github push using yearly export
def yearly_prices_to_csv(year):
    """Export one year of daily prices from TSX_Quality.sqlite to ``prices_<year>.csv``.

    Rows are selected where the Date text starts with ``year`` and are ordered
    by ticker ascending, then date descending. The Date column is written as a
    plain calendar date.

    Args:
        year: Year prefix to match against the Date column (e.g. "2022" or 2022).
    """
    conn = sqlite3.connect("TSX_Quality.sqlite")
    try:
        # Bind the year as a query parameter instead of formatting it into the SQL text
        sql = "SELECT * FROM prices_daily WHERE Date LIKE ? ORDER BY ticker ASC, Date DESC"
        data = pd.read_sql_query(sql, conn, params=(f"{year}%",))
    finally:
        # sqlite3 connections are not closed automatically
        conn.close()

    # Drop the "index" column when present; tolerate tables that do not have one
    data = data.drop(columns="index", errors="ignore")
    # Keep only the calendar date (no time component) in the export
    data["Date"] = pd.to_datetime(data["Date"]).dt.date
    data.to_csv(f"prices_{year}.csv", index=False)

In [9]:
def csv_to_dataframe(file):
    """Load ``CSV/<file>`` into a DataFrame indexed by its parsed "Date" column.

    Only the Ticker, Open, High, Low, Close and Volume columns are returned,
    in that order.
    """
    kept_columns = ["Ticker","Open","High","Low","Close","Volume"]
    frame = pd.read_csv(f"CSV/{file}", index_col="Date")
    # The index is read as text; convert it to timestamps
    frame.index = pd.to_datetime(frame.index)
    return frame[kept_columns]

def insert_new_prices_in_DB(new_prices, db_prices, conn):
    """Append to the "Prices_Daily" table the rows of ``new_prices`` not yet stored.

    The ticker is taken from the first row of ``new_prices``. A row counts as
    already stored when its index value appears in the index of ``db_prices``
    for that ticker.

    Args:
        new_prices: DataFrame with a "Ticker" column. It is not modified.
        db_prices: DataFrame of stored prices with a "Ticker" column and the
            same kind of index as ``new_prices``.
        conn: Open database connection handed to ``DataFrame.to_sql``.

    Returns:
        A new DataFrame holding only the rows that were inserted.
    """
    # Nothing to insert and no ticker to read from an empty frame
    if new_prices.empty:
        return new_prices.copy()

    ticker = new_prices["Ticker"].values[0]
    stored_dates = db_prices.loc[db_prices["Ticker"] == ticker].index
    already_stored = new_prices.index.isin(stored_dates)

    # Select into a new frame instead of dropping in place on the caller's object
    rows_to_insert = new_prices.loc[~already_stored]
    rows_to_insert.to_sql("Prices_Daily", conn, if_exists='append', index=True)
    return rows_to_insert


def loop_through_local_csv(existing_prices, conn):
    """Insert the prices of every CSV file in the "CSV" folder into the database.

    Only regular files whose name ends in ".csv" are processed, in sorted
    order; other directory entries (hidden files, sub-folders) are skipped.

    Args:
        existing_prices: DataFrame of prices already stored, passed on to
            ``insert_new_prices_in_DB``.
        conn: Open database connection, passed on to ``insert_new_prices_in_DB``.
    """
    csv_files = sorted(
        name
        for name in os.listdir("CSV")
        if name.lower().endswith(".csv") and os.path.isfile(os.path.join("CSV", name))
    )
    for file in csv_files:
        new_data = csv_to_dataframe(file)
        insert_new_prices_in_DB(new_data, existing_prices, conn)
        print(f"Finished processing : {file}")

# conn1 = sqlite3.connect("TSX_Quality.sqlite")
# sql = f"SELECT * FROM prices_daily ORDER BY Ticker ASC, Date ASC"
# db_prices = pd.read_sql_query(sql, conn1, index_col="Date")
# loop_through_local_csv(db_prices, conn1)

In [10]:
# Function to return a Dataframe with all symbols available in database
def get_all_symbols():
    """Get all symbols in database as a DataFrame.

    Reads the whole ``symbols`` table ordered by ticker, uses ``ticker`` as
    the index and removes the "index", "url" and "yahoo" columns.

    Returns:
        DataFrame indexed by ticker with the remaining columns.
    """
    # NOTE(review): this reads TSX_Prices.sqlite while the price functions read
    # TSX_Quality.sqlite -- confirm the different file name is intended.
    conn = sqlite3.connect("TSX_Prices.sqlite")
    try:
        sql = "SELECT * FROM symbols ORDER BY ticker"
        data = pd.read_sql_query(sql, conn, index_col="ticker")
    finally:
        # sqlite3 connections are not closed automatically
        conn.close()

    return data.drop(columns=["index", "url", "yahoo"])

In [20]:
# Heavy function to retrieve all prices from database (more than 3 million rows, takes many seconds to execute)
def get_all_prices():
    """Load every row of ``prices_daily`` from TSX_Quality.sqlite.

    Rows are ordered by upper-cased ticker, then by date ascending. The
    "Date" column becomes the index and is converted to timestamps.

    Returns:
        DataFrame of all stored prices indexed by date.
    """
    conn = sqlite3.connect("TSX_Quality.sqlite")
    try:
        sql = "SELECT * FROM prices_daily ORDER BY UPPER(Ticker) ASC, Date ASC"
        prices = pd.read_sql_query(sql, conn, index_col="Date")
    finally:
        # sqlite3 connections are not closed automatically
        conn.close()

    prices.index = pd.to_datetime(prices.index)
    return prices

In [12]:
# Cell to loop through all symbols and extract price data
def loop_through_symbols_on_yahoo():
    """Download Yahoo price history for every symbol returned by ``get_all_symbols``.

    Symbols for which ``yahoo_to_csv`` returns None are printed and written,
    one per line, to "notfoundonyahoo.csv".
    """
    symbols = get_all_symbols()
    symbols["YahooExists"] = False
    missing_symbols = []

    for symbol, row in symbols.iterrows():
        downloaded = yahoo_to_csv(symbol, row["exchange"])
        if downloaded is None:
            missing_symbols.append(symbol)
            continue
        symbols.at[symbol, "YahooExists"] = True

    print(missing_symbols)
    # Persist the unavailable symbols as a headerless, index-free list
    pd.DataFrame(missing_symbols).to_csv("notfoundonyahoo.csv", mode="w", index=False, header=False )
# 3634 elements from yahoo in 1:11 hours

#loop_through_symbols_on_yahoo()

### DATA QUALITY INDICATORS

In [13]:
# Identify the first price data for every ticker (to help in finding previous date for GAP analysis)
# Prices must be sorted by ascending  ticker symbol and ascending dates
# For every ticker+date combination, insert the date of the previous price data for the same ticker (to calculate the number of days between data and detect missing prices)
def detect_missing_prices(prices, max_gap_days=5):
    """Flag rows whose date is too far from the previous date of the same ticker.

    ``prices`` is modified in place; nothing is returned. The rows must already
    be sorted by ticker, then by ascending date, because each row is compared
    with the row directly above it.

    Columns added (or overwritten):
        new_ticker: "New" on the first row of each ticker, "" otherwise.
        cur_date:   the row's index value as a timestamp.
        prev_date:  cur_date of the previous row, NaT on a ticker's first row.
        GAP:        cur_date - prev_date.
        missing:    True when GAP is larger than ``max_gap_days`` days.

    Args:
        prices: DataFrame with a "Ticker" column and a date index.
        max_gap_days: Largest gap, in days, that is not reported (default 5).
    """
    # A row starts a new ticker when its ticker differs from the row above
    is_first_row = (prices["Ticker"] != prices["Ticker"].shift(1)).to_numpy()
    prices["new_ticker"] = np.where(is_first_row, "New", "")
    prices["cur_date"]  = pd.to_datetime(prices.index, format="%Y-%m-%d", errors='coerce')

    # Series.where keeps the datetime dtype and fills NaT; mixing datetimes with
    # None in np.where would yield an object array that has to be re-parsed.
    # A plain boolean array is used because the date index contains duplicates.
    prices["prev_date"] = prices["cur_date"].shift(1).where(~is_first_row)

    # Calculate date gaps in prices using successive dates for tickers in database
    prices["GAP"] = prices["cur_date"] - prices["prev_date"]
    prices["missing"] = prices["GAP"] > datetime.timedelta(days=max_gap_days)

#detect_missing_prices(prices)

In [23]:
# Show date GAPS for a specific symbol
def show_missing_prices(prices, ticker=None):
    """Return the rows of ``prices`` whose "missing" flag is True.

    When ``ticker`` is given, only rows of that ticker are returned.
    """
    flagged = prices["missing"].eq(True)
    if ticker is not None:
        flagged = flagged & prices["Ticker"].eq(ticker)
    return prices.loc[flagged]

# Load every stored price (heavy query), flag date gaps, then display the flagged rows
prices = get_all_prices()
# Adds the gap columns (including "missing") to `prices` in place
detect_missing_prices(prices)
# No ticker given: keep the flagged rows of all tickers
missing_df = show_missing_prices(prices)
missing_df

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume,new_ticker,cur_date,prev_date,GAP,missing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-10-25,ADCO.WT,-,-,-,0.155,0.0,,2021-10-25,2021-10-14,11 days,True
2017-12-28,AEC,0.379999995231628,0.449999988079071,0.370000004768372,0.370000004768372,57900.0,,2017-12-28,2017-12-22,6 days,True
2021-12-29,AFN.DB.E,100.0,100.02,99.76,99.76,75000.0,,2021-12-29,2021-12-10,19 days,True
2021-12-01,AKMY.WT,0.1,0.1,0.1,0.1,53000.0,,2021-12-01,2021-11-09,22 days,True
2021-12-01,ALLI.WT,0.48,0.58,0.415,0.42,1009851.0,,2021-12-01,2021-11-09,22 days,True
...,...,...,...,...,...,...,...,...,...,...,...
2021-05-18,YCM.PR.B,5.2,5.2,5.2,5.2,600.0,,2021-05-18,2021-04-20,28 days,True
2021-10-27,YCM.PR.B,-,-,-,5.46,0.0,,2021-10-27,2021-09-29,28 days,True
2018-01-09,ZJK,19.9748249053955,19.9748249053955,19.9395771026611,19.9395771026611,298.0,,2018-01-09,2017-12-22,18 days,True
2016-04-11,ZLH,21.7,21.7,21.4,21.41,33995.0,,2016-04-11,2016-03-14,28 days,True


In [15]:
#filter = prices["new_ticker"] == "New"
#prices.loc[filter]
# df["Trend"] = np.where(df["Close"] > df["SMA200"], "Up", "Down")

In [16]:
# Run all data quality indicators

# LOAD PRICES DATA AND RUN FEATURES FOR DATA QUALITY ANALYSIS
# Heavy extraction will take many seconds (more than 3 million rows)
# prices = get_all_prices()
# detect_missing_prices(prices)

# show_missing_prices(prices)