### Data Quality Analysis  
##### All features are wrapped in Python functions (use Run All to define every function, then launch them individually)

In [1]:
import sqlite3
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import datetime
import plotly.graph_objects as go

#### RUN this feature only once if you do NOT have historical data  
All price data will be stored as CSV files, one per ticker, in the CSV folder

In [4]:
# Function to get historical data from Yahoo Finance and store it as a CSV file (CSV/<ticker>.csv)
# Returns None when the download or the CSV write fails, so the caller can collect
# the symbols that need further investigation
def yahoo_to_csv(ticker, exchange, start_date="2014-01-01", end_date=None):
    """Download daily prices for one ticker from Yahoo Finance and save them under ``CSV/``.

    Parameters
    ----------
    ticker : str
        Symbol to download. Dots are replaced by dashes to build the Yahoo symbol.
    exchange : str
        "tsx" selects the ".TO" Yahoo suffix; any other value selects ".V".
    start_date : str, optional
        First date to extract, formatted "YYYY-MM-DD". Defaults to "2014-01-01".
    end_date : str, optional
        Last date to extract, formatted "YYYY-MM-DD". Defaults to today's date.

    Returns
    -------
    pandas.DataFrame or None
        The downloaded prices with an added ``Ticker`` column, or ``None`` when any
        step raised (the error is printed, not propagated).
    """
    import os  # local import: only needed to make sure the output folder exists

    if end_date is None:
        end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    suffix = ".TO" if exchange == "tsx" else ".V"
    yahoo_symbol = ticker.replace(".", "-") + suffix

    try:
        data = pdr.DataReader(yahoo_symbol, "yahoo", start_date, end_date)
        data["Ticker"] = ticker
        data.index = pd.to_datetime(data.index)
        # Without this, a missing CSV folder would make every ticker look unavailable.
        os.makedirs("CSV", exist_ok=True)
        data.to_csv(f"CSV/{ticker}.csv", index_label="Date", mode="w", date_format="%Y-%m-%d %H:%M:%S")
        return data
    except Exception as e:
        # Deliberately best-effort: one failing symbol must not stop a long batch run.
        print(f"Unable to read Data from Yahoo : {e}")
        return None

# yahoo_to_csv("SHOP", "tsx")

In [13]:
# Open a CSV file using ticker symbol to insert missing values in prices_daily table
def add_prices_from_csv(ticker):
    """Placeholder: meant to open the ticker's CSV file and insert missing values
    into the prices_daily table.

    Not implemented yet; the function ignores ``ticker``, does nothing and
    returns ``None``.
    """
    return None

# Currently has no effect: add_prices_from_csv is still an empty stub.
add_prices_from_csv("A")

In [2]:
# Function to return a Dataframe with all symbols available in database
def get_all_symbols(db_path="TSX_Prices.sqlite"):
    """Get all symbols in database as a DataFrame.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite database file. Defaults to "TSX_Prices.sqlite".

    Returns
    -------
    pandas.DataFrame
        Every row of the ``symbols`` table, sorted and indexed by ``ticker``,
        without the ``index``, ``url`` and ``yahoo`` columns.

    Raises
    ------
    KeyError
        If one of the dropped columns is absent from the table.
    """
    conn = sqlite3.connect(db_path)
    try:
        data = pd.read_sql_query("SELECT * FROM symbols ORDER BY ticker", conn, index_col="ticker")
    finally:
        # Close explicitly: a sqlite3 connection used as a context manager only
        # commits or rolls back, it does not release the file handle.
        conn.close()

    return data.drop(columns=["index", "url", "yahoo"])

In [3]:
# Heavy function to retrieve all prices from database (more than 3 million rows, takes many seconds to execute)
def get_all_prices(db_path="TSX_Prices.sqlite"):
    """Load the whole ``prices_daily`` table as a DataFrame indexed by ``Date``.

    Rows are ordered by upper-cased ticker, then by date, both ascending.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite database file. Defaults to "TSX_Prices.sqlite".

    Returns
    -------
    pandas.DataFrame
        All price rows without the ``index`` column.

    Raises
    ------
    KeyError
        If the table has no ``index`` column to drop.
    """
    conn = sqlite3.connect(db_path)
    try:
        prices = pd.read_sql_query(
            "SELECT * FROM prices_daily ORDER BY UPPER(Ticker) ASC, Date ASC",
            conn,
            index_col="Date",
        )
    finally:
        # Close explicitly: a sqlite3 connection used as a context manager only
        # commits or rolls back, it does not release the file handle.
        conn.close()

    return prices.drop(columns="index")

In [5]:
# Cell to loop through all symbols and extract price data
def loop_through_symbols_on_yahoo():
    """Download prices for every symbol in the database and log the ones that failed.

    Calls ``yahoo_to_csv`` once per row returned by ``get_all_symbols``. Tickers for
    which it returns ``None`` are printed and written, one per line and without
    header, to ``notfoundonyahoo.csv`` (overwriting any previous file).

    Returns
    -------
    None
    """
    symbols = get_all_symbols()
    not_found_on_yahoo = []

    # The frame is indexed by ticker, so index and exchange column pair up row by row.
    for ticker, exchange in zip(symbols.index, symbols["exchange"]):
        if yahoo_to_csv(ticker, exchange) is None:
            not_found_on_yahoo.append(ticker)

    print(not_found_on_yahoo)
    notfound_df = pd.DataFrame(not_found_on_yahoo)
    notfound_df.to_csv("notfoundonyahoo.csv", mode="w", index=False, header=False)
# 3634 elements from yahoo in 1:11 hours

#loop_through_symbols_on_yahoo()

### DATA QUALITY INDICATORS

In [7]:
# Identify the first price data for every ticker (to help in finding previous date for GAP analysis)
# Prices must be sorted by ascending  ticker symbol and ascending dates
# For every ticker+date combination, insert the date of the previous price data for the same ticker (to calculate the number of days between data points and detect missing prices)
def detect_missing_prices(prices, max_gap_days=5):
    """Flag rows whose date is more than ``max_gap_days`` after the previous row of the same ticker.

    The frame is modified in place; nothing is returned. Rows must already be sorted
    by ticker and then by ascending date, because each row is compared with the row
    directly above it.

    Parameters
    ----------
    prices : pandas.DataFrame
        Price rows with a ``Ticker`` column and the dates as index.
    max_gap_days : int, optional
        Largest gap, in calendar days, that is not reported as missing. Defaults to 5.

    Columns added
    -------------
    new_ticker : "New" on the first row of each run of identical tickers, "" elsewhere.
    cur_date   : the index parsed as datetime (NaT when it cannot be parsed).
    prev_date  : ``cur_date`` of the row above, NaT on "New" rows.
    GAP        : ``cur_date - prev_date``.
    missing    : True where ``GAP`` exceeds ``max_gap_days``.
    """
    # A row opens a new ticker when its symbol differs from the row directly above it.
    is_new = (prices["Ticker"] != prices["Ticker"].shift(1)).to_numpy()
    prices["new_ticker"] = np.where(is_new, "New", "")

    # No explicit format: the index labels carry a time part ("2020-04-28 00:00:00"),
    # and newer pandas releases apply a given format strictly, which would coerce
    # every label to NaT and silently hide all gaps.
    prices["cur_date"] = pd.to_datetime(prices.index, errors="coerce")

    # Previous date within the same ticker; stays datetime typed, NaT on "New" rows.
    prices["prev_date"] = prices["cur_date"].shift(1).where(~is_new)

    # Calculate date gaps in prices using successive dates for tickers in database
    prices["GAP"] = prices["cur_date"] - prices["prev_date"]
    prices["missing"] = prices["GAP"] > datetime.timedelta(days=max_gap_days)

#detect_missing_prices(prices)

In [8]:
# Show date GAPS for a specific symbol
def show_missing_prices(ticker=None, data=None):
    """Return the rows flagged as missing, optionally restricted to one ticker.

    Parameters
    ----------
    ticker : str, optional
        When given, only rows whose ``Ticker`` equals this value are returned.
        When ``None``, flagged rows of every ticker are returned.
    data : pandas.DataFrame, optional
        Frame holding ``missing`` and ``Ticker`` columns. When omitted, the
        notebook-level ``prices`` variable is used, so it must exist.

    Returns
    -------
    pandas.DataFrame
        The rows of the frame where ``missing`` is True (and the ticker matches).
    """
    if data is None:
        # Fall back to the frame loaded at notebook level.
        data = prices

    # Explicit comparison kept so that only real True values are selected.
    mask = data["missing"] == True  # noqa: E712
    if ticker is not None:
        mask = mask & (data["Ticker"] == ticker)

    return data.loc[mask]

#missing_df = show_missing_prices()
#missing_df

In [9]:
#filter = prices["new_ticker"] == "New"
#prices.loc[filter]
# df["Trend"] = np.where(df["Close"] > df["SMA200"], "Up", "Down")

In [12]:
# Run all data quality indicators

# LOAD PRICES DATA AND RUN FEATURES FOR DATA QUALITY ANALYSIS
# Heavy extraction will take many seconds (more than 3 million rows)
# Load every daily price row, ordered by ticker and then by date as the gap detection requires
prices = get_all_prices()
# Adds the gap-analysis columns to `prices` in place (nothing is returned)
detect_missing_prices(prices)

# Last expression of the cell: displays the rows flagged as missing, read from the notebook-level `prices`
show_missing_prices()

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume,new_ticker,cur_date,prev_date,GAP,missing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-28 00:00:00,ACB,13.2,13.32,12.12,12.48,5163884.0,,2020-04-28,2020-03-31,28 days,True
2021-11-08 00:00:00,AFN,30.05,30.34,29.3,29.8,53083.0,,2021-11-08,2021-10-29,10 days,True
2021-12-29 00:00:00,AFN.DB.E,100.0,100.02,99.76,99.76,75000.0,,2021-12-29,2021-12-10,19 days,True
2022-01-06 00:00:00,AT,4.45,4.51,4.26,4.36,354620.0,,2022-01-06,2021-12-31,6 days,True
2022-01-06 00:00:00,ATA,50.33,51.72,49.49,51.17,172496.0,,2022-01-06,2021-12-31,6 days,True
...,...,...,...,...,...,...,...,...,...,...,...
2019-02-28 00:00:00,ZZZ,20.47,21.47,20.43,21.41,369986.0,,2019-02-28,2019-01-31,28 days,True
2019-11-25 00:00:00,ZZZ,19.49,19.59,19.29,19.36,74039.0,,2019-11-25,2019-10-28,28 days,True
2021-10-27 00:00:00,ZZZ,34.83,35.25,34.43,34.52,58479.0,,2021-10-27,2021-09-29,28 days,True
2019-10-29 00:00:00,ZZZD,31.63,31.72,31.6,31.67,7358.0,,2019-10-29,2019-10-01,28 days,True
