### Data Quality Analysis  
##### All features are wrapped in python functions (use Run All to define all functions and launch them individually)

In [1]:
import os
import sqlite3
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import datetime
import plotly.graph_objects as go
import talib as ta 

#### RUN this feature only once if you do NOT have historical data  
All price data will be stored as CSV files (one file per ticker) in the `CSV` folder

In [4]:
# Function to get historical data from Yahoo Finance and store it as a CSV file.
# Returns None on failure so callers can accumulate the symbols that are not
# available on Yahoo Finance for further investigation.
def yahoo_to_csv(ticker, exchange, start_date="2014-01-01", end_date=None):
    """Download daily prices for one ticker from Yahoo Finance and save them to CSV/<ticker>.csv.

    Args:
        ticker: Symbol as used locally (e.g. "TVA.B"). Dots are replaced by
            dashes to build the Yahoo symbol.
        exchange: "tsx" selects the ".TO" Yahoo suffix; any other value
            selects ".V".
        start_date: First date to extract, "YYYY-MM-DD". Defaults to
            "2014-01-01" (the previously hard-coded value).
        end_date: Last date to extract, "YYYY-MM-DD". None (default) means
            today's date.

    Returns:
        The downloaded DataFrame with an extra "Ticker" column, or None when
        any step (download or CSV write) raised; the error is printed.
    """
    if end_date is None:
        end_date = datetime.datetime.now().strftime("%Y-%m-%d")

    suffix = ".TO" if exchange == "tsx" else ".V"
    yahoo_symbol = ticker.replace(".", "-") + suffix

    try:
        data = pdr.DataReader(yahoo_symbol, "yahoo", start_date, end_date)
        data["Ticker"] = ticker
        data.index = pd.to_datetime(data.index)
        # Make sure the target folder exists; otherwise a missing folder would
        # be reported as a Yahoo failure for every single ticker.
        os.makedirs("CSV", exist_ok=True)
        data.to_csv(f"CSV/{ticker}.csv", index_label="Date", mode="w", date_format="%Y-%m-%d %H:%M:%S")
        return data
    except Exception as e:
        # Deliberate best-effort: the caller relies on None to flag the symbol.
        print(f"Unable to read Data from Yahoo : {e}")
        return None

# yahoo_to_csv("SHOP")

In [10]:
def csv_to_dataframe(file):
    """Load one price file from the CSV folder into a DataFrame.

    Args:
        file: File name inside the "CSV" folder (e.g. "TTG.csv").

    Returns:
        DataFrame indexed by the parsed "Date" column and restricted to the
        Ticker / Open / High / Low / Close / Volume columns, in that order.
    """
    kept_columns = ["Ticker", "Open", "High", "Low", "Close", "Volume"]
    path = f"CSV/{file}"

    frame = pd.read_csv(path, index_col="Date")
    # The index is read as text; convert it to real timestamps
    frame.index = pd.to_datetime(frame.index)
    return frame[kept_columns]

def insert_new_prices_in_DB(new_prices, db_prices, conn):
    """Append to the Prices_Daily table the rows of new_prices whose date is not stored yet.

    Args:
        new_prices: Prices for a single ticker, indexed by date, with a
            "Ticker" column. Only the first row's ticker is used to look up
            existing data. The frame is not modified.
        db_prices: Prices already in the database, indexed by date, with a
            "Ticker" column.
        conn: Open database connection passed to DataFrame.to_sql.

    Returns:
        The subset of new_prices that was written (empty when nothing is new).
    """
    # Nothing to look up or insert; also avoids an IndexError on values[0]
    if new_prices.empty:
        return new_prices

    ticker = new_prices["Ticker"].values[0]
    same_ticker = (db_prices["Ticker"] == ticker).to_numpy()

    # Compare real timestamps on both sides. Dates read back from SQLite are
    # plain strings, and matching a DatetimeIndex against strings with isin()
    # is deprecated in recent pandas (they will stop matching), which would
    # let duplicates through.
    stored_dates = pd.to_datetime(db_prices.index[same_ticker], errors="coerce")
    candidate_dates = pd.to_datetime(new_prices.index, errors="coerce")
    already_stored = candidate_dates.isin(stored_dates)

    rows_to_insert = new_prices.loc[~already_stored]
    rows_to_insert.to_sql("Prices_Daily", conn, if_exists='append', index=True)
    return rows_to_insert


def loop_through_local_csv(existing_prices, conn):
    """Insert the prices of every CSV file in the "CSV" folder into the database.

    Args:
        existing_prices: Prices already stored in the database; forwarded to
            insert_new_prices_in_DB to skip dates that are already present.
        conn: Open database connection forwarded to insert_new_prices_in_DB.
    """
    # Only real CSV files: stray entries in the folder (hidden files,
    # checkpoint directories, ...) would otherwise crash the reader.
    # Sorted because os.listdir returns names in arbitrary order.
    csv_files = sorted(
        name for name in os.listdir("CSV") if name.lower().endswith(".csv")
    )
    for file in csv_files:
        new_data = csv_to_dataframe(file)
        insert_new_prices_in_DB(new_data, existing_prices, conn)
        print(f"Finished processing : {file}")

# Read every stored price once, then merge all local CSV files into the database
conn1 = sqlite3.connect("TSX_Quality.sqlite")
sql = "SELECT * FROM prices_daily ORDER BY Ticker ASC, Date ASC"
db_prices = pd.read_sql_query(sql=sql, con=conn1, index_col="Date")
loop_through_local_csv(existing_prices=db_prices, conn=conn1)

Finished processing : TTG.csv
Finished processing : TTP.csv
Finished processing : TTR.csv
Finished processing : TTS.csv
Finished processing : TTZ.csv
Finished processing : TUD.csv
Finished processing : TUED.csv
Finished processing : TUF.csv
Finished processing : TUHY.csv
Finished processing : TULB.csv
Finished processing : TULV.csv
Finished processing : TUO.csv
Finished processing : TUP.P.csv
Finished processing : TUSB.csv
Finished processing : TUSB.U.csv
Finished processing : TV.csv
Finished processing : TV.WT.csv
Finished processing : TVA.B.csv
Finished processing : TVC.csv
Finished processing : TVC.WT.csv
Finished processing : TVE.csv
Finished processing : TVI.csv
Finished processing : TVK.csv
Finished processing : TWC.csv
Finished processing : TWM.csv
Finished processing : TWR.csv
Finished processing : TWY.csv
Finished processing : TXF.B.csv
Finished processing : TXF.csv
Finished processing : TXG.csv
Finished processing : TXP.csv
Finished processing : TXT.UN.csv
Finished processing

In [8]:
# Peek at one file name from the CSV folder listing.
# os.listdir returns names in arbitrary order, so position 2 is not a stable reference.
csv_files = os.listdir("CSV")
csv_files[2]

'TTG.csv'

In [3]:
# Load a single ticker file to inspect the DataFrame produced by csv_to_dataframe
n = csv_to_dataframe("TTG.csv")
n

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,TTG,0.22,0.22,0.22,0.22,0.0
2014-01-03,TTG,0.22,0.22,0.22,0.22,0.0
2014-01-06,TTG,0.22,0.22,0.22,0.22,0.0
2014-01-07,TTG,0.22,0.22,0.22,0.22,0.0
2014-01-08,TTG,0.22,0.22,0.22,0.22,0.0
...,...,...,...,...,...,...
2021-12-30,TTG,0.10,0.10,0.10,0.10,60000.0
2021-12-31,TTG,0.10,0.10,0.10,0.10,0.0
2022-01-04,TTG,0.10,0.10,0.10,0.10,0.0
2022-01-05,TTG,0.10,0.10,0.10,0.10,0.0


In [6]:
# Function to return a Dataframe with all symbols available in database
def get_all_symbols():
    """Get all symbols in database as a DataFrame.

    Reads the whole "symbols" table of TSX_Prices.sqlite ordered by ticker.

    Returns:
        DataFrame indexed by "ticker", without the "index", "url" and "yahoo"
        columns.
    """
    conn = sqlite3.connect("TSX_Prices.sqlite")
    try:
        data = pd.read_sql_query(
            "SELECT * FROM symbols ORDER BY ticker", conn, index_col="ticker"
        )
    finally:
        # Release the database handle even when the query fails
        conn.close()

    return data.drop(columns=["index", "url", "yahoo"])

In [7]:
# Heavy function to retrieve all prices from database (more than 3 million rows, takes many seconds to execute)
def get_all_prices():
    """Load every row of the prices_daily table of TSX_Prices.sqlite.

    Rows are ordered by upper-cased ticker, then by date, which is the order
    the gap analysis expects.

    Returns:
        DataFrame indexed by the "Date" column as stored in the database,
        without the "index" column.
    """
    conn = sqlite3.connect("TSX_Prices.sqlite")
    try:
        prices = pd.read_sql_query(
            "SELECT * FROM prices_daily ORDER BY UPPER(Ticker) ASC, Date ASC",
            conn,
            index_col="Date",
        )
    finally:
        # Release the database handle even when the query fails
        conn.close()

    return prices.drop(columns="index")

In [46]:
# Open a CSV file using ticker symbol to insert missing values in prices_daily table

# new_prices = {'Ticker': ["utest", "utest", "utest"], 'Open': [1.01, 2.01, 3.01], 'High': [12.02, 22.02, 32.02], 'Low': [13.03, 23.03, 33.03], 'Close': [14.04, 24.04, 34.04], 'Volume': [100001, 200002, 300003]}
# new_row = pd.DataFrame(data=new_prices, index=["2022-02-02 00:00:00","2022-02-03 00:00:00","2022-02-04 00:00:00"])
# new_row.index.name = "Date"
# filter = new_row.index.isin(prices.index)
# new_row.drop(new_row[filter].index, inplace = True)

# new_row.to_sql("Prices_Daily", conn2, if_exists='append', index=True)



In [31]:
# Display the loaded prices. `prices` must already exist in the kernel; the only visible
# assignment (`prices = get_all_prices()`) is commented out further down — TODO confirm where it is loaded.
prices

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-11-04 00:00:00,A,-,-,-,0.55,0.0
2021-11-05 00:00:00,A,0.61,0.61,0.61,0.61,1500.0
2021-11-08 00:00:00,A,-,-,-,0.61,100.0
2021-11-09 00:00:00,A,-,-,-,0.61,0.0
2021-11-10 00:00:00,A,0.62,0.62,0.56,0.56,1013.0
...,...,...,...,...,...,...
2021-12-30 00:00:00,ZZZD,28.27,28.27,28.25,28.21,930.0
2021-12-31 00:00:00,ZZZD,28.24,28.24,28.1,28.1,573.0
2022-01-04 00:00:00,ZZZD,28.19,28.35,28.19,28.34,5177.0
2022-01-05 00:00:00,ZZZD,28.37,28.46,28.18,28.18,3944.0


In [8]:
# Cell to loop through all symbols and extract price data
def loop_through_symbols_on_yahoo():
    """Download price history from Yahoo Finance for every symbol in the database.

    Each symbol is passed to yahoo_to_csv together with its "exchange" value.
    Symbols for which yahoo_to_csv returns None are printed and written, one
    per line, to "notfoundonyahoo.csv".

    Returns:
        The symbols DataFrame with an extra boolean "YahooExists" column
        (previously this column was computed and then discarded).
    """
    symbols = get_all_symbols()

    # Collect results first and assign the column once, instead of writing
    # into the frame while iterating over it.
    found_flags = []
    not_found_on_yahoo = []
    for symbol, exchange in symbols["exchange"].items():
        found = yahoo_to_csv(symbol, exchange) is not None
        found_flags.append(found)
        if not found:
            not_found_on_yahoo.append(symbol)

    symbols["YahooExists"] = np.array(found_flags, dtype=bool)

    print(not_found_on_yahoo)
    notfound_df = pd.DataFrame(not_found_on_yahoo)
    notfound_df.to_csv("notfoundonyahoo.csv", mode="w", index=False, header=False)
    return symbols
# 3634 elements from yahoo in 1:11 hours

#loop_through_symbols_on_yahoo()

### DATA QUALITY INDICATORS

In [9]:
# Identify the first price data for every ticker (to help in finding previous date for GAP analysis)
# Prices must be sorted by ascending  ticker symbol and ascending dates
# For every ticker+date combination, insert the date of the previous price data for the same ticker (to calculate the number of days between data and detect missing prices)
def detect_missing_prices(prices, max_gap_days=5):
    """Flag rows whose date is too far from the previous row of the same ticker.

    The frame is modified in place; nothing is returned. Rows must already be
    sorted by ticker, then by ascending date.

    Columns added:
        new_ticker: "New" on the first row of each run of identical tickers, "" otherwise.
        cur_date:   The index parsed as timestamps (NaT when unparseable).
        prev_date:  cur_date of the row above, NaT on the first row of a ticker.
        GAP:        cur_date - prev_date.
        missing:    True when GAP is strictly greater than max_gap_days days.

    Args:
        prices: DataFrame indexed by date with a "Ticker" column.
        max_gap_days: Largest gap, in days, that is still considered normal.
            Defaults to 5.
    """
    # A row starts a new ticker when its symbol differs from the row above.
    # Kept as a plain array so later steps never align on the (duplicated) date index.
    starts_new_ticker = (prices["Ticker"] != prices["Ticker"].shift(1)).to_numpy()
    prices["new_ticker"] = np.where(starts_new_ticker, "New", "")

    # No explicit format: stored values look like "2021-11-04 00:00:00", and a
    # date-only format combined with errors="coerce" can silently turn every
    # value into NaT when the format is enforced strictly.
    prices["cur_date"] = pd.to_datetime(prices.index, errors="coerce")

    # Stay in datetime dtype instead of round-tripping through an object array
    prices["prev_date"] = prices["cur_date"].shift(1).mask(starts_new_ticker)

    # Calculate date gaps in prices using successive dates for tickers in database
    prices["GAP"] = prices["cur_date"] - prices["prev_date"]
    prices["missing"] = prices["GAP"] > datetime.timedelta(days=max_gap_days)

#detect_missing_prices(prices)

In [10]:
# Show date GAPS for a specific symbol
def show_missing_prices(ticker=None, data=None):
    """Return the rows flagged as following a date gap.

    Args:
        ticker: When given, only rows of this ticker are returned; None
            (default) returns flagged rows for all tickers.
        data: DataFrame with "missing" and "Ticker" columns. None (default)
            falls back to the notebook-level `prices` frame, as before.

    Returns:
        The rows of the frame whose "missing" value equals True.
    """
    # Explicit argument wins; the global is only touched when none is given
    frame = prices if data is None else data

    selected = frame["missing"] == True
    if ticker is not None:
        selected = selected & (frame["Ticker"] == ticker)

    return frame.loc[selected]

#missing_df = show_missing_prices()
#missing_df

In [11]:
#filter = prices["new_ticker"] == "New"
#prices.loc[filter]
# df["Trend"] = np.where(df["Close"] > df["SMA200"], "Up", "Down")

In [12]:
# Run all data quality indicators

# LOAD PRICES DATA AND RUN FEATURES FOR DATA QUALITY ANALYSIS
# Heavy extraction will take many seconds (more than 3 million rows)
# prices = get_all_prices()
# detect_missing_prices(prices)

# show_missing_prices()