In [1]:
import os
from urllib.parse import quote

import pandas as pd
import sqlalchemy as db
from dotenv import load_dotenv

In [2]:
load_dotenv()

# config and credentials (read from the process environment / .env file)
# NOTE(review): on Windows the OS already defines USERNAME and environment
# names are case-insensitive there; load_dotenv() does not override existing
# variables unless override=True, so the .env 'username' may be ignored --
# confirm on the target platform.
server = os.getenv('server')
database = os.getenv('database')
username = os.getenv('username')
password = os.getenv('password')

# Fail early with a readable message instead of building a URL that
# contains the literal text "None".
missing = [
    name
    for name, value in (
        ('server', server),
        ('database', database),
        ('username', username),
        ('password', password),
    )
    if value is None
]
if missing:
    raise RuntimeError(
        'Missing required environment variable(s): {}'.format(', '.join(missing))
    )

# connection
# Credentials are percent-encoded so that characters such as '@', ':' or '/'
# in the user name or password cannot corrupt the URL.
engine = db.create_engine(
    'mssql://{}:{}@{}/{}?driver=ODBC+Driver+18+for+SQL+Server'.format(
        quote(username, safe=''), quote(password, safe=''), server, database
    )
)

# establish connection (shared by every query function in this notebook)
connection = engine.connect()

### Daily Distinct Cusips and Total Volume by Retail / Institutional Grouping

In [5]:
def distinctCusips_totalVolume_byRetailInstitutional(year_start, year_end, discard_nr):
    """Daily distinct-CUSIP count and summed volume per trade-size bucket.

    Each trade is labelled from ``EntrdVolQt``: ``'R'`` below 250000 and
    ``'IN'`` at 500000 or above. The CASE expression has no ELSE branch, so
    trades from 250000 up to (but excluding) 500000 get a NULL
    ``RetailThreshold`` and are aggregated as a group of their own.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``RetailThreshold``, ``DistinctCusips`` and
        ``TotalVolume``, ordered by date and bucket. The query is executed
        on the notebook-level ``connection``.
    """
    # The sub-query is deliberately left open: the optional rating predicate
    # must land inside its WHERE clause before the closing parenthesis.
    head_sql = '''
        SELECT 
            TrdExctnDt,
            RetailThreshold,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM (
            SELECT
                TrdExctnDt,
                CASE WHEN EntrdVolQt < 250000 THEN 'R' WHEN EntrdVolQt >= 500000 THEN 'IN' END AS RetailThreshold,
                CusipId,
                EntrdVolQt
            FROM
                TraceFilteredWithRatings
            WHERE
                TrdExctnDt >= '{first_year}-01-1' AND TrdExctnDt < '{next_year}-01-01'
    '''.format(first_year=year_start, next_year=year_end + 1)

    rating_sql = 'AND RatingNum <> 0' if discard_nr else ''

    tail_sql = '''
        ) A
        GROUP BY
            TrdExctnDt,
            RetailThreshold
        ORDER BY
            TrdExctnDt, 
            RetailThreshold
    '''

    return pd.read_sql(head_sql + rating_sql + tail_sql, connection)

In [6]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_totalVolume_byRetailInstitutional(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_totalVolume_byRetailInstitutional_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,RetailThreshold,DistinctCusips,TotalVolume
0,2002-07-01,,266,159587000.0
1,2002-07-01,IN,687,7062970000.0
2,2002-07-01,R,1487,270179600.0
3,2002-07-02,,285,161637000.0
4,2002-07-02,IN,735,13176950000.0


### Daily Distinct Cusips and Total Volume

In [5]:
def distinctCusips_totalVolume(year_start, year_end, discard_nr):
    """Daily distinct-CUSIP count and summed ``EntrdVolQt``.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``DistinctCusips`` and ``TotalVolume``, ordered by
        date. The query is executed on the notebook-level ``connection``.
    """
    fragments = [
        '''
        SELECT 
            TrdExctnDt,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM
            TraceFilteredWithRatings
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, year_end + 1)
    ]

    # Optional predicate goes between the WHERE clause and the GROUP BY.
    if discard_nr:
        fragments.append('AND RatingNum <> 0')

    fragments.append('''
        GROUP BY
            TrdExctnDt
        ORDER BY
            TrdExctnDt
    ''')

    return pd.read_sql(''.join(fragments), connection)

In [6]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_totalVolume(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_totalVolume_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,DistinctCusips,TotalVolume
0,2002-07-01,1840,7525011000.0
1,2002-07-02,1875,13713740000.0
2,2002-07-03,1533,4861909000.0
3,2002-07-05,980,2110523000.0
4,2002-07-08,1987,13933420000.0


### Daily Total Volume by Side (Buy/Sell) 

In [7]:
def totalVolume_bySide(year_start, year_end, discard_nr):
    """Daily summed ``EntrdVolQt`` and distinct-CUSIP count per ``RptSideCd``.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``RptSideCd``, ``TotalVolume`` and
        ``DistinctCusips``, ordered by date and side. The query is executed
        on the notebook-level ``connection``.
    """
    window_sql = '''
        SELECT 
            TrdExctnDt,
            RptSideCd,
            SUM(EntrdVolQt) AS TotalVolume,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM
            TraceFilteredWithRatings
        WHERE
            TrdExctnDt >= '{lower}-01-1' AND TrdExctnDt < '{upper}-01-01'
    '''.format(lower=year_start, upper=year_end + 1)

    grouping_sql = '''
        GROUP BY
            TrdExctnDt, 
            RptSideCd
        ORDER BY
            TrdExctnDt, 
            RptSideCd
    '''

    # The rating predicate, when requested, extends the WHERE clause.
    if discard_nr:
        sql = window_sql + 'AND RatingNum <> 0' + grouping_sql
    else:
        sql = window_sql + grouping_sql

    return pd.read_sql(sql, connection)

In [8]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = totalVolume_bySide(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/totalVolume_bySide_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,RptSideCd,TotalVolume,DistinctCusips
0,2002-07-01,B,3311436000.0,1378
1,2002-07-01,S,4213575000.0,1429
2,2002-07-02,B,7042704000.0,1432
3,2002-07-02,S,6671035000.0,1537
4,2002-07-03,B,2082034000.0,1056


### Daily Total Volume by Investment Grade

In [3]:
def totalVolume_byInvestmentGrade(year_start, year_end):
    """Daily summed ``EntrdVolQt`` split into ``'IG'`` and ``'HY'``.

    Rows with ``RatingNum = 0`` are always excluded. Of the remaining rows,
    ``RatingNum < 11`` is labelled ``'IG'`` and everything else ``'HY'``.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``InvestGrade`` and ``TotalVolume``, ordered by date
        and grade. The query is executed on the notebook-level
        ``connection``.
    """
    sql_template = '''
        SELECT 
            TrdExctnDt,
            InvestGrade,
            SUM(EntrdVolQt) AS TotalVolume
        FROM (
            SELECT
                TrdExctnDt,
                EntrdVolQt,
                CASE WHEN RatingNum < 11 THEN 'IG' ELSE 'HY' END AS InvestGrade
            FROM
                TraceFilteredWithRatings
            WHERE
                RatingNum <> 0
                AND TrdExctnDt >= '{lower}-01-1' AND TrdExctnDt < '{upper}-01-01'
        ) A
        GROUP BY
            TrdExctnDt, InvestGrade
        ORDER BY
            TrdExctnDt, InvestGrade
    '''

    # Upper bound is exclusive, hence the following year.
    sql = sql_template.format(lower=year_start, upper=year_end + 1)

    return pd.read_sql(sql, connection)

In [4]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = totalVolume_byInvestmentGrade(
    year_start=start,
    year_end=end,
)
df.to_csv(
    'source/totalVolume_byInvestGrade_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,InvestGrade,TotalVolume
0,2002-07-01,HY,2608434000.0
1,2002-07-01,IG,4884303000.0
2,2002-07-02,HY,6377276000.0
3,2002-07-02,IG,7299102000.0
4,2002-07-03,HY,972450900.0


### Daily Transaction Price

In [5]:
def cusip_transactionPrice(year_start, year_end):
    """Daily average transaction cost (log price ratio x 10000).

    For every row with ``CntraMpId = 'C'`` inside the date window the query
    computes ``LOG(RptdPr / benchmark)``, multiplied by -1 when
    ``RptSideCd = 'B'`` and by +1 otherwise. The benchmark is the ``RptdPr``
    of rows of the same date and CUSIP whose ``TrdExctnTm`` equals the
    latest ``TrdExctnTm`` among ``CntraMpId = 'D'`` rows (presumably the
    last inter-dealer trade of the day -- confirm). Costs are scaled by
    10000, averaged per (date, CUSIP), then averaged across CUSIPs per date.

    NOTE(review): the rows picked at the benchmark timestamp are not
    themselves restricted to ``CntraMpId = 'D'``, and several rows sharing
    that timestamp each produce a join match -- confirm this is intended.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt`` and ``TransactionCost``, ordered by date. The query
        is executed on the notebook-level ``connection``.
    """
    base_query = '''
        WITH TransactionCostPerCusipPerDay (TrdExctnDt, CusipId, TransactionCost) AS (
            SELECT
                TrdExctnDt,
                CusipId,
                LOG(RptdPr/RptdPrBench) * CASE WHEN RptSideCd = 'B' THEN -1 ELSE 1 END AS TransactionCost
            FROM (
                SELECT
                    A.TrdExctnDt,
                    A.CusipId,
                    A.RptdPr,
                    A.RptSideCd,
                    B.RptdPr AS RptdPrBench
                FROM
                    TraceFilteredWithRatings A
                INNER JOIN (
                    SELECT
                        A.TrdExctnDt,
                        A.CusipId,
                        A.RptdPr -- benchmark price
                    FROM
                        TraceFilteredWithRatings A
                    INNER JOIN (    
                        SELECT
                            TrdExctnDt,
                            MAX(TrdExctnTm) AS LastTimeTrade,
                            CusipId
                        FROM
                            TraceFilteredWithRatings
                        WHERE
                            CntraMpId = 'D'
                        GROUP BY 
                            TrdExctnDt,
                            CusipId
                    ) B ON A.TrdExctnDt = B.TrdExctnDt AND A.TrdExctnTm = B.LastTimeTrade AND A.CusipId = B.CusipId
                ) B ON A.TrdExctnDt = B.TrdExctnDt AND A.CusipId = B.CusipId
                WHERE
                    A.CntraMpId = 'C'
                    AND A.TrdExctnDt >= '{}-01-1' AND A.TrdExctnDt < '{}-01-01'
            ) A
        )
        SELECT
            TrdExctnDt,
            AVG(AverageTransactionCost) AS TransactionCost
        FROM (
            SELECT
                TrdExctnDt,
                AVG(TransactionCost * 10000) AS AverageTransactionCost
            FROM
                TransactionCostPerCusipPerDay
            GROUP BY
                TrdExctnDt, 
                CusipId
        ) A
        GROUP BY
            TrdExctnDt
        ORDER BY
            TrdExctnDt
    '''.format(
        # The date filter is qualified with A. above because both joined
        # sources (A and the benchmark sub-query B) expose TrdExctnDt; an
        # unqualified reference is ambiguous to SQL Server.
        year_start,
        year_end + 1
    )

    df = pd.read_sql(base_query, connection)

    return df

In [6]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = cusip_transactionPrice(
    start,
    end,
)
df.to_csv(
    'source/cusip_transactionPrice_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,TransactionCost
0,2002-07-01,101.215777
1,2002-07-02,106.180431
2,2002-07-03,101.542214
3,2002-07-05,109.080228
4,2002-07-08,94.961871


### Industry Analysis on Cusips and Issuers

In [9]:
def distinctCusips_distinctIssuers_byIndustry(year_start, year_end, discard_nr):
    """Distinct CUSIP and distinct issuer counts per ``IndustryCode``.

    Counts cover the whole date window (no per-day breakdown).

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``IndustryCode``, ``DistinctCusips`` and ``DistinctIssuers``,
        ordered by industry code. The query is executed on the
        notebook-level ``connection``.
    """
    # Sub-query stays open so the rating predicate can join its WHERE clause.
    select_sql = '''
        SELECT
            IndustryCode,
            DistinctCusips,
            DistinctIssuers
        FROM (
            SELECT 
                IndustryCode,
                COUNT(DISTINCT CusipId) AS DistinctCusips,
                COUNT(DISTINCT IssuerID) AS DistinctIssuers
            FROM
                TraceFilteredWithRatings
            WHERE
                TrdExctnDt >= '{first_year}-01-1' AND TrdExctnDt < '{next_year}-01-01'
    '''.format(first_year=year_start, next_year=year_end + 1)

    rating_sql = 'AND RatingNum <> 0' if discard_nr else ''

    closing_sql = '''
            GROUP BY
                IndustryCode 
        ) A
        ORDER BY
            IndustryCode
    '''

    return pd.read_sql(''.join((select_sql, rating_sql, closing_sql)), connection)

In [10]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_distinctIssuers_byIndustry(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_distinctIssuers_byIndustry_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,IndustryCode,DistinctCusips,DistinctIssuers
0,10,5778,851
1,11,1448,247
2,12,1596,294
3,13,82,11
4,14,1089,146


### Daily Distinct Cusips and Total Volume by Issuer

In [11]:
def distinctCusips_totalVolume_byIssuer(year_start, year_end, discard_nr):
    """Daily distinct-CUSIP count and summed ``EntrdVolQt`` per ``IssuerId``.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``IssuerId``, ``DistinctCusips`` and
        ``TotalVolume``, ordered by date and issuer. The query is executed
        on the notebook-level ``connection``.
    """
    pieces = []

    pieces.append('''
        SELECT
            TrdExctnDt,
            IssuerId,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM 
            TraceFilteredWithRatings
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, year_end + 1))

    # Optional predicate extends the WHERE clause just built.
    if discard_nr:
        pieces.append('AND RatingNum <> 0')

    pieces.append('''
        GROUP BY
            TrdExctnDt, 
            IssuerId
        ORDER BY
            TrdExctnDt, 
            IssuerId
    ''')

    return pd.read_sql(''.join(pieces), connection)

In [12]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_totalVolume_byIssuer(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_totalVolume_byIssuer_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,IssuerId,DistinctCusips,TotalVolume
0,2002-07-01,11,5,7968000.0
1,2002-07-01,20,1,5000.0
2,2002-07-01,26,13,38126000.0
3,2002-07-01,34,1,200000.0
4,2002-07-01,41,3,9613000.0


### Daily Distinct Cusips by Rating

In [13]:
def distinctCusips_byRating(year_start, year_end, discard_nr):
    """Daily distinct-CUSIP count per ``RatingNum``.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt``, ``RatingNum`` and ``DistinctCusips``, ordered by
        date and rating. The query is executed on the notebook-level
        ``connection``.
    """
    where_sql = '''
        SELECT
            TrdExctnDt,
            RatingNum,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM 
            TraceFilteredWithRatings A
        WHERE
            A.TrdExctnDt >= '{lower}-01-1' AND A.TrdExctnDt < '{upper}-01-01'
    '''.format(lower=year_start, upper=year_end + 1)

    group_sql = '''
        GROUP BY
            TrdExctnDt,
            RatingNum
        ORDER BY
            TrdExctnDt,
            RatingNum
    '''

    # Insert the rating predicate between WHERE and GROUP BY when requested.
    extra_sql = 'AND RatingNum <> 0' if discard_nr else ''

    return pd.read_sql(where_sql + extra_sql + group_sql, connection)

In [14]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_byRating(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_byRating_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,RatingNum,DistinctCusips
0,2002-07-01,1.0,58
1,2002-07-01,2.0,68
2,2002-07-01,3.0,112
3,2002-07-01,4.0,182
4,2002-07-01,5.0,186


### Distinct Cusips per Maturity Band

In [15]:
def distinctCusips_byMaturity(year_start, year_end, discard_nr):
    """Distinct CUSIP count per maturity band over the whole window.

    The band is derived from the absolute number of days between
    ``Maturity`` and ``OfferingDate`` divided by 360: band 1 when the
    result is below 5, band 2 when below 15, band 3 otherwise.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        ``MaturityBand`` and ``DistinctCusips``, ordered by band. The query
        is executed on the notebook-level ``connection``.
    """
    # Sub-query stays open so the rating predicate can join its WHERE clause.
    banding_sql = '''
        SELECT
            MaturityBand,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CASE 
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 5 THEN 1
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 15 THEN 2
                    ELSE 3
                END AS MaturityBand,
                CusipId
            FROM
                TraceFilteredWithRatings
            WHERE
                TrdExctnDt >= '{first_year}-01-1' AND TrdExctnDt < '{next_year}-01-01'
    '''.format(first_year=year_start, next_year=year_end + 1)

    aggregate_sql = '''
        ) A
        GROUP BY
            MaturityBand
        ORDER BY
            MaturityBand
    '''

    if discard_nr:
        sql = banding_sql + 'AND RatingNum <> 0' + aggregate_sql
    else:
        sql = banding_sql + aggregate_sql

    return pd.read_sql(sql, connection)

In [16]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_byMaturity(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctCusips_byMaturity_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,MaturityBand,DistinctCusips
0,1,4602
1,2,17477
2,3,4508


### Distinct Cusips per Investment Grade

In [17]:
def distinctCusips_byInvestmentGrade(year_start, year_end):
    """Distinct CUSIP count per grade label over the whole window.

    Labels come from ``RatingNum``: ``'NR'`` when it equals 0, ``'IG'`` when
    it is below 11, ``'HY'`` for everything else (the ELSE branch, which
    would also catch a NULL ``RatingNum`` if the table contains any).

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.

    Returns
    -------
    pandas.DataFrame
        ``InvestmentGrade`` and ``DistinctCusips``, ordered by label. The
        query is executed on the notebook-level ``connection``.
    """
    sql_template = '''
        SELECT
            InvestmentGrade,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CusipId,
                CASE
                    WHEN RatingNum = 0 THEN 'NR'
                    WHEN RatingNum < 11 THEN 'IG'
                    ELSE 'HY'
                END AS InvestmentGrade
            FROM 
                TraceFilteredWithRatings
            WHERE
                TrdExctnDt >= '{lower}-01-1' AND TrdExctnDt < '{upper}-01-01'
        ) A
        GROUP BY
            InvestmentGrade
        ORDER BY
            InvestmentGrade
    '''

    # Upper bound is exclusive, hence the following year.
    sql = sql_template.format(lower=year_start, upper=year_end + 1)

    return pd.read_sql(sql, connection)

In [18]:
# Calendar-year window passed to the query and embedded in the file name.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctCusips_byInvestmentGrade(
    year_start=start,
    year_end=end,
)
df.to_csv(
    'source/distinctCusips_byInvestmentGrade_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,InvestmentGrade,DistinctCusips
0,HY,5961
1,IG,23049
2,NR,45019


### Distinct Trading Days

In [19]:
def distinctTradingDays(year_start, year_end, discard_nr):
    """Count the distinct ``TrdExctnDt`` values inside the date window.

    Parameters
    ----------
    year_start
        First calendar year of the window (opens on 1 January).
    year_end
        Last calendar year of the window; it closes just before 1 January
        of ``year_end + 1``.
    discard_nr
        When truthy, rows with ``RatingNum = 0`` are excluded
        (presumably the "not rated" code -- confirm against the table).

    Returns
    -------
    pandas.DataFrame
        A single row with one column, ``DistinctTradingDays``. The query is
        executed on the notebook-level ``connection``.
    """
    # The aggregate is aliased so the DataFrame (and the CSV exported from
    # it) carries a real column header instead of an empty one.
    base_query = '''
        SELECT
            COUNT(DISTINCT TrdExctnDt) AS DistinctTradingDays
        FROM
            TraceFilteredWithRatings
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(
        year_start,
        year_end + 1
    )

    if discard_nr:
        base_query += 'AND RatingNum <> 0'

    df = pd.read_sql(base_query, connection)

    return df

In [20]:
# Calendar-year window passed to the query and embedded in the file name.
# MIN = 2002-07-01 | MAX = 2022-09-30
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = distinctTradingDays(
    year_start=start,
    year_end=end,
    discard_nr=True,
)
df.to_csv(
    'source/distinctTradingDays_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,Unnamed: 1
0,5099


In [7]:
def totalVolume_forInterdealer(year_start=None, year_end=None):
    """Daily summed ``EntrdVolQt`` for rows with ``CntraMpId = 'D'``.

    Parameters
    ----------
    year_start : int, optional
        First calendar year to include (window opens on 1 January). When
        omitted there is no lower date bound.
    year_end : int, optional
        Last calendar year to include (window closes just before 1 January
        of ``year_end + 1``). When omitted there is no upper date bound.

    Returns
    -------
    pandas.DataFrame
        ``TrdExctnDt`` and ``TotaVolume``, ordered by date. The query is
        executed on the notebook-level ``connection``.

    Notes
    -----
    Called without arguments the function scans the whole table, as it did
    before the date parameters were added. The column alias ``TotaVolume``
    (sic) is kept unchanged because it is the header of the exported CSV.
    """
    base_query = '''
        SELECT
            TrdExctnDt,
            SUM(EntrdVolQt) AS TotaVolume
        FROM
            TraceFilteredWithRatings
        WHERE
            CntraMpId = 'D'
    '''

    # int() keeps anything but a year number out of the SQL text.
    if year_start is not None:
        base_query += "AND TrdExctnDt >= '{}-01-01'\n    ".format(int(year_start))

    if year_end is not None:
        base_query += "AND TrdExctnDt < '{}-01-01'\n    ".format(int(year_end) + 1)

    base_query += '''
        GROUP BY
            TrdExctnDt
        ORDER BY 
            TrdExctnDt
    '''

    df = pd.read_sql(base_query, connection)

    return df

# Years used only for the output file name; the query call takes no arguments.
start = 2002
end = 2022

# Run the query, persist the result as CSV, then preview the first rows.
df = totalVolume_forInterdealer()
df.to_csv(
    'source/totalVolume_forInterdealer_{}-{}.csv'.format(start, end),
    index=False,
)
df.head()

Unnamed: 0,TrdExctnDt,TotaVolume
0,2002-07-01,1611082000.0
1,2002-07-02,2698155000.0
2,2002-07-03,1295917000.0
3,2002-07-05,880201000.0
4,2002-07-08,4195696000.0
