In [1]:
import os
from urllib.parse import quote

import pandas as pd
import sqlalchemy as db
from dotenv import load_dotenv

In [2]:
load_dotenv()

# config and credentials (read from the process environment / .env file)
server = os.getenv('server')
database = os.getenv('database')
username = os.getenv('user')
password = os.getenv('password')

# Fail early with a readable message: os.getenv returns None for an unset
# variable, which would otherwise end up as the literal text "None" in the URL.
missing_settings = [
    name
    for name, value in (
        ('server', server),
        ('database', database),
        ('user', username),
        ('password', password),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): {}'.format(', '.join(missing_settings))
    )

# connection
# The credentials are percent-encoded before being placed in the URL so that
# characters such as "@", ":", "/" or "%" in the user name / password do not
# corrupt the URL. The values in the environment are expected to be the raw
# (un-escaped) credentials.
engine = db.create_engine(
    'mssql://{}:{}@{}/{}?driver=ODBC+Driver+18+for+SQL+Server'.format(
        quote(username, safe=''),
        quote(password, safe=''),
        server,
        database,
    )
)

# establish connection (shared by every query function defined below)
connection = engine.connect()

### Daily Distinct Cusips and Total Volume by Retail / Institutional grouping

In [3]:
def distinctCusips_totalVolume_by_retail_institutional(year_start, year_end, discard_nr, retail_threshold=100000):
    """Daily distinct CUSIPs and total volume, split into retail / institutional trades.

    Every trade in ``Trace_withRatings_filtered`` is labelled ``'R'`` when its
    ``EntrdVolQt`` is strictly below ``retail_threshold`` and ``'IN'`` otherwise;
    the labelled trades are then aggregated per ``TrdExctnDt`` and label.

    Parameters
    ----------
    year_start : int
        First calendar year included (trades from January 1st of that year).
    year_end : int
        Last calendar year included (trades strictly before January 1st of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.
    retail_threshold : int or float, optional
        Volume cut-off between the two groups. Defaults to 100000, the value
        that was previously hard-coded in the query.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RetailThreshold``, ``DistinctCusips`` and
        ``TotalVolume``, ordered by date and group.
    """
    # Coerce to a plain number so only a numeric literal can reach the SQL text;
    # ints are kept as ints so the default renders exactly as before ("100000").
    if isinstance(retail_threshold, int):
        threshold = retail_threshold
    else:
        threshold = float(retail_threshold)

    base_query = '''
        SELECT 
            TrdExctnDt,
            RetailThreshold,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM (
            SELECT
                TrdExctnDt,
                CASE WHEN EntrdVolQt < {threshold} THEN 'R' ELSE 'IN' END AS RetailThreshold,
                CusipId,
                EntrdVolQt
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{first_year}-01-1' AND TrdExctnDt < '{next_year}-01-01'
    '''.format(
        threshold=threshold,
        first_year=year_start,
        next_year=year_end + 1
    )

    # The template ends inside the WHERE clause of the sub-query, so the
    # optional rating filter can simply be appended.
    if discard_nr:
        base_query += 'AND RatingNum <> 0'

    # Close the sub-query and aggregate per day and group.
    base_query += '''
        ) A
        GROUP BY
            TrdExctnDt,
            RetailThreshold
        ORDER BY
            TrdExctnDt, 
            RetailThreshold
    '''

    df = pd.read_sql(base_query, connection)

    return df

In [4]:
start, end = 2002, 2022

df = distinctCusips_totalVolume_by_retail_institutional(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_totalVolume_by_retail_institutional_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RetailThreshold,DistinctCusips,TotalVolume
0,2009-01-26,IN,1282,6047935000.0
1,2009-01-26,R,1694,221701400.0
2,2009-01-27,IN,1379,7549069000.0
3,2009-01-27,R,1773,250266800.0
4,2009-01-28,IN,1366,8008452000.0


### Daily Distinct Cusips and Total Volume

In [5]:
def distinctCusips_totalVolume(year_start, year_end, discard_nr):
    """Daily number of distinct CUSIPs and summed entered volume.

    Queries ``Trace_withRatings_filtered`` for trades executed from January 1st
    of ``year_start`` up to (but excluding) January 1st of ``year_end + 1`` and
    aggregates them per ``TrdExctnDt``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``DistinctCusips`` and ``TotalVolume``, ordered
        by date.
    """
    select_template = '''
        SELECT 
            TrdExctnDt,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''
    aggregation_clause = '''
        GROUP BY
            TrdExctnDt
        ORDER BY
            TrdExctnDt
    '''

    # The template stops inside the WHERE clause, so the optional rating filter
    # slots in between the date filter and the GROUP BY.
    rating_filter = 'AND RatingNum <> 0' if discard_nr else ''

    sql = select_template.format(year_start, year_end + 1) + rating_filter + aggregation_clause

    return pd.read_sql(sql, connection)

In [6]:
start, end = 2002, 2022

df = distinctCusips_totalVolume(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_totalVolume_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,DistinctCusips,TotalVolume
0,2009-01-26,2141,6269636000.0
1,2009-01-27,2231,7799335000.0
2,2009-01-28,2158,8220242000.0
3,2009-01-29,2080,8333807000.0
4,2009-01-30,2019,9632648000.0


### Daily Total Volume by Side (Buy/Sell) 

In [7]:
def totalVolume_by_side(year_start, year_end, discard_nr):
    """Daily summed entered volume per reporting side.

    Queries ``Trace_withRatings_filtered`` for trades executed from January 1st
    of ``year_start`` up to (but excluding) January 1st of ``year_end + 1`` and
    sums ``EntrdVolQt`` per ``TrdExctnDt`` and ``RptSideCd``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RptSideCd`` and ``TotalVolume``, ordered by
        date and side.
    """
    next_year = year_end + 1

    # Pieces of the statement, concatenated in order at the end.
    pieces = [
        '''
        SELECT 
            TrdExctnDt,
            RptSideCd,
            SUM(EntrdVolQt) as TotalVolume
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, next_year)
    ]

    if discard_nr:
        # Extends the still-open WHERE clause.
        pieces.append('AND RatingNum <> 0')

    pieces.append('''
        GROUP BY
            TrdExctnDt, 
            RptSideCd
        ORDER BY
            TrdExctnDt, 
            RptSideCd
    ''')

    result = pd.read_sql(''.join(pieces), connection)
    return result

In [8]:
start, end = 2002, 2022

df = totalVolume_by_side(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/totalVolume_by_side_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RptSideCd,TotalVolume
0,2009-01-26,B,3218375000.0
1,2009-01-26,S,3051261000.0
2,2009-01-27,B,4121970000.0
3,2009-01-27,S,3677366000.0
4,2009-01-28,B,3929067000.0


### Industry analysis on Cusips and Issuers

In [9]:
def industry_analysis_on_cusips_issuers(year_start, year_end, discard_nr):
    """Number of distinct CUSIPs and distinct issuers per industry code.

    Considers trades in ``Trace_withRatings_filtered`` executed from January
    1st of ``year_start`` up to (but excluding) January 1st of ``year_end + 1``
    and counts, for each ``IndustryCode``, the distinct ``CusipId`` and
    ``IssuerID`` values.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``IndustryCode``, ``DistinctCusips`` and ``DistinctIssuers``,
        ordered by industry code.
    """
    fragments = []

    # Outer SELECT plus the start of the aggregating sub-query; the text stops
    # inside the sub-query's WHERE clause.
    fragments.append('''
        SELECT
            IndustryCode,
            DistinctCusips,
            DistinctIssuers
        FROM (
            SELECT 
                IndustryCode,
                COUNT(DISTINCT CusipId) AS DistinctCusips,
                COUNT(DISTINCT IssuerID) AS DistinctIssuers
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, year_end + 1))

    if discard_nr:
        fragments.append('AND RatingNum <> 0')

    # Finish the sub-query (grouping happens inside it) and order the result.
    fragments.append('''
            GROUP BY
                IndustryCode 
        ) A
        ORDER BY
            IndustryCode
    ''')

    return pd.read_sql(''.join(fragments), connection)

In [10]:
start, end = 2002, 2022

df = industry_analysis_on_cusips_issuers(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/industry_analysis_on_cusips_issuers_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,IndustryCode,DistinctCusips,DistinctIssuers
0,10,4366,631
1,11,936,152
2,12,1303,244
3,13,61,8
4,14,806,109


### Daily Distinct Cusips and Total Volume by Issuer

In [11]:
def distinctCusips_totalVolume_by_issuer(year_start, year_end, discard_nr):
    """Daily distinct CUSIPs and summed entered volume per issuer.

    Queries ``Trace_withRatings_filtered`` for trades executed from January 1st
    of ``year_start`` up to (but excluding) January 1st of ``year_end + 1`` and
    aggregates them per ``TrdExctnDt`` and ``IssuerId``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``IssuerId``, ``DistinctCusips`` and
        ``TotalVolume``, ordered by date and issuer.
    """
    sql = '''
        SELECT
            TrdExctnDt,
            IssuerId,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM 
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, year_end + 1)

    grouping = '''
        GROUP BY
            TrdExctnDt, 
            IssuerId
        ORDER BY
            TrdExctnDt, 
            IssuerId
    '''

    # The rating filter, when requested, extends the WHERE clause that is still
    # open at the end of the first fragment.
    if discard_nr:
        sql = sql + 'AND RatingNum <> 0' + grouping
    else:
        sql = sql + grouping

    return pd.read_sql(sql, connection)

In [12]:
start, end = 2002, 2022

df = distinctCusips_totalVolume_by_issuer(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_totalVolume_by_issuer_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,IssuerId,DistinctCusips,TotalVolume
0,2009-01-26,11,3,12404000.0
1,2009-01-26,13,1,1326000.0
2,2009-01-26,25,2,12030000.0
3,2009-01-26,26,3,374000.0
4,2009-01-26,41,8,9591000.0


### Daily Distinct Cusips by Rating

In [13]:
def distinctCusips_by_rating(year_start, year_end, discard_nr):
    """Daily number of distinct CUSIPs per rating number.

    Queries ``Trace_withRatings_filtered`` for trades executed from January 1st
    of ``year_start`` up to (but excluding) January 1st of ``year_end + 1`` and
    counts the distinct ``CusipId`` values per ``TrdExctnDt`` and ``RatingNum``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RatingNum`` and ``DistinctCusips``, ordered
        by date and rating number.
    """
    head_template = '''
        SELECT
            TrdExctnDt,
            RatingNum,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM 
            Trace_withRatings_filtered A
        WHERE
            A.TrdExctnDt >= '{}-01-1' AND A.TrdExctnDt < '{}-01-01'
    '''
    tail = '''
        GROUP BY
            TrdExctnDt,
            RatingNum
        ORDER BY
            TrdExctnDt,
            RatingNum
    '''

    statement = head_template.format(year_start, year_end + 1)
    if discard_nr:
        # Still inside the WHERE clause at this point.
        statement = statement + 'AND RatingNum <> 0'
    statement = statement + tail

    ratings_per_day = pd.read_sql(statement, connection)
    return ratings_per_day

In [14]:
start, end = 2002, 2022

df = distinctCusips_by_rating(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_by_rating_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RatingNum,DistinctCusips
0,2009-01-26,1,97
1,2009-01-26,2,4
2,2009-01-26,3,38
3,2009-01-26,4,107
4,2009-01-26,5,255


### Distinct Cusips per Maturity Band

In [15]:
def distinctCusips_by_maturity(year_start, year_end, discard_nr):
    """Number of distinct CUSIPs per maturity band.

    For trades in ``Trace_withRatings_filtered`` executed from January 1st of
    ``year_start`` up to (but excluding) January 1st of ``year_end + 1``, the
    absolute number of days between ``Maturity`` and ``OfferingDate`` is
    divided by 360 and mapped to a band:

    * ``1`` when the result is below 5,
    * ``2`` when it is below 15,
    * ``3`` otherwise.

    Distinct ``CusipId`` values are then counted per band.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``MaturityBand`` and ``DistinctCusips``, ordered by band.
    """
    # Outer aggregation plus the band-labelling sub-query, left open inside the
    # sub-query's WHERE clause.
    opening = '''
        SELECT
            MaturityBand,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CASE 
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 5 THEN 1
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 15 THEN 2
                    ELSE 3
                END AS MaturityBand,
                CusipId
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, year_end + 1)

    # Closes the sub-query and groups/orders by band.
    closing = '''
        ) A
        GROUP BY
            MaturityBand
        ORDER BY
            MaturityBand
    '''

    optional_filter = 'AND RatingNum <> 0' if discard_nr else ''

    return pd.read_sql(''.join((opening, optional_filter, closing)), connection)

In [16]:
start, end = 2002, 2022

df = distinctCusips_by_maturity(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_by_maturity_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,MaturityBand,DistinctCusips
0,1,2445
1,2,11548
2,3,3607


### Distinct Cusips per Investment Grade

In [17]:
def distinctCusips_by_investmentGrade(year_start, year_end):
    """Number of distinct CUSIPs per investment-grade bucket.

    For trades in ``Trace_withRatings_filtered`` executed from January 1st of
    ``year_start`` up to (but excluding) January 1st of ``year_end + 1``, each
    row is labelled from its ``RatingNum``:

    * ``'NR'`` when ``RatingNum = 0``,
    * ``'Y'`` when ``RatingNum < 11``,
    * ``'N'`` otherwise.

    Distinct ``CusipId`` values are then counted per label. Unlike the other
    queries in this notebook there is no ``discard_nr`` switch: rows with
    ``RatingNum = 0`` are always kept and reported under ``'NR'``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.

    Returns
    -------
    pandas.DataFrame
        Columns ``InvestmentGrade`` and ``DistinctCusips``, ordered by label.
    """
    query_template = '''
        SELECT
            InvestmentGrade,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CusipId,
                CASE
                    WHEN RatingNum = 0 THEN 'NR'
                    WHEN RatingNum < 11 THEN 'Y'
                    ELSE 'N'
                END AS InvestmentGrade
            FROM 
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
        ) A
        GROUP BY
            InvestmentGrade
        ORDER BY
            InvestmentGrade
    '''

    # Upper bound is exclusive, hence the first day of the following year.
    first_year, following_year = year_start, year_end + 1
    sql = query_template.format(first_year, following_year)

    return pd.read_sql(sql, connection)

In [19]:
start, end = 2002, 2022

df = distinctCusips_by_investmentGrade(year_start=start, year_end=end)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctCusips_by_investmentGrade_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,InvestmentGrade,DistinctCusips
0,N,3967
1,NR,26119
2,Y,15207


### Distinct Trading Days

In [20]:
def distinctTradingDays(year_start, year_end, discard_nr):
    """Count the distinct trade execution dates in the selected period.

    Considers trades in ``Trace_withRatings_filtered`` executed from January
    1st of ``year_start`` up to (but excluding) January 1st of ``year_end + 1``.

    Parameters
    ----------
    year_start, year_end : int
        First and last calendar year included.
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        A single row with one column, ``DistinctTradingDays``.
    """
    # The aggregate is given an explicit alias: without one the result column
    # has no name, which leaves the DataFrame (and any CSV written from it)
    # with an empty column header.
    base_query = '''
        SELECT
            COUNT(DISTINCT TrdExctnDt) AS DistinctTradingDays
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(
        year_start,
        year_end + 1
    )

    # The template ends inside the WHERE clause, so the optional rating filter
    # can simply be appended.
    if discard_nr:
        base_query += 'AND RatingNum <> 0'

    df = pd.read_sql(base_query, connection)

    return df

In [21]:
start, end = 2002, 2022 # MIN = 2002-07-01 | MAX = 2022-09-30

df = distinctTradingDays(year_start=start, year_end=end, discard_nr=False)

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. when running from a fresh checkout).
output_path = 'data/output/bonds/distinctTradingDays_{}-{}.csv'.format(start, end)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,Unnamed: 1
0,683
