In [1]:
import os
import pandas as pd
import sqlalchemy as db
from dotenv import load_dotenv

In [2]:
load_dotenv()

# Database settings come from environment variables; load_dotenv() above is
# what makes values from a local .env file visible to os.getenv().
server, database, username, password = (
    os.getenv(setting) for setting in ('server', 'database', 'user', 'password')
)

# Build the SQL Server engine through the ODBC Driver 18.
# NOTE(review): the credentials are interpolated into the URL verbatim, so a
# user name or password containing characters such as '@' or '/' would
# presumably need URL-encoding first -- confirm against the real secrets.
engine = db.create_engine(
    'mssql://{}:{}@{}/{}?driver=ODBC+Driver+18+for+SQL+Server'.format(
        username, password, server, database
    )
)

# Single connection shared by every query function in this notebook.
connection = engine.connect()

### Daily Distinct Cusips and Total Volume by Retail / Institutional grouping

In [3]:
def distinctCusips_totalVolume_by_retail_institutional(year_start, year_end, discard_nr, retail_threshold=100000):
    """Return daily distinct-CUSIP counts and total volume per trade-size group.

    Every row of ``Trace_withRatings_filtered`` inside the date window is
    labelled ``'R'`` when its ``EntrdVolQt`` is strictly below
    ``retail_threshold`` and ``'IN'`` otherwise. The labelled rows are then
    aggregated per ``TrdExctnDt`` and label.

    The query runs on the notebook-level ``connection``.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.
    retail_threshold : int or float, optional
        Volume cutoff separating the ``'R'`` and ``'IN'`` groups. Defaults to
        100000, the value previously hard-coded in the query.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RetailThreshold`` (``'R'`` / ``'IN'``),
        ``DistinctCusips`` and ``TotalVolume``, ordered by date and group.

    Raises
    ------
    TypeError
        If ``retail_threshold`` is not a plain number.
    """
    # The threshold is formatted straight into the SQL text, so only accept
    # real numbers (bool is an int subclass and is rejected on purpose).
    if isinstance(retail_threshold, bool) or not isinstance(retail_threshold, (int, float)):
        raise TypeError(
            'retail_threshold must be a number, got {!r}'.format(retail_threshold)
        )

    base_query = '''
        SELECT 
            TrdExctnDt,
            RetailThreshold,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM (
            SELECT
                TrdExctnDt,
                CASE WHEN EntrdVolQt < {} THEN 'R' ELSE 'IN' END AS RetailThreshold,
                CusipId,
                EntrdVolQt
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(
        retail_threshold,
        year_start,
        year_end + 1  # exclusive upper bound: January 1 of the following year
    )

    # Optional extra predicate; it lands inside the still-open WHERE clause.
    if discard_nr:
        base_query += 'AND RatingNum <> 0'

    # Close the sub-query and aggregate per day and trade-size group.
    base_query += '''
        ) A
        GROUP BY
            TrdExctnDt,
            RetailThreshold
        ORDER BY
            TrdExctnDt, 
            RetailThreshold
    '''

    df = pd.read_sql(base_query, connection)

    return df

In [4]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_totalVolume_by_retail_institutional_{}-{}.csv'.format(start, end)

df = distinctCusips_totalVolume_by_retail_institutional(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RetailThreshold,DistinctCusips,TotalVolume
0,2002-07-26,IN,982,7142402000.0
1,2002-07-26,R,1350,116868500.0
2,2002-07-29,IN,1099,11855980000.0
3,2002-07-29,R,1350,130587300.0
4,2002-07-30,IN,1302,17864700000.0


### Daily Distinct Cusips and Total Volume

In [5]:
def distinctCusips_totalVolume(year_start, year_end, discard_nr):
    """Return the number of distinct CUSIPs and the total volume per trade date.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection`` and aggregates the rows by ``TrdExctnDt``.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``DistinctCusips`` and ``TotalVolume``,
        ordered by ``TrdExctnDt``.
    """
    first_excluded_year = year_end + 1

    # SELECT ... WHERE part; the WHERE clause is left open for the optional filter.
    select_and_where = '''
        SELECT 
            TrdExctnDt,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    grouping_and_ordering = '''
        GROUP BY
            TrdExctnDt
        ORDER BY
            TrdExctnDt
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [6]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_totalVolume_{}-{}.csv'.format(start, end)

df = distinctCusips_totalVolume(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,DistinctCusips,TotalVolume
0,2002-07-26,1808,7259270000.0
1,2002-07-29,1860,11986570000.0
2,2002-07-30,2061,18016780000.0
3,2002-07-31,2056,21678040000.0
4,2002-08-26,1857,4257617000.0


### Daily Total Volume by Side (Buy/Sell) 

In [7]:
def totalVolume_by_side(year_start, year_end, discard_nr):
    """Return the total volume per trade date and reporting side.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection`` and sums ``EntrdVolQt`` per ``TrdExctnDt`` and
    ``RptSideCd``.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RptSideCd`` and ``TotalVolume``, ordered by
        date and side.
    """
    first_excluded_year = year_end + 1

    # SELECT ... WHERE part; the WHERE clause is left open for the optional filter.
    select_and_where = '''
        SELECT 
            TrdExctnDt,
            RptSideCd,
            SUM(EntrdVolQt) as TotalVolume
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    grouping_and_ordering = '''
        GROUP BY
            TrdExctnDt, 
            RptSideCd
        ORDER BY
            TrdExctnDt, 
            RptSideCd
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [8]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/totalVolume_by_side_{}-{}.csv'.format(start, end)

df = totalVolume_by_side(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RptSideCd,TotalVolume
0,2002-07-26,B,3555662000.0
1,2002-07-26,S,3703608000.0
2,2002-07-29,B,7912743000.0
3,2002-07-29,S,4073825000.0
4,2002-07-30,B,10148390000.0


### Industry analysis on Cusips and Issuers

In [9]:
def industry_analysis_on_cusips_issuers(year_start, year_end, discard_nr):
    """Return the number of distinct CUSIPs and issuers per industry code.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection``. Counts are taken over the whole date window rather than
    per day.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``IndustryCode``, ``DistinctCusips`` and ``DistinctIssuers``,
        ordered by ``IndustryCode``.
    """
    first_excluded_year = year_end + 1

    # Outer SELECT plus the opening of the aggregating sub-query; the inner
    # WHERE clause is left open for the optional filter.
    select_and_where = '''
        SELECT
            IndustryCode,
            DistinctCusips,
            DistinctIssuers
        FROM (
            SELECT 
                IndustryCode,
                COUNT(DISTINCT CusipId) AS DistinctCusips,
                COUNT(DISTINCT IssuerID) AS DistinctIssuers
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    # Group inside the sub-query, then close it and order the outer result.
    grouping_and_ordering = '''
            GROUP BY
                IndustryCode 
        ) A
        ORDER BY
            IndustryCode
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [10]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/industry_analysis_on_cusips_issuers_{}-{}.csv'.format(start, end)

df = industry_analysis_on_cusips_issuers(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,IndustryCode,DistinctCusips,DistinctIssuers
0,10,5413,806
1,11,1387,238
2,12,1518,284
3,13,74,11
4,14,1009,136


### Daily Distinct Cusips and Total Volume by Issuer

In [11]:
def distinctCusips_totalVolume_by_issuer(year_start, year_end, discard_nr):
    """Return distinct-CUSIP counts and total volume per trade date and issuer.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection`` and aggregates the rows by ``TrdExctnDt`` and ``IssuerId``.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``IssuerId``, ``DistinctCusips`` and
        ``TotalVolume``, ordered by date and issuer.
    """
    first_excluded_year = year_end + 1

    # SELECT ... WHERE part; the WHERE clause is left open for the optional filter.
    select_and_where = '''
        SELECT
            TrdExctnDt,
            IssuerId,
            COUNT(DISTINCT CusipId) AS DistinctCusips,
            SUM(EntrdVolQt) AS TotalVolume
        FROM 
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    grouping_and_ordering = '''
        GROUP BY
            TrdExctnDt, 
            IssuerId
        ORDER BY
            TrdExctnDt, 
            IssuerId
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [12]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_totalVolume_by_issuer_{}-{}.csv'.format(start, end)

df = distinctCusips_totalVolume_by_issuer(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,IssuerId,DistinctCusips,TotalVolume
0,2002-07-26,11,6,49816000.0
1,2002-07-26,20,2,1704000.0
2,2002-07-26,26,13,24934000.0
3,2002-07-26,34,1,50000.0
4,2002-07-26,41,6,12102000.0


### Daily Distinct Cusips by Rating

In [13]:
def distinctCusips_by_rating(year_start, year_end, discard_nr):
    """Return the number of distinct CUSIPs per trade date and rating number.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection`` and counts distinct ``CusipId`` values per ``TrdExctnDt``
    and ``RatingNum``.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TrdExctnDt``, ``RatingNum`` and ``DistinctCusips``, ordered
        by date and rating number.
    """
    first_excluded_year = year_end + 1

    # SELECT ... WHERE part; the WHERE clause is left open for the optional filter.
    select_and_where = '''
        SELECT
            TrdExctnDt,
            RatingNum,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM 
            Trace_withRatings_filtered A
        WHERE
            A.TrdExctnDt >= '{}-01-1' AND A.TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    grouping_and_ordering = '''
        GROUP BY
            TrdExctnDt,
            RatingNum
        ORDER BY
            TrdExctnDt,
            RatingNum
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [14]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_by_rating_{}-{}.csv'.format(start, end)

df = distinctCusips_by_rating(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,TrdExctnDt,RatingNum,DistinctCusips
0,2002-07-26,1,57
1,2002-07-26,2,62
2,2002-07-26,3,109
3,2002-07-26,4,179
4,2002-07-26,5,204


### Distinct Cusips per Maturity Band

In [15]:
def distinctCusips_by_maturity(year_start, year_end, discard_nr):
    """Return the number of distinct CUSIPs per maturity band.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection``. Each row is assigned a band from the absolute number of
    days between ``Maturity`` and ``OfferingDate`` divided by 360:

    * ``1`` -- below 5
    * ``2`` -- from 5 up to (but not including) 15
    * ``3`` -- everything else

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``MaturityBand`` and ``DistinctCusips``, ordered by band.
    """
    first_excluded_year = year_end + 1

    # Outer SELECT plus the banding sub-query; the inner WHERE clause is left
    # open for the optional filter.
    select_and_where = '''
        SELECT
            MaturityBand,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CASE 
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 5 THEN 1
                    WHEN ABS(DATEDIFF(DAY, Maturity, OfferingDate)) * 1.0 / 360 < 15 THEN 2
                    ELSE 3
                END AS MaturityBand,
                CusipId
            FROM
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(year_start, first_excluded_year)

    rating_predicate = 'AND RatingNum <> 0' if discard_nr else ''

    # Close the sub-query and aggregate per band.
    grouping_and_ordering = '''
        ) A
        GROUP BY
            MaturityBand
        ORDER BY
            MaturityBand
    '''

    return pd.read_sql(
        select_and_where + rating_predicate + grouping_and_ordering,
        connection,
    )

In [16]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_by_maturity_{}-{}.csv'.format(start, end)

df = distinctCusips_by_maturity(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,MaturityBand,DistinctCusips
0,1,3444
1,2,15647
2,3,4207


### Distinct Cusips per Investment Grade

In [17]:
def distinctCusips_by_investmentGrade(year_start, year_end):
    """Return the number of distinct CUSIPs per investment-grade label.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection``. Each row is labelled from its ``RatingNum``, with the
    conditions checked in this order:

    * ``'NR'`` -- ``RatingNum = 0``
    * ``'Y'``  -- ``RatingNum < 11``
    * ``'N'``  -- everything else

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).

    Returns
    -------
    pandas.DataFrame
        Columns ``InvestmentGrade`` and ``DistinctCusips``, ordered by label.
    """
    first_excluded_year = year_end + 1

    sql = '''
        SELECT
            InvestmentGrade,
            COUNT(DISTINCT CusipId) AS DistinctCusips
        FROM (
            SELECT
                CusipId,
                CASE
                    WHEN RatingNum = 0 THEN 'NR'
                    WHEN RatingNum < 11 THEN 'Y'
                    ELSE 'N'
                END AS InvestmentGrade
            FROM 
                Trace_withRatings_filtered
            WHERE
                TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
        ) A
        GROUP BY
            InvestmentGrade
        ORDER BY
            InvestmentGrade
    '''.format(year_start, first_excluded_year)

    return pd.read_sql(sql, connection)

In [18]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022
output_path = 'data/output/bonds/distinctCusips_by_investmentGrade_{}-{}.csv'.format(start, end)

df = distinctCusips_by_investmentGrade(year_start=start, year_end=end)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,InvestmentGrade,DistinctCusips
0,N,5815
1,NR,27425
2,Y,20088


### Distinct Trading Days

In [19]:
def distinctTradingDays(year_start, year_end, discard_nr):
    """Return the number of distinct trade dates in the selected window.

    Reads ``Trace_withRatings_filtered`` through the notebook-level
    ``connection`` and counts the distinct ``TrdExctnDt`` values.

    Parameters
    ----------
    year_start : int
        First calendar year included (window starts on January 1).
    year_end : int
        Last calendar year included (window ends before January 1 of
        ``year_end + 1``).
    discard_nr : bool
        When truthy, rows with ``RatingNum = 0`` are excluded.

    Returns
    -------
    pandas.DataFrame
        A single row with one column, ``DistinctTradingDays``.
    """
    # The aggregate is aliased so the resulting column (and the header of any
    # CSV written from it) has a name instead of being blank.
    base_query = '''
        SELECT
            COUNT(DISTINCT TrdExctnDt) AS DistinctTradingDays
        FROM
            Trace_withRatings_filtered
        WHERE
            TrdExctnDt >= '{}-01-1' AND TrdExctnDt < '{}-01-01'
    '''.format(
        year_start,
        year_end + 1  # exclusive upper bound: January 1 of the following year
    )

    # Optional extra predicate appended to the WHERE clause.
    if discard_nr:
        base_query += 'AND RatingNum <> 0'

    df = pd.read_sql(base_query, connection)

    return df

In [20]:
# Inclusive range of calendar years covered by this export.
start, end = 2002, 2022 # MIN = 2002-07-01 | MAX = 2022-09-30
output_path = 'data/output/bonds/distinctTradingDays_{}-{}.csv'.format(start, end)

df = distinctTradingDays(year_start=start, year_end=end, discard_nr=True)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,Unnamed: 1
0,1007
