In [1]:
import os
import pandas as pd
import sqlalchemy as db
from dotenv import load_dotenv

In [2]:
load_dotenv()

# config and credentials
server = os.getenv('server')
database = os.getenv('database')
username = os.getenv('username')
password = os.getenv('password')

# connection
engine = db.create_engine(
    'mssql://{}:{}@{}/{}?driver=ODBC+Driver+18+for+SQL+Server'.format(
        username, password, server, database
    )
)

# establish connection
connection = engine.connect()

### Herding Analysis

In [3]:
def herding_stockMarket(year_start, year_end):
    
    base_query = '''
    
        DECLARE @LeftTail FLOAT
        SET @LeftTail = (
            SELECT DISTINCT PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY Rm) OVER () FROM [dbo].[CrspcFactors]
        )

        DECLARE @RightTail FLOAT
        SET @RightTail = (
            SELECT DISTINCT PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY Rm) OVER () FROM [dbo].[CrspcFactors]
        )

        SELECT
            Datadate,
            MktRf, Smb, Hml, Rmw, Cma, Rf, Rm, 
            ABS(Rm) AS AbsoluteRm,
            POWER(Rm, 2) AS SquaredRm, 
            Sum / Count AS Measure,
            CASE 
                WHEN Rm <= @LeftTail THEN 1
                ELSE 0
            END AS LeftTail,
            CASE 
                WHEN Rm >= @RightTail THEN 1
                ELSE 0
            END AS RightTail
        FROM (
            SELECT
                DataDate, 
                MktRf, Smb, Hml, Rmw, Cma, Rf, Rm,
                ABS(SUM(DReturn) - Rm) AS Sum,
                COUNT(DISTINCT Cusip) AS Count
            FROM (
                SELECT
                    A.Cusip,
                    A.DataDate,
                    B.*,
                    LOG(PrcCd, EXP(1)) - LOG(LAG(A.PrcCd) OVER (PARTITION BY A.Cusip ORDER BY A.DataDate), EXP(1)) AS DReturn
                FROM
                    [dbo].[CrspcSecuritiesDaily] A
                INNER JOIN
                    [dbo].[CrspcFactors] B ON A.DataDate = B.Date
                WHERE
                    PrcCd <> 0
                    AND B.Date >= '{}-01-1' AND B.Date < '{}-01-01'
            ) A
            GROUP BY
                DataDate,
                MktRf, Smb, Hml, Rmw, Cma, Rf, Rm
        ) B
        ORDER BY
            DataDate
    '''.format(
        year_start,
        year_end + 1
    )
    
    df = pd.read_sql(base_query, connection)
    
    return df

In [4]:
start, end = 2002, 2022

df = herding_stockMarket(year_start=start, year_end=end)
df.to_csv('data/output/herding_stockMarket_{}-{}.csv'.format(start, end), index=False)
df.head()

Unnamed: 0,Datadate,MktRf,Smb,Hml,Rmw,Cma,Rf,Rm,AbsoluteRm,SquaredRm,Measure,LeftTail,RightTail
0,2002-01-02,0.42,-0.72,0.17,-0.26,-0.41,0.007,0.427,0.427,0.182329,,0,0
1,2002-01-03,0.99,0.77,-0.4,-0.3,-0.8,0.007,0.997,0.997,0.994009,0.004376,0,0
2,2002-01-04,0.7,0.2,0.39,-0.08,-0.23,0.007,0.707,0.707,0.499849,0.001476,0,0
3,2002-01-07,-0.7,-0.24,0.85,-0.07,0.16,0.007,-0.693,0.693,0.480249,0.002739,0,0
4,2002-01-08,-0.23,1.21,0.29,-0.21,-0.01,0.007,-0.223,0.223,0.049729,0.000256,0,0


### Auxiliary Measures

In [5]:
def herding_auxDistinctCusips(year_start, year_end):
    
    base_query = '''
        SELECT
            COUNT(DISTINCT Cusip)
        FROM
            [dbo].[CrspcSecuritiesDaily]
        WHERE
            DataDate >= '{}-01-1' AND DataDate < '{}-01-01'
    '''.format(
        year_start,
        year_end + 1
    )
    
    df = pd.read_sql(base_query, connection)
    
    return df

In [6]:
start, end = 2002, 2022

df = herding_auxDistinctCusips(year_start=start, year_end=end)
df.to_csv('data/output/herding_auxDistinctCusips_{}-{}.csv'.format(start, end), index=False)
df.head()

Unnamed: 0,Unnamed: 1
0,3719


In [7]:
def herding_auxDistinctCusips_by_exchange(year_start, year_end):
    
    base_query = '''
        SELECT
            Exchange,
            COUNT(DISTINCT Cusip)
        FROM
            [dbo].[CrspcSecuritiesDaily]
        WHERE
            DataDate >= '{}-01-1' AND DataDate < '{}-01-01'
        GROUP BY
            Exchange
    '''.format(
        year_start,
        year_end + 1
    )
    
    df = pd.read_sql(base_query, connection)
    
    return df

In [8]:
start, end = 2002, 2022

df = herding_auxDistinctCusips_by_exchange(year_start=start, year_end=end)
df.to_csv('data/output/herding_auxDistinctCusips_by_exchange_{}-{}.csv'.format(start, end), index=False)
df.head()

Unnamed: 0,Exchange,Unnamed: 2
0,AMEX & in the S&P Transportation Index,2853
1,AMEX & in the S&P Utilities Index,1
2,AMEX & in the S&P Financial Index,6
3,NYSE & in the S&P Financial Index,1
4,NASDAQ & in the S&P Industrial Index,717


In [9]:
def herding_auxDistinctCusips_by_industry(year_start, year_end):
    
    base_query = '''
        SELECT
            Industry,
            COUNT(DISTINCT Cusip)
        FROM
            [dbo].[CrspcSecuritiesDaily]
        WHERE
            DataDate >= '{}-01-1' AND DataDate < '{}-01-01'
        GROUP BY
            Industry
    '''.format(
        year_start,
        year_end + 1
    )
    
    df = pd.read_sql(base_query, connection)
    
    return df

In [10]:
start, end = 2002, 2022

df = herding_auxDistinctCusips_by_industry(year_start=start, year_end=end)
df.to_csv('data/output/herding_auxDistinctCusips_by_industry_{}-{}.csv'.format(start, end), index=False)
df.head()

Unnamed: 0,Industry,Unnamed: 2
0,Mining,19
1,Public,4
2,Services,19
3,Retail,6
4,Wholesale,5
