In [7]:
import os
import pandas as pd
import sqlalchemy as db
from dotenv import load_dotenv

In [8]:
load_dotenv()

# config and credentials
server = os.getenv('server')
database = os.getenv('database')
username = os.getenv('username')
password = os.getenv('password')

# connection
engine = db.create_engine(
    'mssql://{}:{}@{}/{}?driver=ODBC+Driver+18+for+SQL+Server'.format(
        username, password, server, database
    )
)

# establish connection
connection = engine.connect()

### CSAD Stocks

In [13]:
query = '''
    SELECT
        Datadate,
        MktRf, Smb, Hml, Rmw, Cma, Rf, Rm,
        ABS(Rm) AS AbsoluteRm,
        POWER(Rm, 2) AS SquaredRm, 
        Sum / Count AS Csad
    FROM (
        SELECT
            DataDate,
            MktRf, Smb, Hml, Rmw, Cma, Rf, Rm,
            ABS(SUM(MonthlyReturns) - Rm) AS Sum,
            COUNT(DISTINCT LPermNo) AS Count
        FROM (
            SELECT
                A.LPermNo,
                EOMONTH(A.DataDate) AS DataDate,
                (PrcCd / LAG(A.PrcCd)  OVER (PARTITION BY A.LPermNo ORDER BY EOMONTH(A.DataDate))) - 1 AS MonthlyReturns,
                C.*
            FROM
                CrspcSecuritiesDaily A
            INNER JOIN (
                SELECT
                    LPermNo,
                    MAX(DataDate) AS MaxDate
                FROM
                    CrspcSecuritiesDaily A
                GROUP BY
                    LPermNo,
                    EOMONTH(DataDate)
            ) B ON A.LPermNo = B.LPermNo AND A.DataDate = B.MaxDate
            INNER JOIN
                MarketFactors C ON EOMONTH(A.DataDate) = C.Date
        ) A
        GROUP BY
            DataDate,
            MktRf, Smb, Hml, Rmw, Cma, Rf, Rm
    ) A
    ORDER BY
        DataDate
'''

# read sql
df = pd.read_sql(query, connection)
df.to_csv('source/herding-csad-stocks.csv', index=False)

### Distinct Stocks

In [10]:
query = '''
    SELECT
        COUNT(DISTINCT LPermNo)
    FROM
        CrspcSecuritiesDaily A
'''

df = pd.read_sql(query, connection)
df.head()

Unnamed: 0,Unnamed: 1
0,5646


### Distinct Stocks per Exchange

In [11]:
query = '''
    SELECT
        Exchange,
        COUNT(LPermNo) AS ExCount
    FROM (
        SELECT
            LPermNo, 
            Exchange, 
            Industry,
            ROW_NUMBER() OVER (PARTITION BY LPermNo ORDER BY DataDate) AS Ranking
        FROM 
            CrspcSecuritiesDaily
    ) A
    WHERE
        Ranking = 1
    GROUP BY
        Exchange
'''

df = pd.read_sql(query, connection)
df.head()

Unnamed: 0,Exchange,ExCount
0,NASDAQ,3436
1,NYSE,1977
2,AMEX,233


### Distinct Stocks per Industry

In [12]:
query = '''
    SELECT
        Industry,
        COUNT(LPermNo) AS ExCount
    FROM (
        SELECT
            LPermNo, 
            Exchange, 
            Industry,
            ROW_NUMBER() OVER (PARTITION BY LPermNo ORDER BY DataDate) AS Ranking
        FROM 
            CrspcSecuritiesDaily
    ) A
    WHERE
        Ranking = 1
    GROUP BY
        Industry
'''

df = pd.read_sql(query, connection)
df.head(12)

Unnamed: 0,Industry,ExCount
0,Mining,188
1,Public,515
2,Services,967
3,Retail,255
4,Wholesale,98
5,Missing,14
6,Transportation,224
7,Construction,45
8,Manufacturing,2038
9,Utilities,162
