### Imports

In [1]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import gc

import datetime as dt
import numpy as np

#for cos similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#progress bar
from tqdm import tqdm


#for alphas
import sf_quant.data as sfd
import sf_quant.optimizer as sfo
import sf_quant.backtester as sfb
import sf_quant.performance as sfp


In [2]:
# Both inputs live under the same lab checkout, so they are derived from one root.
_LAB_ROOT = '/home/porter77/sf_fall_2025/sf-quant-labs'

# Directory holding the per-year `<year>_10k_items.parquet` extracts.
DATA_FILE_PATH = f'{_LAB_ROOT}/edgar_benchmark'
# CSV that maps Barra ids (`barrid`) to SEC CIK numbers.
BENCH_CIK_PATH = f'{_LAB_ROOT}/benchmark_cik.csv'

In [3]:
# Sample window for the asset history.
start = dt.date(2004, 1, 1)
end = dt.date(2025, 12, 31)

# Load the per-date, per-barrid asset rows restricted to the universe.
# Alternative column list kept for reference:
# ['date','barrid','specific_risk','cusip','price','return','predicted_beta','historical_beta']
barra_data = sfd.load_assets(
    start=start,
    end=end,
    columns=[
        'date',
        'barrid',
        'specific_risk',
        'predicted_beta',
        'price',
    ],
    in_universe=True,
)


In [4]:
# Barra-id -> CIK lookup. One row per barrid is kept so the left join below
# cannot multiply Barra rows.
# NOTE(review): `unique` keeps an arbitrary row when a barrid maps to several
# CIKs — confirm the mapping file is one-to-one.
bench_df = (
    pl.read_csv(BENCH_CIK_PATH, columns=["barrid", "cik"])
    .unique(subset=["barrid"])
)

# Attach the CIK to every Barra row. The join is built lazily and collected on
# the streaming engine; `engine="streaming"` replaces the deprecated
# `streaming=True` flag that polars warns about.
barra_w_cik = (
    barra_data.lazy()
    .join(bench_df.lazy(), on="barrid", how="left")
    .collect(engine="streaming")
)

# The un-joined frame is not used again; drop it to free memory before the
# filing text is loaded. Re-running this cell therefore requires re-running the
# cell that loads `barra_data` first.
del barra_data
gc.collect()

  .collect(streaming=True)


0

In [5]:
# Inspect the joined frame; `cik` is null for barrids that have no match in the
# benchmark lookup (left join).
barra_w_cik

date,barrid,specific_risk,predicted_beta,price,cik
date,str,f64,f64,f64,i64
2004-01-02,"""USA3GE1""",47.984662,1.949757,11.46,1098972
2004-01-05,"""USA3GE1""",47.941945,1.877678,11.31,1098972
2004-01-06,"""USA3GE1""",47.84906,1.890976,11.34,1098972
2004-01-07,"""USA3GE1""",48.083044,1.887955,11.21,1098972
2004-01-08,"""USA3GE1""",47.961206,1.883265,11.32,1098972
…,…,…,…,…,…
2025-07-24,"""USBF341""",84.829652,1.545474,3.13,
2025-07-25,"""USBF341""",84.63905,1.542865,3.15,
2025-07-28,"""USBF341""",84.704746,1.544187,3.1,
2025-07-29,"""USBF341""",84.748616,1.548718,2.95,


In [6]:
# Name of the 10-K section to build the signal from. It is used verbatim as a
# column name in the yearly parquet extracts, so surrounding whitespace is
# stripped and an empty answer falls back to the default instead of producing a
# column name that does not exist.
DEFAULT_ITEM = 'Item 1A'
ITEM = input(f'Enter the item you want to compute alpha for [{DEFAULT_ITEM}]: ').strip() or DEFAULT_ITEM


In [7]:
# Target layout of the stacked filings table; the key order is also the column
# order every yearly frame is arranged into before stacking.
subset_schema = {
    "cik": pl.Int64,
    "cusip": pl.Utf8,
    "filing_date": pl.Date,
    ITEM: pl.Utf8,
    "year": pl.Int64,
}

# Read each yearly extract (2004 through 2024) into a list and stack once at the
# end; concatenating inside the loop re-copies everything read so far on every
# iteration.
yearly_frames = []
for year in range(2004, 2025):
    yearly_frames.append(
        pl.read_parquet(
            rf"{DATA_FILE_PATH}/{year}_10k_items.parquet",
            columns=["cik", "cusip", "filing_date", ITEM],
        )
        .with_columns(pl.col("filing_date").cast(pl.Date))
        .with_columns(pl.col("filing_date").dt.year().alias("year"))
        # keep only filings actually dated in the file's own year
        .filter(pl.col("filing_date").dt.year() == year)
        .select(list(subset_schema.keys()))  # ensure same col order
    )

# The empty, typed frame goes first so the relaxed concat still resolves column
# dtypes against `subset_schema`.
data = pl.concat(
    [pl.DataFrame(schema=subset_schema), *yearly_frames],
    how="vertical_relaxed",
)

data

cik,cusip,filing_date,Item 1A,year
i64,str,date,str,i64
20,"""482730108""",2004-04-01,,2004
2178,"""006351308""",2004-03-24,,2004
2034,"""004446100""",2004-09-10,,2004
3116,"""009728106""",2004-03-30,,2004
2969,"""009158106""",2004-12-13,,2004
…,…,…,…,…
1981792,"""44267T102""",2024-02-27,"""Item 1A. Risk Factors The risk…",2024
1993004,"""668074305""",2024-02-15,"""ITEM 1A. RISK FACTORS You sh…",2024
1991792,"""156944100""",2024-03-26,"""Item 1A. Risk Factors.  You sh…",2024
1984060,"""642045108""",2024-02-27,"""Item 1A. Risk Factors.  There …",2024


In [8]:
# Clean the stacked filings.
# The 8-character CUSIP that used to be derived here was dropped again before
# anything read it, so it is no longer computed.
data = (
    data
    # keep only filings that carry a CUSIP
    .filter(pl.col('cusip').is_not_null())
    # keep only filings with a known filing date
    .filter(pl.col('filing_date').is_not_null())
    # downstream joins key on `date`
    .rename({'filing_date': 'date'})
    # cik ascending, year descending within cik: the order the similarity step relies on
    .sort(["cik", "year"], descending=[False, True])
    # the CUSIP is only used for the null filter above
    .drop('cusip')
)
data

cik,date,Item 1A,year
i64,date,str,i64
20,2009-03-13,"""Item 1A. Risk …",2009
20,2008-03-12,"""Item 1A. Risk …",2008
20,2007-03-09,"""Item 1A. Risk Factors.  Ou…",2007
20,2006-03-23,"""Item 1A. Risk Factors. Risks A…",2006
20,2005-03-31,,2005
…,…,…,…
1981792,2024-02-27,"""Item 1A. Risk Factors The risk…",2024
1984060,2024-02-27,"""Item 1A. Risk Factors.  There …",2024
1991792,2024-03-26,"""Item 1A. Risk Factors.  You sh…",2024
1993004,2024-02-15,"""ITEM 1A. RISK FACTORS You sh…",2024


In [9]:
# Bag-of-words counter: lowercased, purely alphabetic tokens of two or more letters.
vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern=r'(?u)\b[a-zA-Z]{2,}\b'
)


def _pair_similarity(doc_now, doc_prev):
    """Cosine similarity between the word-count vectors of two filing texts.

    Returns None when either text is missing, or when neither text contains a
    token the vectorizer accepts (scikit-learn raises ValueError for an empty
    vocabulary in that case).
    """
    if doc_now is None or doc_prev is None:
        return None
    try:
        counts = vectorizer.fit_transform([doc_now, doc_prev])
    except ValueError:
        return None
    return float(cosine_similarity(counts[0], counts[1])[0][0])


# One slot per row of `data`. Results are written by row index, so the list
# lines up with `data` by construction rather than by relying on its sort order.
sim_list = [None] * data.height

filings_indexed = data.select(['cik', 'year', ITEM]).with_row_index('row_nr')
n_ciks = data['cik'].n_unique()  # only used for the progress-bar total

# A single grouped pass replaces filtering the whole frame once per cik and
# twice more per filing year.
for _cik_key, cik_filings in tqdm(filings_indexed.group_by('cik', maintain_order=True), total=n_ciks):
    years = cik_filings['year'].to_list()
    docs = cik_filings[ITEM].to_list()
    row_numbers = cik_filings['row_nr'].to_list()

    # Map year -> text for this cik, remembering years with more than one
    # filing: those comparisons are ambiguous and stay None.
    doc_by_year = {}
    ambiguous_years = set()
    for year, doc in zip(years, docs):
        if year in doc_by_year:
            ambiguous_years.add(year)
        doc_by_year[year] = doc

    for row_nr, year, doc in zip(row_numbers, years, docs):
        prev_year = year - 1
        if prev_year not in doc_by_year:
            continue  # no filing in the prior calendar year (e.g. earliest year)
        if year in ambiguous_years or prev_year in ambiguous_years:
            continue
        sim_list[row_nr] = _pair_similarity(doc, doc_by_year[prev_year])
    

  0%|          | 0/6285 [00:00<?, ?it/s]

100%|██████████| 6285/6285 [12:41<00:00,  8.25it/s]


In [10]:
# Attach the similarities positionally: `sim_list` holds one entry per row of
# `data`, in the same row order.
sim_series = pl.Series(name=f'{ITEM}_cos_sim', values=sim_list)
data = data.with_columns(sim_series)


In [11]:
# The raw text and the year helper are no longer needed; keep only filings for
# which a year-over-year similarity could be computed.
data = (
    data
    .drop(['year', ITEM])
    .filter(pl.col(f'{ITEM}_cos_sim').is_not_null())
)
data

cik,date,Item 1A_cos_sim
i64,date,f64
20,2009-03-13,0.994316
20,2008-03-12,0.994438
20,2007-03-09,0.990985
1750,2024-07-19,0.991075
1750,2023-07-18,0.997309
…,…,…
1944558,2024-02-26,0.99884
1948455,2024-09-27,0.999298
1949543,2024-02-29,0.998087
1952073,2024-08-16,0.997052


In [12]:
# Left join keeps every Barra row; the similarity is populated only on rows
# whose (cik, date) equals a filing's (cik, date).
# NOTE(review): a filing dated on a day with no Barra row for that cik finds no
# match and is dropped here — confirm whether such filings should be rolled to
# the next available date.
merged = barra_w_cik.join(
    data,
    how='left',
    on=['cik', 'date'],
)

In [13]:
# Scaling constant applied to specific_risk * score when forming the alpha.
IC=.05

merged = merged.sort(["barrid","date"])

sim = pl.col(f"{ITEM}_cos_sim")
sim_mean = sim.mean().over("date")
sim_std = sim.std().over("date")

# Per-date z-score of the similarity. The score is left null when the per-date
# standard deviation is null or zero: dividing by zero would produce NaN/inf,
# which is not null and would therefore survive the later forward fill and
# null fill.
# NOTE(review): the similarity is only populated on rows that matched a filing,
# so the mean/std here are taken over the names that filed on that same date —
# confirm this is the intended cross-section.
merged = merged.with_columns(
    pl.when(sim_std > 0)
    .then((sim - sim_mean) / sim_std)
    .otherwise(None)
    .alias('score')
)

# Alpha = specific_risk * score * IC, lagged by one row within each barrid so
# the value computed on a filing's row is first available on the next row.
merged = merged.with_columns(
    (pl.col('specific_risk') * pl.col('score') * IC).shift(1).over('barrid').alias('alpha')
)

In [14]:
# Maximum number of consecutive rows per barrid an alpha is carried forward.
# With one row per trading day, 250 rows is roughly twelve months (not six).
HOLDING_PERIOD_DAYS = 250

merged = merged.sort(['date','barrid'])

# Carry each barrid's latest alpha forward over the following null rows, up to
# the holding-period limit.
merged = merged.with_columns(
    pl.col("alpha").forward_fill(limit=HOLDING_PERIOD_DAYS).over("barrid").alias("alpha")
)


In [15]:
# Rows without an alpha are kept and given a zero alpha rather than being
# filtered out.
merged = merged.with_columns(pl.col('alpha').fill_null(value=0).alias('alpha'))

In [16]:
# Keep only the columns written to the alpha file, ordered by date then barrid,
# then persist them under the selected item's name.
alpha_path = f'/home/porter77/sf_fall_2025/sf-quant-labs/{ITEM}_alphas.parquet'
output_columns = ['date', 'barrid', 'predicted_beta', 'alpha']

merged = merged.select(output_columns).sort(['date', 'barrid'])
merged.write_parquet(alpha_path)
merged


date,barrid,predicted_beta,alpha
date,str,f64,f64
2004-01-02,"""USA1151""",1.754327,0.0
2004-01-02,"""USA11I1""",1.469133,0.0
2004-01-02,"""USA12I1""",0.716027,0.0
2004-01-02,"""USA1371""",0.716083,0.0
2004-01-02,"""USA14R1""",1.20182,0.0
…,…,…,…
2025-10-16,"""USBRJZ1""",1.165327,0.0
2025-10-16,"""USBRKA1""",0.33935,0.0
2025-10-16,"""USBRKA2""",0.270034,0.0
2025-10-16,"""USBRL91""",0.863092,0.0


In [17]:
# Re-read the alpha file for the selected item (same `{ITEM}` naming as the
# writer, instead of a hard-coded "Item 1A") and list any rows whose alpha is
# missing. NaN is checked as well as null because NaN is not null in polars.
# An empty result means the file is clean.
alphas = pl.read_parquet(f'/home/porter77/sf_fall_2025/sf-quant-labs/{ITEM}_alphas.parquet')
alphas.filter(pl.col('alpha').is_null() | pl.col('alpha').is_nan())

date,barrid,predicted_beta,alpha
date,str,f64,f64


In [18]:
from dotenv import load_dotenv
import os

load_dotenv()  # loads variables from .env file into os.environ


from sf_backtester import BacktestRunner, BacktestConfig, SlurmConfig

# Resources requested for the SLURM backtest jobs.
slurm_config = SlurmConfig(
    n_cpus=8,
    mem="32G",
    time="03:00:00",
    mail_type="BEGIN,END,FAIL",
    max_concurrent_jobs=30,
)

# Backtest definition. `data_path` points at the alpha file for the selected
# item: the f-string previously had no placeholder and always read
# "Item 1A_alphas.parquet", regardless of which item was computed.
# NOTE(review): `signal_name` is still hard-coded for Item 1A — update it when
# running a different item.
config = BacktestConfig(
    signal_name="10k-item1a-12-mo-noLBIAS",
    gamma=4200,
    data_path=f"/home/porter77/sf_fall_2025/sf-quant-labs/{ITEM}_alphas.parquet",
    project_root="/home/porter77/sf_fall_2025/sf-quant-labs/labs",
    byu_email="porter77@byu.edu",
    constraints=["ZeroBeta", "ZeroInvestment"],
    slurm=slurm_config,
)

runner = BacktestRunner(config)


# Preview the sbatch script
# print(runner.submit())

# Submit to SLURM
runner.submit()

Preparing backtest for 22 years: 2004-2025
Signal: 10k-item1a-12-mo-noLBIAS
Gamma: 4200
Constraints: ['ZeroBeta', 'ZeroInvestment']
Output directory: /home/porter77/sf_fall_2025/sf-quant-labs/labs/weights/10k-item1a-12-mo-noLBIAS/4200
Job submitted successfully!
sbatch output: Submitted batch job 10411547

