# Quality Factor Distribution

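This notebook uses `cb_cache` to load the `QUALITY` factor for all stocks on a single date, summarizes its cross-sectional distribution, and runs a quick information-coefficient (IC) check: the Spearman rank correlation between `QUALITY` and the forward return over the next 20 dates in `daily.hdf`.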


In [None]:
import sys
import h5py as h5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.insert(0, '/home/ubuntu/code/cb_cache')
import cb_cache as cbc

# Input locations: the HDF5 file that holds the date/symbol axes, and the root
# directory passed to cb_cache as `root_path`.
H5_PATH = 'daily.hdf'
ROOT_PATH = '.'

# Number of dates that must remain after target_date so that a forward-return
# window of this length still fits inside daily.hdf (the IC check at the end of
# the notebook looks 20 dates ahead).
MIN_FORWARD_DATES = 20

# Read both axes fully into memory so the file can be closed immediately.
with h5.File(H5_PATH, 'r') as f:
    dates = f['dates'][:]
    # Symbols are stored as bytes; decode them to str.
    symbols = [s.decode('utf-8') for s in f['symbols'][:]]

if len(dates) == 0:
    raise ValueError(f'{H5_PATH} contains no dates')

# Choose a date present in daily.hdf.
# Using the very last date leaves no future dates, which makes any
# forward-return check impossible. Step back MIN_FORWARD_DATES entries when the
# history is long enough; otherwise fall back to the last available date.
# NOTE(review): assumes `dates` is stored in ascending order — confirm.
if len(dates) > MIN_FORWARD_DATES:
    target_date = int(dates[-(MIN_FORWARD_DATES + 1)])
else:
    target_date = int(dates[-1])
print('target_date', target_date, 'num_symbols', len(symbols))


In [None]:
# Load QUALITY for the single target date and flatten it to a 1-D array.
# NOTE(review): assumes the flattened array has one entry per symbol, in the
# same order as `symbols`; pd.Series raises if the length differs — confirm
# the ordering against cb_cache.
cache = cbc.EqCache(symbols, target_date, target_date, root_path=ROOT_PATH)
quality = cache.daily.QUALITY.reshape(-1)

q_series = pd.Series(quality, index=symbols, name='QUALITY')

# Treat +/-inf exactly like NaN so that every statistic below is computed from
# the same definition of "unusable value".
q_clean = q_series.replace([np.inf, -np.inf], np.nan)
q_valid = q_clean.dropna()

# The missing fraction is taken from the cleaned series so it always agrees
# with the coverage count printed next to it (infinite values count as missing).
missing_frac = q_clean.isna().mean()
print('coverage', len(q_valid), '/', len(q_series), 'missing', missing_frac)
print(q_valid.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]))


In [None]:
# Histogram of the usable (finite, non-missing) QUALITY values for the chosen
# date, drawn on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(8, 4))
q_valid.hist(bins=60, ax=ax)
ax.set_title(f'QUALITY distribution on {target_date}')
ax.set_xlabel('QUALITY')
ax.set_ylabel('count')
# Soften the grid lines that the pandas histogram turns on by default.
ax.grid(alpha=0.2)
plt.show()


In [None]:
# Quick IC check vs forward returns
# IC here is the Spearman rank correlation between QUALITY on target_date and
# the return from target_date to the date `forward_days` entries later.
forward_days = 20

# Locate target_date on the date axis; abort with a clear message instead of an
# opaque IndexError when the chosen date is not in daily.hdf.
matches = np.flatnonzero(dates == target_date)
if matches.size == 0:
    raise SystemExit(f'target_date {target_date} not found in dates')
date_idx = int(matches[0])
if date_idx + forward_days >= len(dates):
    raise SystemExit('Not enough future dates for IC check')
end_date = int(dates[date_idx + forward_days])

cache_fwd = cbc.EqCache(symbols, target_date, end_date, root_path=ROOT_PATH)
close = cache_fwd.daily.close
# NOTE(review): assumes `close` is indexed (date, symbol) with the first row at
# target_date and the last row at end_date — confirm against cb_cache.
ret_fwd = close[-1, :] / close[0, :] - 1.0
ret_series = pd.Series(ret_fwd, index=symbols, name='ret_fwd')

# Keep only symbols where both the factor and the forward return are finite.
df = pd.concat([q_series, ret_series], axis=1).replace([np.inf, -np.inf], np.nan).dropna()
ic = df['QUALITY'].corr(df['ret_fwd'], method='spearman')
# Report the number of paired observations so a thin cross-section is visible
# alongside the correlation it produced.
print('forward_days', forward_days, 'end_date', end_date, 'ic_spearman', ic, 'n_obs', len(df))
