In [1]:
import os
import random
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd

from pathmgmt import pathmgmt as myPath

In [2]:
# Combine the statistics files of all alphas into one table
def get_stat(plot_dir=None):
    """Collect every per-run statistics CSV into a single DataFrame.

    Walks ``<plot_dir>/<folder>/statistics/`` for each entry ``<folder>`` of
    ``plot_dir`` and reads every file found there with ``pd.read_csv``.

    Parameters
    ----------
    plot_dir : path-like, optional
        Root directory to scan. Defaults to ``myPath.PLOT_DIR``.

    Returns
    -------
    pd.DataFrame
        One row per statistics file, indexed by
        ``"<folder>;<file name without its last 15 characters>"``
        (presumably a fixed suffix such as ``-statistics.csv`` -- confirm).
        An empty DataFrame is returned when no statistics file is found.

    Notes
    -----
    Each file is expected to hold exactly one data row: the index assignment
    below uses a single label and fails for any other row count.
    Entries of ``plot_dir`` that have no ``statistics`` sub-folder are skipped.
    """
    root = myPath.PLOT_DIR if plot_dir is None else Path(plot_dir)
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order stable across machines and re-runs.
    for folderName in sorted(os.listdir(root)):
        folder = root/folderName/'statistics'
        if not os.path.isdir(folder):
            continue
        for fileName in sorted(os.listdir(folder)):
            df = pd.read_csv(folder/fileName)
            df.index = [f'{folderName};{fileName[:-15]}']
            frames.append(df)
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames)

In [207]:
# Build the combined statistics table (one row per statistics file).
stats = get_stat()

In [208]:
# Distribution of each metric across all loaded rows.
stats.describe()

Unnamed: 0,Annualized Return,Annualized Excess Return,IR,IR long only,IC,Max Drawdown,Max Drawdown long only,daily Turnover
count,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0
mean,0.044428,-0.068282,0.564155,-1.168306,0.003999,0.159505,0.325737,0.164064
std,0.107357,0.065585,1.295154,1.3561,0.008709,0.141975,0.140119,0.177475
min,-0.251808,-0.270461,-2.642853,-5.404325,-0.0186,0.01159,0.107087,0.01422
25%,-0.022203,-0.104098,-0.365392,-1.66738,-0.002467,0.069693,0.212837,0.050164
50%,0.054708,-0.052383,0.857209,-0.697302,0.006858,0.102951,0.300963,0.100103
75%,0.11404,-0.01982,1.536477,-0.185297,0.010554,0.191944,0.419948,0.209235
max,0.313484,0.063304,3.158405,0.855123,0.019801,0.772929,0.762976,1.236467


> We first select the alphas whose IR is greater than 2.

In [209]:
# Identifiers of the rows whose IR is strictly above 2.
good_alphas = stats.index[stats['IR'] > 2].tolist()

In [210]:
# Show the selected alpha identifiers.
good_alphas

['alpha.YaoReV001-10days-zz1000;20180101-20201231-Rank-Industry-holding20days',
 'alpha.YaoReV001-10days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding20days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding10days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding1days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding20days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding3days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding5days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding10days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding1days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding3days',
 'alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding5days',
 'alpha.YaoReV002-10days-20days-zz10

In [211]:
# Number of alphas that passed the IR filter.
len(good_alphas)

84

In [212]:
# Combine the daily PnL series of all alphas into one table
def get_pnl(plot_dir=None):
    """Collect the ``pnl`` column of every PnL CSV into one wide DataFrame.

    Walks ``<plot_dir>/<folder>/PnL_results/`` for each entry ``<folder>`` of
    ``plot_dir``. Every file must provide ``time`` and ``pnl`` columns; any
    other columns are ignored.

    Parameters
    ----------
    plot_dir : path-like, optional
        Root directory to scan. Defaults to ``myPath.PLOT_DIR``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``time`` (sorted ascending), one column per file named
        ``"<folder>;<file name without its last 13 characters>"``. Times
        missing from a file are NaN in that file's column. An empty
        DataFrame (index named ``time``) is returned when no file is found.

    Notes
    -----
    Files are aligned on ``time`` with an outer join, so ``time`` values are
    expected to be unique within each file.
    Entries of ``plot_dir`` that have no ``PnL_results`` sub-folder are skipped.
    """
    root = myPath.PLOT_DIR if plot_dir is None else Path(plot_dir)
    series = []
    # Sorted listings keep the column order stable across re-runs.
    for folderName in sorted(os.listdir(root)):
        folder = root/folderName/'PnL_results'
        if not os.path.isdir(folder):
            continue
        for fileName in sorted(os.listdir(folder)):
            df = pd.read_csv(folder/fileName)
            column = f'{folderName};{fileName[:-13]}'
            series.append(df.set_index('time')['pnl'].rename(column))
    if not series:
        return pd.DataFrame(index=pd.Index([], name='time'))
    # A single outer-join concat replaces the per-file merge. It also keeps
    # files that contain a header but no rows, which an `.empty` sentinel on
    # the accumulator would discard.
    res = pd.concat(series, axis=1).sort_index()
    res.index.name = 'time'
    return res

In [213]:
# Build the daily PnL table: one column per PnL file, indexed by time.
pnls = get_pnl()

> Compute the correlation matrix of the daily PnL of the good alpha candidates.

In [214]:
# Pairwise correlation between the PnL columns of the selected alphas.
corr = pnls.loc[:, good_alphas].corr()

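> We want to select alphas that are weakly correlated with each other. For each candidate, count how many other candidates have an absolute correlation below 0.6: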

In [215]:
# For every alpha, count the other alphas whose |correlation| with it is below 0.6.
(
    corr
    .mask(np.eye(len(corr), dtype=bool))  # blank the diagonal so self-correlation is not counted
    .abs()
    .lt(0.6)
    .sum()
)

alpha.YaoReV001-10days-zz1000;20180101-20201231-Rank-Industry-holding20days                      15
alpha.YaoReV001-10days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding20days    29
alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding10days                      14
alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding1days                        5
alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding20days                      37
                                                                                                 ..
alpha.YaoReV007-20days-zz1000;20180101-20201231-Rank-Industry-holding3days                        6
alpha.YaoReV007-20days-zz1000;20180101-20201231-Rank-Industry-holding5days                        7
alpha.YaoReV007-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding10days    39
alpha.YaoReV007-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding3days     31


In [216]:
# small_corr = (corr.mask(np.eye(len(corr), dtype=bool)).abs() < 0.6).sum() >= 5
# corr.loc[small_corr, small_corr]

> Find one set of alphas whose pairwise absolute correlations are all at most 0.6.

In [217]:
def _find_low_corr_set(corr, candidates, num, max_abs_corr=0.6):
    """Return the first `num` candidates that are mutually weakly correlated.

    Subsets are tried in ``itertools.combinations`` order over `candidates`.
    A subset qualifies when none of its off-diagonal absolute correlations
    exceeds `max_abs_corr`; NaN correlations never exceed it.

    Parameters
    ----------
    corr : pd.DataFrame
        Square correlation matrix labelled by candidate on both axes.
    candidates : iterable
        Labels to choose from, in the order they should be tried.
    num : int
        Size of the subset to find.
    max_abs_corr : float, default 0.6
        Largest absolute correlation allowed between two members.

    Returns
    -------
    tuple or None
        The labels of the first qualifying subset, or None if there is none.
    """
    labels = list(candidates)
    # Threshold the matrix once so each subset test is a small boolean lookup
    # instead of a fresh DataFrame slice.
    too_high = corr.loc[labels, labels].abs().to_numpy() > max_abs_corr
    np.fill_diagonal(too_high, False)  # ignore each alpha's correlation with itself
    for positions in combinations(range(len(labels)), num):
        picked = list(positions)
        if not too_high[np.ix_(picked, picked)].any():
            return tuple(labels[i] for i in positions)
    return None


res = []  # not used by this search; kept in case a later cell relies on it
num = 4
# Seeded, non-mutating shuffle: re-running the cell gives the same set and
# leaves `good_alphas` in its original order.
shuffled_alphas = random.Random(0).sample(good_alphas, len(good_alphas))
cols = _find_low_corr_set(corr, shuffled_alphas, num)
if cols is None:
    # Fail loudly rather than letting later cells use a set that violates
    # the correlation cap.
    raise ValueError(
        f'no set of {num} alphas has all pairwise |correlation| <= 0.6')
corr_small = corr.loc[list(cols), list(cols)]

In [219]:
# Correlation matrix restricted to the selected set.
corr_small

Unnamed: 0,alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding20days,alpha.YaoReV006-20days-zz1000;20180101-20201231-Rank-Industry-holding5days,alpha.YaoReV005-10days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding60days,alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding1days
alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding20days,1.0,0.540033,0.458964,0.547046
alpha.YaoReV006-20days-zz1000;20180101-20201231-Rank-Industry-holding5days,0.540033,1.0,0.425605,0.584366
alpha.YaoReV005-10days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding60days,0.458964,0.425605,1.0,0.595894
alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding1days,0.547046,0.584366,0.595894,1.0


In [220]:
# num = 5
# random.shuffle(good_alphas)
# for cols in combinations(good_alphas, num):
#     corr_small = corr.loc[cols, cols]
#     if (~(corr_small.mask(np.eye(len(corr_small), dtype=bool)).abs() > 0.6).any()).sum() == num:
#         break

> One of our choices for this batch is as follows:

In [221]:
# Labels of the selected set, taken from the correlation sub-matrix.
selected = list(cols)
batch1 = corr.loc[selected, selected].columns

In [222]:
# Statistics of the selected alphas, one column per alpha.
stats.loc[batch1].T

Unnamed: 0,alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry-holding20days,alpha.YaoReV006-20days-zz1000;20180101-20201231-Rank-Industry-holding5days,alpha.YaoReV005-10days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding60days,alpha.YaoReV001-20days-zz1000;20180101-20201231-Rank-Industry_with_weighted_cap-holding1days
Annualized Return,0.147113,0.197159,0.062174,0.306856
Annualized Excess Return,0.011869,-0.033943,0.015858,0.017413
IR,2.079685,2.809481,2.098212,2.244268
IR long only,0.134,-0.756335,0.086331,0.194662
IC,0.013303,0.014905,0.009396,0.019425
Max Drawdown,0.074698,0.051291,0.024678,0.143059
Max Drawdown long only,0.197889,0.157178,0.392852,0.134623
daily Turnover,0.068344,0.137743,0.022273,0.307028
