In [1]:
#first, ready the data for analysis. store it in folders.

import csv
import numpy as np
import pandas as pd

#find a way to filter the input data...
# to modify: can add input / output files as fn input/output.
def input_df_from_csv():
    stocks_df = pd.read_csv('data/stocks_daily_raw_10yr.csv')
    print("input data head:")
    print(stocks_df.head())
    
    #filter inputs & recreate index
    stocks_df_5yr = stocks_df[1511:]
    stocks_df_5yr = stocks_df_5yr.reset_index(drop=True)
    print("\n=================\nfiltered (x yr) data head:")
    print(stocks_df_5yr.head())
    
    
    #filter out the nulls
    stocks_ser = stocks_df_5yr.isnull().sum()
    stocks_ser_nonnulls = stocks_ser[stocks_ser <= 20]
    nnullkeys = stocks_ser_nonnulls.keys()
    print("\nNon-null keys: ", nnullkeys)
    nnull_stocks_df = stocks_df_5yr[nnullkeys]
    #print(nnull_stocks_df.head())

    #convert NaN's to 0.
    #"X" should be ready for pca
    X = nnull_stocks_df.iloc[:,1:-1]
    X = X.fillna(0)
    
    print(X.head())
    
    #perform cumulative product:
    X_cp = (1+X).cumprod()
    print("\n=================\ncomputing cumulative returns:")
    print(X_cp.tail())
    X = X_cp
    
    #do a null check: if nulls, throw error?
    print("\n=================\ncheckign for nulls:")
    nullsums = X.isnull().sum()
    print(nullsums[nullsums >0])
    
    #store X in a csv
    X.to_csv('data/input_stocks_data_X.csv')

    
#run (test) the above
input_df_from_csv()

input data head:
         Date         A        AA      AAME       AAN      AAON       AAP  \
0  2010-01-05 -0.010863 -0.031231  0.060606  0.009062 -0.029015 -0.005943   
1  2010-01-06 -0.003553  0.052077  0.000000  0.000000 -0.039670  0.008719   
2  2010-01-07 -0.001297 -0.021214 -0.057143  0.008083  0.037017 -0.000247   
3  2010-01-08 -0.000324  0.024684  0.053030 -0.006681  0.011898  0.003953   
4  2010-01-11  0.000649  0.025264  0.043166  0.015695  0.027096 -0.009843   

   AAT      AAPL      AAWW  ...      ZBRA      ZEUS      ZION      ZIOP  \
0  NaN  0.001729  0.055236  ... -0.001744 -0.006500  0.035259  0.033898   
1  NaN -0.015906 -0.041677  ... -0.007687  0.039251  0.086957  0.009836   
2  NaN -0.001849  0.008284  ... -0.025000 -0.012876  0.112000 -0.012987   
3  NaN  0.006648  0.008216  ... -0.003250  0.031305 -0.016187  0.019737   
4  NaN -0.008821  0.003056  ...  0.003261 -0.027544  0.006094  0.029032   

       ZIXI        ZN  ZNGA      ZOLT      ZUMZ        SP  
0 -0.0449

In [8]:
#next, compute the pca and come up with the eigen-portfolios
# eigen-portfolios should be stored in a pickled list object.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle as pkl

def find_eigen_portfolios(colnames, pcs):
    epfs = []
    epfs_inv = []

    for i in range(len(pcs)):
        #want to inspect & create cutoffs.
        ci = pcs[i]

        #debug
        #print(len(ci))

        maxi = max(ci)
        mini = min(ci)
        print("***** starting with pc #{}".format(i))
        print("max: ", max(ci), "min: ", min(ci))

        #find # correlations greater than half of max, or less than half of min.
        csig_pos = np.arange(len(ci))[ci > maxi/2]
        csig_neg = np.arange(len(ci))[ci < mini/2]
        print("significance lengths")
        print(len(csig_pos))
        print(len(csig_neg))

        #create lists of tickers.
        csig = csig_pos if len(csig_pos) >= len(csig_neg) else csig_neg
        csig_inv = csig_neg if len(csig_pos) >= len(csig_neg) else csig_pos
        ptiks = [colnames[i] for i in csig]
        ptiks_inv = [colnames[i] for i in csig_inv]

        epfs.append(ptiks)
        epfs_inv.append(ptiks_inv)
        print(ptiks[:20])
        
    with open('data/pcs/eigenportfolios.pkl', 'wb') as fout:
        pkl.dump(epfs, fout)
    with open('data/pcs/eigenportfolios_inv.pkl', 'wb') as fout:
        pkl.dump(epfs_inv, fout)
    
    

def compute_pca_eigen_portfolios():
    X = pd.read_csv('data/input_stocks_data_X.csv')
    colnames = list(X.columns)
    print(colnames)
    #scale X
    scaler = StandardScaler()
    scaled_x = scaler.fit_transform(X)
    #perform PCA
    print("\n=================\nPerforming PCA...")
    pca = PCA(n_components=5)
    Xpca = pca.fit_transform(scaled_x)
    print("transformed data (Xpca) shape & values:")
    print(Xpca.shape)
    print(Xpca)
    
    #results analysis:
    #print the variance ratio
    print("\n\nexplained variance: ", pca.explained_variance_ratio_)
    #print the singular values
    print("\nsingular values: ", pca.singular_values_)
    #print pca components and their shape:
    print("\ncomponents shape & values: ", pca.components_.shape)
    print(pca.components_)
    
    #save pc's in a file.
    pc_df = pd.DataFrame(data=pca.components_)
    with open('data/pcs/principal_components.csv', 'w') as fout:
        pc_df.to_csv(fout)
    
    #next, come up with the eigen portfolios as identified by the PC's
    find_eigen_portfolios(colnames, pca.components_)

compute_pca_eigen_portfolios()

['Unnamed: 0', 'A', 'AA', 'AAME', 'AAN', 'AAON', 'AAP', 'AAT', 'AAPL', 'AAWW', 'ABC', 'ABCB', 'AB', 'ABG', 'ABIO', 'ABM', 'ABMC', 'ABMD', 'ABT', 'ACAD', 'ABR', 'ACC', 'ACFN', 'ACCO', 'ACGL', 'ACHC', 'ACIW', 'ACM', 'ACLS', 'ACN', 'ACNB', 'ACOR', 'ACRE', 'ACRX', 'ACTG', 'ACU', 'ACUR', 'ACY', 'ADBE', 'ADC', 'ADES', 'ADI', 'ADM', 'ADP', 'ADS', 'ADSK', 'ADUS', 'ADTN', 'AE', 'AEE', 'AEGN', 'AEHR', 'AEIS', 'AEL', 'AEO', 'AEP', 'AERG', 'AES', 'AEY', 'AFFY', 'AFH', 'AFG', 'AFL', 'AGCO', 'AGEN', 'AGM', 'AGN', 'AGNC', 'AGO', 'AGX', 'AGYS', 'AHC', 'AHL', 'AHPI', 'AHT', 'AI', 'AIG', 'AIMC', 'AIR', 'AIRT', 'AIT', 'AIV', 'AIZ', 'AJG', 'AKR', 'AKAM', 'AKRX', 'AKS', 'ALB', 'ALCO', 'ALE', 'ALGT', 'ALGN', 'ALK', 'ALKS', 'ALL', 'ALG', 'ALEX', 'ALNY', 'ALOT', 'ALSE', 'ALSK', 'ALSN', 'ALV', 'ALXN', 'AMAG', 'AMAT', 'ALX', 'AMBA', 'AMCF', 'AMCX', 'AMD', 'AME', 'AMED', 'AMG', 'AMGN', 'AMIN', 'AMKR', 'AMNB', 'AMOT', 'AMP', 'AMPE', 'AMRB', 'AMRC', 'AMRS', 'AMSC', 'AMSF', 'AMS', 'AMSWA', 'AMT', 'AMTD', 'AMTY', 'A

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
#doublecheck the eigenportfolios found above.
with open('data/pcs/eigenportfolios.pkl', 'rb') as fin:
    epfs = pkl.load(fin)
    
    print(epfs)
print(epfs[0])

[['Unnamed: 0', 'A', 'AAN', 'AAON', 'AAPL', 'ABCB', 'AB', 'ABG', 'ABMD', 'ABT', 'ABR', 'ACFN', 'ACGL', 'ACIW', 'ACLS', 'ACN', 'ACNB', 'ACRE', 'ADBE', 'ADC', 'ADES', 'ADI', 'ADM', 'ADP', 'ADSK', 'ADUS', 'AEE', 'AEL', 'AEO', 'AEP', 'AERG', 'AES', 'AFG', 'AFL', 'AGCO', 'AGM', 'AGNC', 'AGO', 'AGYS', 'AHL', 'AIR', 'AIRT', 'AIT', 'AIV', 'AIZ', 'AJG', 'AKAM', 'ALCO', 'ALE', 'ALGN', 'ALL', 'ALG', 'ALNY', 'ALOT', 'ALSN', 'AMAT', 'AMD', 'AME', 'AMED', 'AMGN', 'AMNB', 'AMOT', 'AMP', 'AMRB', 'AMRC', 'AMSF', 'AMSWA', 'AMT', 'AMTD', 'AMZN', 'ANAT', 'ANGI', 'ANGO', 'ANSS', 'ANV', 'AOS', 'APAM', 'APD', 'APEI', 'APH', 'APO', 'APT', 'ARI', 'ARL', 'ARNA', 'AROW', 'ARQL', 'ARR', 'ARTNA', 'ARE', 'ARW', 'ARWR', 'ASH', 'ASRV', 'ATI', 'ATO', 'ATR', 'ATLO', 'ATRO', 'ATRS', 'ATRC', 'ATRI', 'ATVI', 'ATW', 'AUBN', 'AVA', 'AVAV', 'AVB', 'AVGO', 'AVNW', 'AVX', 'AVY', 'AWI', 'AWK', 'AXP', 'AZPN', 'B', 'BA', 'BAC', 'BAH', 'BANF', 'BANR', 'BASI', 'BAX', 'BBSI', 'BBX', 'BC', 'BCC', 'BCO', 'BCOR', 'BCPC', 'BDN', 'BDR', 

In [3]:
'''
method for encouraging diversity:
    - match each portfolio with the eigenportfolios and determine its similarity.

*> a better method might be to calculate squared sum of all the proper factor loadings.
    - then normalize the resulting vector. This might give you the ratio of the variance.

> or: perform a change of basis from portfolio market performance to principal components?
'''

#potentially todo: have a different input path for the principal components?
def diversity_recommendation():
    #first, identify the factor loadings of the portfolio stocks in each principal component.
    pcs = pd.read_csv('data/pcs/principal_components.csv')
    print(pcs.head())
    np_pcs = pcs.iloc[:,1:].to_numpy()

    #find all the right factor loadings in this result.
    portfolio = ['AAPL', 'AMZN', 'MSFT', 'INTC']

    X = pd.read_csv('data/input_stocks_data_X.csv')
    colnames = list(X.columns)
    #find index of all portfolio elements
    pf_inds = [colnames.index(p) for p in portfolio]
    print(pf_inds)

    #find the factor loadings of all pcs. Then do a squared sum.
    facloads = np_pcs[:, pf_inds]
    print("factor loadings: ", facloads)
    variances = (facloads**2).sum(1) #squares & sums across rows
    norm_vcs = variances / (sum(variances))
    print("\n")
    print("normalized var's in all pc's: ", norm_vcs)

    #print diff between correspondence in this pf, and overall market. Capture the max diff.
    mkt_vars = [0.48753289, 0.1872592,  0.07927921, 0.04877565, 0.03629214]
        ## TODO: NEED TO DO THIS^^ FOR THE COMPUTED PCA VARIANCES.
    missing_var = mkt_vars-norm_vcs
    print("where is pf variance behind market: ", missing_var)

    ### == return this result! == ###
    print("eigen pf to invest in: ", np.argmax(missing_var))
    print("eigen pf diversity weight: ", np.max(missing_var))

#run the above
diversity_recommendation()

   Unnamed: 0         0         1         2         3         4         5  \
0           0 -0.030085 -0.029522 -0.007507  0.027393 -0.028865 -0.027047   
1           1  0.010617  0.000309 -0.045032 -0.011908  0.011115  0.011630   
2           2 -0.001024 -0.003677  0.017107 -0.004644 -0.006589 -0.024006   
3           3  0.001450 -0.011142 -0.014567 -0.008891 -0.006681  0.008691   
4           4  0.005724  0.011646 -0.003187  0.005127  0.004913  0.003913   

          6         7         8  ...      2136      2137      2138      2139  \
0  0.000217 -0.014549 -0.029255  ... -0.009541  0.028054 -0.028925  0.007041   
1  0.030397  0.030052  0.004319  ... -0.032078 -0.006984  0.014673 -0.030121   
2  0.012653 -0.035652 -0.003941  ...  0.031274  0.004798  0.001098  0.004616   
3  0.066173  0.028177 -0.003515  ... -0.034547  0.012274  0.003193  0.035114   
4 -0.014199  0.007112 -0.015143  ... -0.006871 -0.014853 -0.000548 -0.039419   

       2140      2141      2142      2143      2144     

In [29]:
mkt_vars = [0.48753289, 0.1872592,  0.07927921, 0.04877565, 0.03629214]
print(mkt_vars)
print(norm_vcs)
print(mkt_vars-norm_vcs)

#===================#
missing_var = mkt_vars-norm_vcs
print(np.argmax(missing_var))

[0.48753289, 0.1872592, 0.07927921, 0.04877565, 0.03629214]
[0.69527492 0.06633969 0.08390617 0.01232904 0.14215018]
[-0.20774203  0.12091951 -0.00462696  0.03644661 -0.10585804]
1


In [18]:
pcs.iloc[:, 1:].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2136,2137,2138,2139,2140,2141,2142,2143,2144,2145
0,-0.030085,-0.029522,-0.007507,0.027393,-0.028865,-0.027047,0.000217,-0.014549,-0.029255,-0.002692,...,-0.009541,0.028054,-0.028925,0.007041,-0.027109,0.020328,-0.022529,0.004607,-0.027233,-0.021164
1,0.010617,0.000309,-0.045032,-0.011908,0.011115,0.01163,0.030397,0.030052,0.004319,-0.043061,...,-0.032078,-0.006984,0.014673,-0.030121,-0.021818,0.00385,0.021547,-0.036095,0.017528,0.01304
2,-0.001024,-0.003677,0.017107,-0.004644,-0.006589,-0.024006,0.012653,-0.035652,-0.003941,0.025153,...,0.031274,0.004798,0.001098,0.004616,0.003254,-0.036777,-0.011043,0.015351,-0.015193,0.005781
3,0.00145,-0.011142,-0.014567,-0.008891,-0.006681,0.008691,0.066173,0.028177,-0.003515,0.004922,...,-0.034547,0.012274,0.003193,0.035114,0.005367,-0.012382,0.012039,-0.038595,-0.008438,0.017386
4,0.005724,0.011646,-0.003187,0.005127,0.004913,0.003913,-0.014199,0.007112,-0.015143,0.020147,...,-0.006871,-0.014853,-0.000548,-0.039419,-0.002646,-0.00926,0.03182,-0.010483,-0.000283,-0.051815
