In [31]:
#===================================================
#
# rank.ipynb: Ranks stocks by their relevance
#             to the user's portfolio
#
# Original: Vedaad Shakib (10/10/17)
#===================================================

import numpy as np 
import pandas as pd
from IPython.core.display import display
from datetime import datetime 
%run 'variance.ipynb'
%run 'utils.ipynb'

In [32]:
# User portfolio: ticker symbol -> fraction of the portfolio held in that stock
port = dict(AAPL=0.3, YHOO=0.6, MSFT=0.1)

In [33]:
# Load in financial data
df = pd.read_csv("prices-split-adjusted.csv")

# Parse the 'YYYY-MM-DD' date strings into datetime values for easier comparison.
# pd.to_datetime converts the whole column in one vectorized call instead of
# invoking datetime.strptime from Python once per row.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

In [34]:
# calculate stock features
# TODO: add the rest of the features to the dataframe
#
# Keep the symbols in a sorted list rather than a set: the iteration order of a
# set of strings changes between interpreter sessions (string hashes are
# randomized), so the row order of stock_feat and the order in which later cells
# visit the stocks would otherwise differ from run to run.
# Missing symbols are dropped because NaN cannot be sorted alongside strings.
stocks = sorted(set(df["symbol"].dropna().tolist()))
stock_feat = pd.DataFrame({"variance": variance(stocks, df)}, index=stocks)

In [35]:
# Feature table for the holdings in the user's portfolio, one row per symbol
# TODO: add the rest of the features to the dataframe
port_feat = pd.DataFrame(
    data={"variance": variance(port, df)},
    index=list(port.keys()),
)

In [36]:
# Calculate a cost for every stock outside the portfolio and rank the stocks by it.
#
# The cost of a candidate stock is the sum, over the stocks held in the portfolio,
# of the Euclidean distance between the candidate's features and the held stock's
# features, weighted by the held stock's proportion of the portfolio.
# A lower cost ranks higher.

# The portfolio feature vectors do not depend on the candidate, so extract them
# once up front instead of once per candidate.
# Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
port_vectors = {key: port_feat.loc[key].to_numpy() for key in port}

cost = {}  # candidate symbol -> cost
min_dist = float("inf")  # lowest cost seen so far
opt_stock = None  # symbol with the lowest cost; stays None when there are no candidates
for s in stocks:
    if s in port:
        continue  # stocks already in the portfolio are not candidates
    sf = stock_feat.loc[s].to_numpy()  # candidate feature vector
    # val = weight of the held stock in relation to the rest of the portfolio
    dist = sum(val * euclid_dist(port_vectors[key], sf) for key, val in port.items())
    if dist < min_dist:
        min_dist = dist
        opt_stock = s
    cost[s] = dist

ranking = sorted(cost, key=cost.get)  # symbols ordered by ascending cost

# print the ten best-ranked symbols, one "rank<TAB>symbol" pair per line
print("\n".join("%s\t%s" % (i, sym) for i, sym in enumerate(ranking[:10])))

0	PNW
1	SWN
2	BBBY
3	NTRS
4	COH
5	WFM
6	CL
7	ICE
8	NWL
9	JPM
