In [17]:
"""
AUTHOR: Pranav Parthasarathy
        Vedaad Shakib
        Shubrakanti Ganguli
        Alexander Bondarenko
FUNCTION: Combine and standardize all features into one notebook, perform PCA on these features
"""
import pandas as pd
import numpy as np
import urllib as u
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import lxml

from IPython.display import display

In [137]:
# Number of closest securities to include in the final ranking.
NUM_RANKED = 20

In [18]:
# Load the four raw input tables; paths are relative to the working directory.
constituents, fundamentals, prices_split_adjusted, securities = [
    pd.read_csv(csv_name)
    for csv_name in (
        "constituents.csv",
        "fundamentals.csv",
        "prices-split-adjusted.csv",
        "securities.csv",
    )
]

In [19]:
# Sample portfolio: maps each held ticker symbol to its weight
# (the two weights here sum to 1).
portfolio = {'AAPL': 0.5, 'MSFT': 0.5}

In [21]:
# Keep only the ticker column of the fundamentals table.
fundamentals = fundamentals[['Ticker Symbol']]

# Tickers that are present in all four input tables.
universe = set.intersection(
    set(securities['Ticker symbol']),
    set(prices_split_adjusted['symbol']),
    set(fundamentals['Ticker Symbol']),
    set(constituents['Symbol']),
)

In [22]:
def historical_volatility(df):
    """Return the sum of squared log returns of ``df.close``.

    Consecutive entries of ``df.close`` are treated as successive prices, so
    the caller is responsible for passing the rows in the intended order.
    """
    log_prices = np.log(np.array(df.close))
    # Log return between each price and the one before it.
    log_returns = log_prices[1:] - log_prices[:-1]
    return np.sum(np.square(log_returns))
    
def covariance(df1, df2):
    """Return the sum of products of the two series' log returns on shared dates.

    Both frames need ``date`` and ``close`` columns. Rows are aligned with an
    inner join on ``date`` and ordered by date before log returns are taken, so
    the inputs may cover different periods, miss individual dates, or arrive
    unsorted. Merely slicing both frames to a common start/end date is not
    enough: a date missing from one side would yield arrays of different
    lengths, or pair up returns that belong to different days.

    Returns 0.0 when fewer than two dates are shared.
    """
    aligned = df1[['date', 'close']].merge(
        df2[['date', 'close']], on='date', how='inner', suffixes=('_1', '_2')
    ).sort_values('date')
    log_p1 = np.log(np.array(aligned['close_1'], dtype=float))
    log_p2 = np.log(np.array(aligned['close_2'], dtype=float))
    lr1 = log_p1[1:] - log_p1[:-1]
    lr2 = log_p2[1:] - log_p2[:-1]
    return np.sum(lr1 * lr2)

In [23]:
# Price history of every portfolio holding, ordered by date.
portfolio_prices = {
    held: prices_split_adjusted[prices_split_adjusted.symbol == held].sort_values('date')
    for held in portfolio
}

vol_records = []
for s in universe:
    # sort_values() returns a new frame. Sorting the filtered slice in place
    # (inplace=True) is what raised pandas' SettingWithCopyWarning.
    df = prices_split_adjusted[prices_split_adjusted.symbol == s].sort_values('date')
    vol_records.append({
        'symbol': s,
        'volatility': historical_volatility(df),
        # Weighted sum of this security's covariance with each holding.
        'added_volatility': sum(
            weight * covariance(df, portfolio_prices[held])
            for held, weight in portfolio.items()
        ),
    })
vol = pd.DataFrame(vol_records, columns=['symbol', 'volatility', 'added_volatility'])
vol

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,symbol,volatility,added_volatility
0,TDG,0.228652,0.057818
1,ALK,0.758006,0.202688
2,LRCX,0.666669,0.227421
3,CPB,0.200839,0.070431
4,K,0.181806,0.067238
5,UTX,0.274869,0.153107
6,XOM,0.253286,0.131806
7,AAPL,0.482109,0.324397
8,CMA,0.612700,0.194471
9,NTAP,0.731537,0.205115


In [None]:
# Inspect the available column names of the securities table.
securities.columns

In [24]:
# Keep only the ticker and the two GICS classification columns.
securities = securities[['Ticker symbol', 'GICS Sector', 'GICS Sub Industry']]

In [25]:
def merge(keep_key, dfs_by_key):
    """Inner-join several frames whose key columns have different names.

    Parameters
    ----------
    keep_key : str
        Name of the key column that should remain in the result; every other
        key column listed in ``dfs_by_key`` is removed.
    dfs_by_key : list of (str, DataFrame)
        ``(key column name, frame)`` pairs. The first frame is the base; each
        later frame is joined by matching the base's key column against that
        frame's own key column.

    Returns
    -------
    DataFrame
        A new frame. The base frame is copied first, so it is not modified.
    """
    base_key, base_df = dfs_by_key[0]
    result = base_df.copy()
    for other_key, other_df in dfs_by_key[1:]:
        result = result.merge(other_df, left_on=base_key, right_on=other_key, how='inner')
        if other_key != keep_key:
            del result[other_key]
    if base_key != keep_key:
        del result[base_key]
    return result

In [26]:
# Join the volatility table with the constituents and securities tables on
# their respective ticker columns, keeping only the 'symbol' key column.
df = merge('symbol', [('symbol', vol), ('Symbol', constituents), ('Ticker symbol', securities)])
df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry
0,TDG,0.228652,0.057818,TransDigm Group,Industrials,Industrials,Aerospace & Defense
1,ALK,0.758006,0.202688,Alaska Air Group Inc,Industrials,Industrials,Airlines
2,LRCX,0.666669,0.227421,Lam Research,Information Technology,Information Technology,Semiconductor Equipment
3,CPB,0.200839,0.070431,Campbell Soup,Consumer Staples,Consumer Staples,Packaged Foods & Meats
4,K,0.181806,0.067238,Kellogg Co.,Consumer Staples,Consumer Staples,Packaged Foods & Meats


In [27]:
# Keep a reference to the merged volatility table under its own name;
# `df` is rebound to a different frame in the following cell.
volatility_df = df

In [28]:
df_sec = pd.read_csv("securities.csv")
df_fund = pd.read_csv("fundamentals.csv")
df_price = pd.read_csv("prices.csv")

# One close per symbol: the last row for that symbol in file order.
df_price = df_price[['symbol', 'close']].drop_duplicates(subset='symbol', keep="last")

# One EPS figure per ticker: the last fundamentals row for that ticker.
df = df_fund[['Ticker Symbol', 'Earnings Per Share']].copy()
df.columns = ['Ticker', 'EPS']
df = df.drop_duplicates(subset='Ticker', keep="last")

# Placeholder columns, filled in by the next cell.
df["Industry"] = np.nan
df["Price"] = np.nan

In [29]:
# Lookup tables indexed by ticker. On duplicate tickers the last row wins,
# which is what overwriting row by row would also end up with.
sector_by_ticker = (
    df_sec.drop_duplicates(subset='Ticker symbol', keep='last')
    .set_index('Ticker symbol')['GICS Sector']
)
close_by_ticker = (
    df_price.drop_duplicates(subset='symbol', keep='last')
    .set_index('symbol')['close']
)

# Vectorized lookups: tickers without a match become NaN and are removed by
# dropna() below. This replaces per-row loops whose bare `except: pass`
# would also have hidden genuine errors.
df["Industry"] = df["Ticker"].map(sector_by_ticker)
df["Price"] = df["Ticker"].map(close_by_ticker)

# A price/earnings ratio is only meaningful for positive earnings. Zero EPS
# is excluded as well, since dividing by it gives an infinite P/E that would
# distort the industry average.
df = df[df.EPS > 0].copy()
df["PE"] = (df['Price'] / df['EPS']).round(2)
df = df.dropna()

# Calculate industry averages
industries_dict = {sector: np.mean(df[df.Industry == sector].PE.values) for sector in df.Industry.unique()}
df.head()

Unnamed: 0,Ticker,EPS,Industry,Price,PE
3,AAL,11.39,Industrials,46.689999,4.1
7,AAP,6.45,Consumer Discretionary,169.119995,26.22
11,AAPL,8.35,Information Technology,115.82,13.87
15,ABBV,3.15,Health Care,62.619999,19.88
19,ABC,6.73,Health Care,78.190002,11.62


In [30]:
# Preview the merged volatility table.
volatility_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry
0,TDG,0.228652,0.057818,TransDigm Group,Industrials,Industrials,Aerospace & Defense
1,ALK,0.758006,0.202688,Alaska Air Group Inc,Industrials,Industrials,Airlines
2,LRCX,0.666669,0.227421,Lam Research,Information Technology,Information Technology,Semiconductor Equipment
3,CPB,0.200839,0.070431,Campbell Soup,Consumer Staples,Consumer Staples,Packaged Foods & Meats
4,K,0.181806,0.067238,Kellogg Co.,Consumer Staples,Consumer Staples,Packaged Foods & Meats


In [31]:
# Attach the per-ticker EPS / Industry / Price / PE columns. The inner join
# keeps only symbols present in both frames; both key columns ('symbol' and
# 'Ticker') remain in the result.
volatility_pe_df = volatility_df.merge(df, how='inner', left_on=['symbol'], right_on=['Ticker'])

In [32]:
# Preview the combined volatility / P-E table.
volatility_pe_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry,Ticker,EPS,Industry,Price,PE
0,TDG,0.228652,0.057818,TransDigm Group,Industrials,Industrials,Aerospace & Defense,TDG,2.77,Industrials,248.960007,89.88
1,ALK,0.758006,0.202688,Alaska Air Group Inc,Industrials,Industrials,Airlines,ALK,6.61,Industrials,88.730003,13.42
2,LRCX,0.666669,0.227421,Lam Research,Information Technology,Information Technology,Semiconductor Equipment,LRCX,5.75,Information Technology,105.730003,18.39
3,CPB,0.200839,0.070431,Campbell Soup,Consumer Staples,Consumer Staples,Packaged Foods & Meats,CPB,1.82,Consumer Staples,60.470001,33.23
4,K,0.181806,0.067238,Kellogg Co.,Consumer Staples,Consumer Staples,Packaged Foods & Meats,K,1.74,Consumer Staples,73.709999,42.36


In [34]:
def get_standarized_PE(ticker, pe=None, industry=None):
    """Return a P/E ratio's relative deviation from its industry average.

    The result is ``(PE - avg) / avg``, where ``avg`` is the industry's entry
    in the module-level ``industries_dict``.

    Parameters
    ----------
    ticker : str
        Ticker to look up in the module-level ``df``. It is only used when
        ``pe`` or ``industry`` is not supplied.
    pe : float, optional
        P/E ratio to use instead of the stored one. A value of 0 is accepted.
    industry : str, optional
        Industry name; must be a key of ``industries_dict``.

    Raises
    ------
    AssertionError
        If the lookup is needed and ``ticker`` is not in ``df``, or if the
        supplied ``industry`` is not a key of ``industries_dict``.
    """
    # Compare against None explicitly so that a legitimate pe of 0 is not
    # mistaken for "not supplied".
    if pe is None or industry is None:
        assert ticker in df.Ticker.values, "Please provide price and Industry"
        data = df.loc[df["Ticker"] == ticker]
        indus = data["Industry"].values[0]
        PE = data["PE"].values[0]
        avg = industries_dict[indus]
        return (PE - avg)/avg
    else:
        # This check used to reference an undefined name and a misspelled
        # method, so this branch could never succeed.
        assert industry in industries_dict.keys(), "You're sector needs to be a GICS Sector"
        avg = industries_dict[industry]
        return (pe - avg)/avg

In [35]:
# Relative P/E deviation for every merged symbol, looked up by ticker.
col = [get_standarized_PE(symbol) for symbol in volatility_pe_df['symbol']]
volatility_pe_df['std_pe'] = np.array(col)

In [36]:
# Preview the table with the new std_pe column.
volatility_pe_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry,Ticker,EPS,Industry,Price,PE,std_pe
0,TDG,0.228652,0.057818,TransDigm Group,Industrials,Industrials,Aerospace & Defense,TDG,2.77,Industrials,248.960007,89.88,2.446119
1,ALK,0.758006,0.202688,Alaska Air Group Inc,Industrials,Industrials,Airlines,ALK,6.61,Industrials,88.730003,13.42,-0.485459
2,LRCX,0.666669,0.227421,Lam Research,Information Technology,Information Technology,Semiconductor Equipment,LRCX,5.75,Information Technology,105.730003,18.39,-0.37389
3,CPB,0.200839,0.070431,Campbell Soup,Consumer Staples,Consumer Staples,Packaged Foods & Meats,CPB,1.82,Consumer Staples,60.470001,33.23,0.231245
4,K,0.181806,0.067238,Kellogg Co.,Consumer Staples,Consumer Staples,Packaged Foods & Meats,K,1.74,Consumer Staples,73.709999,42.36,0.569531


In [44]:
fifty_two_week_df = pd.read_csv("prices-split-adjusted.csv")
fifty_two_week_df['date'] = pd.to_datetime(fifty_two_week_df['date'])

# Keep only rows dated after 2015-12-31 (one year's worth of data is needed).
fifty_two_week_df = fifty_two_week_df.loc[fifty_two_week_df['date'] > '2015-12-31']

# Highest and lowest close per symbol, with symbols in order of first appearance.
close_by_symbol = fifty_two_week_df.groupby(['symbol'], sort=False)['close']
final = pd.concat([close_by_symbol.max(), close_by_symbol.min()], axis=1).reset_index()
final.columns = ['symbol', 'high', 'low']

# Attach the high/low columns to the volatility / P-E table.
fifty_two_week_volatility_pe_df = final.merge(
    volatility_pe_df, how='inner', left_on=['symbol'], right_on=['symbol'])
fifty_two_week_volatility_pe_df.head()

Unnamed: 0,symbol,high,low,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry,Ticker,EPS,Industry,Price,PE,std_pe
0,AAL,49.639999,25.27,1.642028,0.223905,American Airlines Group,Industrials,Industrials,Airlines,AAL,11.39,Industrials,46.689999,4.1,-0.842801
1,AAP,176.779999,136.190002,0.519604,0.117666,Advance Auto Parts,Consumer Discretionary,Consumer Discretionary,Automotive Retail,AAP,6.45,Consumer Discretionary,169.119995,26.22,0.170151
2,AAPL,118.25,90.339996,0.482109,0.324397,Apple Inc.,Information Technology,Information Technology,Computer Hardware,AAPL,8.35,Information Technology,115.82,13.87,-0.527779
3,ABBV,67.389999,51.18,0.302683,0.066063,AbbVie,Health Care,Health Care,Pharmaceuticals,ABBV,3.15,Health Care,62.619999,19.88,-0.33526
4,ABC,103.360001,69.029999,0.310987,0.104973,AmerisourceBergen Corp,Health Care,Health Care,Health Care Distributors,ABC,6.73,Health Care,78.190002,11.62,-0.611455


In [56]:
# Total portfolio weight held in each sector.
sector_weights = {}
for ticker, weight in portfolio.items():
    ticker_rows = fifty_two_week_volatility_pe_df[fifty_two_week_volatility_pe_df.symbol == ticker]
    sector = list(ticker_rows['Sector'])[0]
    sector_weights[sector] = sector_weights.get(sector, 0) + weight

# For every security: the portfolio weight of its sector, or 0 if the
# portfolio holds nothing in that sector.
sector_feature = [
    sector_weights.get(row_sector, 0)
    for row_sector in fifty_two_week_volatility_pe_df['Sector']
]
fifty_two_week_volatility_pe_df['portfolio_sector'] = sector_feature

(0, symbol                                   AAL
high                                   49.64
low                                    25.27
volatility                           1.64203
added_volatility                    0.223905
Name                 American Airlines Group
Sector                           Industrials
GICS Sector                      Industrials
GICS Sub Industry                   Airlines
Ticker                                   AAL
EPS                                    11.39
Industry                         Industrials
Price                                  46.69
PE                                       4.1
std_pe                             -0.842801
Name: 0, dtype: object)
(1, symbol                                  AAP
high                                 176.78
low                                  136.19
volatility                         0.519604
added_volatility                   0.117666
Name                     Advance Auto Parts
Sector               Consumer

Name: 243, dtype: object)
(244, symbol                                               TIF
high                                               85.06
low                                                57.48
volatility                                      0.610814
added_volatility                                0.194251
Name                                       Tiffany & Co.
Sector                            Consumer Discretionary
GICS Sector                       Consumer Discretionary
GICS Sub Industry    Apparel, Accessories & Luxury Goods
Ticker                                               TIF
EPS                                                 3.61
Industry                          Consumer Discretionary
Price                                              77.43
PE                                                 21.45
std_pe                                        -0.0427252
Name: 244, dtype: object)
(245, symbol                                  TJX
high                                 

In [57]:
# Preview the table with the new portfolio_sector column.
fifty_two_week_volatility_pe_df.head()

Unnamed: 0,symbol,high,low,volatility,added_volatility,Name,Sector,GICS Sector,GICS Sub Industry,Ticker,EPS,Industry,Price,PE,std_pe,portfolio_sector
0,AAL,49.639999,25.27,1.642028,0.223905,American Airlines Group,Industrials,Industrials,Airlines,AAL,11.39,Industrials,46.689999,4.1,-0.842801,0.0
1,AAP,176.779999,136.190002,0.519604,0.117666,Advance Auto Parts,Consumer Discretionary,Consumer Discretionary,Automotive Retail,AAP,6.45,Consumer Discretionary,169.119995,26.22,0.170151,0.0
2,AAPL,118.25,90.339996,0.482109,0.324397,Apple Inc.,Information Technology,Information Technology,Computer Hardware,AAPL,8.35,Information Technology,115.82,13.87,-0.527779,1.0
3,ABBV,67.389999,51.18,0.302683,0.066063,AbbVie,Health Care,Health Care,Pharmaceuticals,ABBV,3.15,Health Care,62.619999,19.88,-0.33526,0.0
4,ABC,103.360001,69.029999,0.310987,0.104973,AmerisourceBergen Corp,Health Care,Health Care,Health Care Distributors,ABC,6.73,Health Care,78.190002,11.62,-0.611455,0.0


In [135]:
"""
standardize unstandardized features
"""
# Each feature is turned into (value - mean) / std using whole-column
# arithmetic; Series.std() is the sample standard deviation.
volatility = fifty_two_week_volatility_pe_df['volatility']
vol_mean, vol_std = volatility.mean(), volatility.std()

# The high/low feature is the ratio of the period high to the period low.
high_low_ratio = fifty_two_week_volatility_pe_df['high'] / fifty_two_week_volatility_pe_df['low']
high_low_mean, high_low_std = high_low_ratio.mean(), high_low_ratio.std()

portfolio_sector = fifty_two_week_volatility_pe_df['portfolio_sector']
sector_mean, sector_std = portfolio_sector.mean(), portfolio_sector.std()

fifty_two_week_volatility_pe_df['std_high_low'] = np.array((high_low_ratio - high_low_mean) / high_low_std)
fifty_two_week_volatility_pe_df['std_vol'] = np.array((volatility - vol_mean) / vol_std)
fifty_two_week_volatility_pe_df['std_sector'] = np.array((portfolio_sector - sector_mean) / sector_std)

In [139]:
"""
Now make features (52 week high/low + volatility + portfolio_sector + std_pe) and 
perform Latent Factor Analysis
"""

# Feature columns, in the order they appear in each row of X.
FEATURE_COLUMNS = ['std_high_low', 'std_vol', 'std_pe', 'std_sector']
n_features = len(FEATURE_COLUMNS)

# One row per security, one column per feature.
X = np.array(fifty_two_week_volatility_pe_df[FEATURE_COLUMNS].values, dtype=float)

# Weighted sum of the portfolio holdings' feature vectors. It is accumulated
# directly per holding: going through a dict keyed by the feature tuple would
# silently merge two holdings that share identical feature values.
centroid = {}  # feature tuple -> weight; kept only for inspection
center = [0] * n_features
for symbol, point in zip(fifty_two_week_volatility_pe_df['symbol'], X):
    if symbol not in portfolio:
        continue
    weight = portfolio[symbol]
    centroid[tuple(point)] = weight
    for i in range(n_features):
        center[i] += weight * point[i]

U, s, V = np.linalg.svd(X, full_matrices=True)

# Move the center one unit along the first right-singular vector.
# NOTE(review): the sign of a singular vector is not unique, so whether this
# is center + v or center - v depends on the SVD implementation — confirm
# that this shift is intended.
shifted_center = [V[0][i] + center[i] for i in range(n_features)]

# Squared Euclidean distance of every security to the shifted center.
squared_dist = ((X - np.array(shifted_center)) ** 2).sum(axis=1)

# Row positions ordered from nearest to farthest (sorted() is stable, so ties
# keep their original row order).
dist = sorted(range(len(X)), key=lambda row: squared_dist[row])

# Slicing keeps this safe when fewer than NUM_RANKED securities are available.
# NOTE(review): the portfolio's own holdings are not excluded from the ranking.
rankings = [
    fifty_two_week_volatility_pe_df.iloc[row]['Name']
    for row in dist[:NUM_RANKED]
]
rankings

['Fiserv Inc',
 'Paychex Inc.',
 'Microsoft Corp.',
 'Intuit Inc.',
 'Verisign Inc.',
 'International Business Machines',
 'Apple Inc.',
 'TE Connectivity Ltd.',
 'FLIR Systems',
 'Fidelity National Information Services',
 'Western Union Co',
 'Total System Services',
 'Cisco Systems',
 'Xilinx Inc',
 'Xerox Corp.',
 'Amphenol Corp',
 'KLA-Tencor Corp.',
 'Harris Corporation',
 'Texas Instruments',
 'Global Payments Inc']

symbol                                             PCLN
high                                            1578.13
low                                               973.8
volatility                                     0.754984
added_volatility                               0.211289
Name                                  Priceline.com Inc
Sector                           Consumer Discretionary
GICS Sector                      Consumer Discretionary
GICS Sub Industry    Internet & Direct Marketing Retail
Ticker                                             PCLN
EPS                                               50.09
Industry                         Consumer Discretionary
Price                                           1466.06
PE                                                29.27
std_pe                                         0.306267
portfolio_sector                                      0
std_high_low                                    30.2075
std_vol                                        0

In [142]:
# Singular values returned by np.linalg.svd for the feature matrix X.
s

array([ 21.70365697,  16.02006113,  12.52939227,   9.58890784])

In [141]:
# Weighted sum of the portfolio holdings' feature vectors.
center

[-0.5717030911657921,
 -0.23720232995400908,
 -0.26494165712340217,
 2.355613455783228]