In [38]:
"""
AUTHOR: Pranav Parthasarathy
FUNCTION: Intended to combine all features into one notebook, perform PCA on these features
"""
import pandas as pd
import numpy as np
import urllib as u
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import lxml

In [15]:
constituents = pd.read_csv("constituents.csv")
fundamentals = pd.read_csv("fundamentals.csv")
prices_split_adjusted = pd.read_csv("prices-split-adjusted.csv")
securities = pd.read_csv("securities.csv")

In [16]:
# sample portfolio
portfolio = {'AAPL': 0.5, 'MSFT': 0.5}

In [17]:
fundamentals = fundamentals[['Ticker Symbol']]
universe = set(securities['Ticker symbol']) \
         & set(prices_split_adjusted['symbol']) \
         & set(fundamentals['Ticker Symbol']) \
         & set(constituents['Symbol'])

In [18]:
def historical_volatility(df):
    p = np.array(df.close)
    lr = np.log(p[1:]) - np.log(p[:-1])
    return np.sum(np.square(lr))
    
def covariance(df1, df2):
    date1 = np.array(df1.date)
    date2 = np.array(df2.date)
    start = max(date1[0], date2[0])
    end = min(date1[-1], date2[-1])
    p1 = np.array(df1[(start <= df1.date) & (df1.date <= end)].close)
    p2 = np.array(df2[(start <= df2.date) & (df2.date <= end)].close)
    lr1 = np.log(p1[1:]) - np.log(p1[:-1])
    lr2 = np.log(p2[1:]) - np.log(p2[:-1])
    return np.sum(lr1*lr2)

In [19]:
vol = []
portfolio_prices = {s: prices_split_adjusted[prices_split_adjusted.symbol == s].sort_values(['date'], ascending=[True]) for s in portfolio.keys()}
for s in universe:
    df = prices_split_adjusted[prices_split_adjusted.symbol == s]
    df.sort_values(['date'], ascending=[True], inplace=True)
    vol.append({
        'symbol': s,
        'volatility': historical_volatility(df),
        'added_volatility': sum(w * covariance(df, portfolio_prices[s]) for s, w in portfolio.items()),
    })
vol = pd.DataFrame(vol, columns = ['symbol', 'volatility', 'added_volatility'])
vol

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,symbol,volatility,added_volatility
0,SHW,0.342774,0.124631
1,HRS,0.410688,0.177227
2,RCL,1.039588,0.259811
3,OKE,0.745463,0.164638
4,RSG,0.239195,0.119062
5,VNO,0.358398,0.154499
6,EW,0.751582,0.135078
7,HIG,0.765036,0.232777
8,BWA,0.752653,0.229012
9,GWW,0.356508,0.144279


In [20]:
securities.columns

Index(['Ticker symbol', 'Security', 'SEC filings', 'GICS Sector',
       'GICS Sub Industry', 'Address of Headquarters', 'Date first added',
       'CIK'],
      dtype='object')

In [11]:
securities = securities[['Ticker symbol', 'GICS Sector', 'GICS Sub Industry']]

In [21]:
def merge(keep_key, dfs_by_key):
    merged = dfs_by_key[0][1].copy()
    for k, df in dfs_by_key[1:]:
        merged = merged.merge(df, left_on=dfs_by_key[0][0], right_on=k, how='inner')
        if k != keep_key:
            del merged[k]
    if dfs_by_key[0][0] != keep_key:
        del merged[dfs_by_key[0][0]]
    return merged

In [25]:
df = merge('symbol', [('symbol', vol), ('Symbol', constituents), ('Ticker symbol', securities)])
df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK
0,SHW,0.342774,0.124631,Sherwin-Williams,Materials,Sherwin-Williams,reports,Materials,Specialty Chemicals,"Cleveland, Ohio",1964-06-30,89800
1,HRS,0.410688,0.177227,Harris Corporation,Information Technology,Harris Corporation,reports,Information Technology,Telecommunications Equipment,"Melbourne, Florida",2008-09-22,202058
2,RCL,1.039588,0.259811,Royal Caribbean Cruises Ltd,Consumer Discretionary,Royal Caribbean Cruises Ltd,reports,Consumer Discretionary,"Hotels, Resorts & Cruise Lines","Miami, Florida",2014-12-05,884887
3,OKE,0.745463,0.164638,ONEOK,Energy,ONEOK,reports,Energy,Oil & Gas Exploration & Production,"Tulsa, Oklahoma",2010-03-15,1039684
4,RSG,0.239195,0.119062,Republic Services Inc,Industrials,Republic Services Inc,reports,Industrials,Industrial Conglomerates,"Phoenix, Arizona",2008-12-05,1060391


In [48]:
volatility_df = df

In [51]:
df_sec = pd.read_csv("securities.csv")
df_fund = pd.read_csv("fundamentals.csv")
df_price = pd.read_csv("prices.csv")

df_price = df_price[['symbol','close']].drop_duplicates(subset='symbol', keep = "last")
df = pd.DataFrame()
df["Ticker"] = df_fund["Ticker Symbol"]
df["EPS"] = df_fund['Earnings Per Share']
df = df.drop_duplicates(subset='Ticker', keep = "last")
df["Industry"] = np.nan
df["Price"] = np.nan

In [52]:
for x in df_sec.iterrows():
    truth = df["Ticker"]==x[1]['Ticker symbol']
    try:
        df.loc[df.loc[truth]["Ticker"].index.values[0],'Industry'] = x[1]['GICS Sector']
    except:
        pass

for x in df_price.iterrows():
    truth = df["Ticker"]==x[1]['symbol']
    try:
        df.loc[df.loc[truth]["Ticker"].index.values[0],'Price'] = x[1]['close']
    except:
        pass


df = df.drop(df[df.EPS < 0].index)
df["PE"] = round(df['Price']/df['EPS'], 2)
df = df.dropna()

# Calculate industry averages
industries_dict = {sector: np.mean(df[df.Industry == sector].PE.values) for sector in df.Industry.unique()}
df.head()

Unnamed: 0,Ticker,EPS,Industry,Price,PE
3,AAL,11.39,Industrials,46.689999,4.1
7,AAP,6.45,Consumer Discretionary,169.119995,26.22
11,AAPL,8.35,Information Technology,115.82,13.87
15,ABBV,3.15,Health Care,62.619999,19.88
19,ABC,6.73,Health Care,78.190002,11.62


In [56]:
volatility_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK,Price
0,SHW,0.342774,0.124631,Sherwin-Williams,Materials,Sherwin-Williams,reports,Materials,Specialty Chemicals,"Cleveland, Ohio",1964-06-30,89800.0,
1,HRS,0.410688,0.177227,Harris Corporation,Information Technology,Harris Corporation,reports,Information Technology,Telecommunications Equipment,"Melbourne, Florida",2008-09-22,202058.0,
2,RCL,1.039588,0.259811,Royal Caribbean Cruises Ltd,Consumer Discretionary,Royal Caribbean Cruises Ltd,reports,Consumer Discretionary,"Hotels, Resorts & Cruise Lines","Miami, Florida",2014-12-05,884887.0,
3,OKE,0.745463,0.164638,ONEOK,Energy,ONEOK,reports,Energy,Oil & Gas Exploration & Production,"Tulsa, Oklahoma",2010-03-15,1039684.0,46.689999
4,RSG,0.239195,0.119062,Republic Services Inc,Industrials,Republic Services Inc,reports,Industrials,Industrial Conglomerates,"Phoenix, Arizona",2008-12-05,1060391.0,


In [62]:
volatility_pe_df = volatility_df.merge(df, how='inner', left_on=['symbol'], right_on=['Ticker'])

In [63]:
volatility_pe_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK,Price_x,Ticker,EPS,Industry,Price_y,PE
0,SHW,0.342774,0.124631,Sherwin-Williams,Materials,Sherwin-Williams,reports,Materials,Specialty Chemicals,"Cleveland, Ohio",1964-06-30,89800.0,,SHW,11.38,Materials,268.73999,23.62
1,HRS,0.410688,0.177227,Harris Corporation,Information Technology,Harris Corporation,reports,Information Technology,Telecommunications Equipment,"Melbourne, Florida",2008-09-22,202058.0,,HRS,2.61,Information Technology,102.470001,39.26
2,RCL,1.039588,0.259811,Royal Caribbean Cruises Ltd,Consumer Discretionary,Royal Caribbean Cruises Ltd,reports,Consumer Discretionary,"Hotels, Resorts & Cruise Lines","Miami, Florida",2014-12-05,884887.0,,RCL,3.03,Consumer Discretionary,82.040001,27.08
3,OKE,0.745463,0.164638,ONEOK,Energy,ONEOK,reports,Energy,Oil & Gas Exploration & Production,"Tulsa, Oklahoma",2010-03-15,1039684.0,46.689999,OKE,1.17,Energy,57.41,49.07
4,EW,0.751582,0.135078,Edwards Lifesciences,Health Care,Edwards Lifesciences,reports,Health Care,Health Care Equipment,"Irvine, California",2011-04-01,1099800.0,,EW,2.3,Health Care,93.699997,40.74


In [71]:
def get_standarized_PE(ticker, pe= None, industry= None):
    if not pe or not industry:
        assert ticker in df.Ticker.values, "Please provide price and Industry"
        data = df.loc[df["Ticker"] == ticker]
        indus = data["Industry"].values[0]
        PE = data["PE"].values[0]
        avg = industries_dict[indus]
        return (PE - avg)/avg
    else:
        assert industry in industries_sect.keyes(), "You're sector needs to be a GICS Sector"
        avg = industries_dict[industry]
        return (pe - avg)/avg

In [73]:
col = []
for ticker in volatility_pe_df['symbol']:
    col.append(get_standarized_PE(ticker))
volatility_pe_df['std_pe'] = np.array(col)

In [76]:
volatility_pe_df.head()

Unnamed: 0,symbol,volatility,added_volatility,Name,Sector,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK,Price_x,Ticker,EPS,Industry,Price_y,PE,std_pe
0,SHW,0.342774,0.124631,Sherwin-Williams,Materials,Sherwin-Williams,reports,Materials,Specialty Chemicals,"Cleveland, Ohio",1964-06-30,89800.0,,SHW,11.38,Materials,268.73999,23.62,-0.299272
1,HRS,0.410688,0.177227,Harris Corporation,Information Technology,Harris Corporation,reports,Information Technology,Telecommunications Equipment,"Melbourne, Florida",2008-09-22,202058.0,,HRS,2.61,Information Technology,102.470001,39.26,0.336655
2,RCL,1.039588,0.259811,Royal Caribbean Cruises Ltd,Consumer Discretionary,Royal Caribbean Cruises Ltd,reports,Consumer Discretionary,"Hotels, Resorts & Cruise Lines","Miami, Florida",2014-12-05,884887.0,,RCL,3.03,Consumer Discretionary,82.040001,27.08,0.208532
3,OKE,0.745463,0.164638,ONEOK,Energy,ONEOK,reports,Energy,Oil & Gas Exploration & Production,"Tulsa, Oklahoma",2010-03-15,1039684.0,46.689999,OKE,1.17,Energy,57.41,49.07,-0.187852
4,EW,0.751582,0.135078,Edwards Lifesciences,Health Care,Edwards Lifesciences,reports,Health Care,Health Care Equipment,"Irvine, California",2011-04-01,1099800.0,,EW,2.3,Health Care,93.699997,40.74,0.362248
