In [1]:
import sys
# Put the parent directory at the front of the module search path so modules
# located there can be imported (presumably where std_func lives — confirm).
sys.path.insert(0, '..')
# Load the autoreload extension in mode 2: all modules are reloaded before a
# cell executes, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Import std_func and register it with autoreload.
%aimport std_func

# Hide warnings (note: this silences every warning category for the session)
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'pattern'

## Estimates from Sample Covariance

The portfolios constructed in this notebook serve as a reference for the portfolios built from cosine similarity estimates and factor model estimates. Here, we simply use the sample mean return and the sample covariance to generate a portfolio for each industry.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the returns table and index it by company name (one row per company).
r_selected = pd.read_csv("data/filtered_r.csv", index_col="name")

# Sample mean return of each company, averaged across the columns of its row.
mu = r_selected.mean(axis=1)

# Company-by-company sample covariance: transpose so that companies become the
# columns, since DataFrame.cov() works column-wise.
cov = r_selected.transpose().cov()

### Perform Mean-Variance Analysis
We will use one industry at a time to generate a minimum-variance portfolio. To demonstrate the mean-variance analysis process, we use the `pharmaceutical preparations` industry, whose SIC code is `2834`. At the end, we show the results for the other industries.

We first get the list of `pharmaceutical preparations` company names and match it against the companies in the returns data. Then, we get the sample mean and sample covariance for this specific industry. Finally, we compute the efficient frontier (the set of optimal portfolios) for the industry and identify the minimum-variance portfolio.

In [4]:
#!pip install dataframe_image

In [5]:
!pip install PyPortfolioOpt







In [6]:
from pypfopt import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import objective_functions
from pypfopt import plotting

ModuleNotFoundError: No module named 'pypfopt'

In [7]:
# Columns of the preprocessed filings table that this notebook needs.
preprocessed_columns = ['reportingDate', 'name', 'CIK',
                        'coDescription_stopwords', 'SIC', 'SIC_desc']
df = pd.read_csv('../data/preprocessed.csv', usecols=preprocessed_columns)

# Index the rows by company name while keeping `name` available as a column.
df = df.set_index('name', drop=False)

In [8]:
# get the names of the companies in the pharmaceutical preparations industry
Pharm = df[df.SIC == 2834]
Pharm_list = Pharm.index

In [9]:
# get the companies name that match return data and business description data
SET = (set(Pharm_list) & set(r_selected.index))
LIST = [*SET, ]

#### Sample Mean for the Pharmaceutical Preparations Industry

In [10]:
# Sample mean returns restricted to the matched pharmaceutical companies.
mu_Pharm = mu.loc[LIST]
mu_Pharm

name
ACADIA PHARMACEUTICALS INC       -0.013054
BIOMARIN PHARMACEUTICAL INC       0.001661
PRESTIGE BRANDS HOLDINGS, INC.   -0.014350
ANI PHARMACEUTICALS INC          -0.000895
EVOKE PHARMA INC                  0.001337
                                    ...   
CATALYST PHARMACEUTICALS, INC.    0.058129
STRONGBRIDGE BIOPHARMA PLC        0.016958
NATURES SUNSHINE PRODUCTS INC     0.003125
HEAT BIOLOGICS, INC.             -0.010232
ACELRX PHARMACEUTICALS INC        0.020838
Length: 124, dtype: float64

#### Sample Covariance for the Pharmaceutical Preparations Industry

In [11]:
# Restrict the covariance matrix to the matched companies on both axes:
# select their columns, transpose, then select their columns again.
pharm_rows = cov[LIST].transpose()
cov_Pharm = pharm_rows.loc[:, LIST]
cov_Pharm

name,ACADIA PHARMACEUTICALS INC,BIOMARIN PHARMACEUTICAL INC,"PRESTIGE BRANDS HOLDINGS, INC.",ANI PHARMACEUTICALS INC,EVOKE PHARMA INC,"CHIASMA, INC",CATABASIS PHARMACEUTICALS INC,"CELLECTAR BIOSCIENCES, INC.",XOMA CORP,CORMEDIX INC.,...,ENDO INTERNATIONAL PLC,"ADVAXIS, INC.","SIERRA ONCOLOGY, INC.","ACTINIUM PHARMACEUTICALS, INC.","GLOBAL BLOOD THERAPEUTICS, INC.","CATALYST PHARMACEUTICALS, INC.",STRONGBRIDGE BIOPHARMA PLC,NATURES SUNSHINE PRODUCTS INC,"HEAT BIOLOGICS, INC.",ACELRX PHARMACEUTICALS INC
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACADIA PHARMACEUTICALS INC,0.025091,0.005883,0.005519,0.003747,0.008868,0.015785,0.010595,0.007735,0.009431,0.013532,...,-0.004392,0.004847,0.006682,0.006830,0.004106,-0.000628,-0.002626,-0.000312,0.004212,0.014184
BIOMARIN PHARMACEUTICAL INC,0.005883,0.007078,0.002762,0.003723,-0.001490,0.001363,0.001731,-0.000337,0.008034,0.003980,...,0.004369,0.006369,0.005543,0.007147,0.002449,-0.001136,0.000501,0.002889,0.003687,0.009018
"PRESTIGE BRANDS HOLDINGS, INC.",0.005519,0.002762,0.007043,0.003118,0.005310,0.003855,-0.002394,0.001340,-0.000104,-0.001771,...,0.005259,0.003874,0.003125,0.008591,-0.000874,0.001227,-0.000254,0.001547,-0.000372,0.005980
ANI PHARMACEUTICALS INC,0.003747,0.003723,0.003118,0.010194,0.002785,-0.001768,0.002830,0.000586,0.008308,0.005124,...,0.000802,0.003035,0.009495,0.004878,-0.001194,0.002672,-0.005220,0.003865,0.001947,0.003685
EVOKE PHARMA INC,0.008868,-0.001490,0.005310,0.002785,0.044894,0.011879,0.005690,0.017206,0.007935,0.020902,...,-0.008416,0.003388,-0.006534,0.006544,0.007635,-0.000906,-0.010274,-0.008648,-0.001646,0.001968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CATALYST PHARMACEUTICALS, INC.",-0.000628,-0.001136,0.001227,0.002672,-0.000906,0.002478,0.017590,0.008495,0.013079,-0.000194,...,0.003559,0.016339,0.012708,0.006623,0.005350,0.049876,0.019141,0.007510,0.024567,0.007830
STRONGBRIDGE BIOPHARMA PLC,-0.002626,0.000501,-0.000254,-0.005220,-0.010274,0.001311,0.012113,0.012295,0.007632,-0.015399,...,0.000747,0.000674,0.003890,-0.002259,0.014406,0.019141,0.044082,0.004545,-0.001532,0.014582
NATURES SUNSHINE PRODUCTS INC,-0.000312,0.002889,0.001547,0.003865,-0.008648,0.000256,0.007587,-0.001865,-0.003559,-0.011241,...,0.008621,0.004654,0.001700,0.003263,0.000219,0.007510,0.004545,0.018088,0.010699,0.007317
"HEAT BIOLOGICS, INC.",0.004212,0.003687,-0.000372,0.001947,-0.001646,0.000472,0.023285,-0.001913,0.015210,0.015341,...,0.001185,0.031221,-0.009181,0.014973,0.005784,0.024567,-0.001532,0.010699,0.114481,0.008267


#### Efficient Frontier - Pharmaceutical Preparations

In [12]:
fig, ax = plt.subplots()

# Draw the efficient frontier together with the individual assets.
# Each asset weight is bounded to the range [0, 0.2].
ef1 = EfficientFrontier(mu_Pharm, cov_Pharm, weight_bounds=(0, 0.2))
plotting.plot_efficient_frontier(ef1, ax=ax, show_assets=True)

# Solve the minimum-volatility portfolio on a second optimiser built from the
# same inputs, then mark its (volatility, return) point on the plot.
ef2 = EfficientFrontier(mu_Pharm, cov_Pharm, weight_bounds=(0, 0.2))
ef2.min_volatility()
min_vol_return, min_vol_std, _ = ef2.portfolio_performance()
ax.scatter(min_vol_std, min_vol_return, marker="*", s=100, c="r", label="Min Volatility")

# Title and legend, then save the figure to disk and display it.
ax.set_title("Efficient Frontier - Pharmaceutical Preparations \n Sample Covariance Estimates")
ax.legend()
plt.tight_layout()
plt.savefig('images/Efficient_Frontier_Returns.png', dpi=200, bbox_inches='tight')
plt.show()

NameError: name 'EfficientFrontier' is not defined

##### Min Volatility Portfolio

###### Performance

In [13]:
# Print the performance figures of the solved minimum-volatility portfolio
# (verbose=True); the trailing semicolon keeps the notebook from also echoing
# the returned tuple.
ef2.portfolio_performance(verbose=True);

NameError: name 'ef2' is not defined

###### Weights

In [14]:
# Keep only the holdings whose cleaned weight is non-zero, preserving the
# order in which clean_weights() reports them.
nonzero_weights = {
    company: weight
    for company, weight in ef2.clean_weights().items()
    if weight != 0
}

# Two-column table of the selected companies and their portfolio weights.
min_vol = pd.DataFrame({
    'Company_Name': list(nonzero_weights.keys()),
    'Weight': list(nonzero_weights.values()),
})

NameError: name 'ef2' is not defined

In [15]:
# Show the saved minimum-volatility weights. The CSV was written with its row
# index, so read the first column back as the index instead of letting it
# surface as a spurious "Unnamed: 0" data column.
pd.read_csv("data/min_vol_sample_Pharmaceutical_Preparations.csv", index_col=0)

Unnamed: 0,Company_Name,Weight
0,"CHEMBIO DIAGNOSTICS, INC.",0.02843
1,JOHNSON & JOHNSON,0.17878
2,BIODELIVERY SCIENCES INTERNATIONAL INC,0.0368
3,"PROPHASE LABS, INC.",0.0512
4,ORAMED PHARMACEUTICALS INC.,0.04982
5,XENON PHARMACEUTICALS INC.,0.0135
6,BRISTOL MYERS SQUIBB CO,0.12824
7,PFENEX INC.,0.03777
8,ACHAOGEN INC,0.00515
9,ZOETIS INC.,0.01195


### Results for the Other 4 Industries

In [16]:
# (SIC code, SIC description) pairs for the remaining industries.
_INDUSTRIES = [
    (7372, 'Prepackaged Software (mass reproduction of software)'),
    (1311, 'Crude Petroleum and Natural Gas'),
    (6798, 'Real Estate Investment Trusts'),
    (6022, 'State Commercial Banks (commercial banking)'),
]

# Parallel lists: SIC_desc[i] is the description of SIC_list[i].
SIC_list = [code for code, _ in _INDUSTRIES]
SIC_desc = [desc for _, desc in _INDUSTRIES]

#### Prepackaged Software (mass reproduction of software)

In [17]:
def plot_min_vol_frontier(sic, industry_name, max_mean_return=1, weight_bounds=(0, 0.2)):
    """Plot one industry's sample-estimate efficient frontier and solve its
    minimum-volatility portfolio.

    Reads the notebook-level ``df``, ``r_selected``, ``mu`` and ``cov``.

    Parameters
    ----------
    sic : int
        SIC code used to select the industry's companies from ``df``.
    industry_name : str
        Label used in the plot title and in the saved figure's file name.
    max_mean_return : float, default 1
        Companies whose sample mean return exceeds this value are dropped as
        outliers before optimising.
    weight_bounds : tuple, default (0, 0.2)
        (min, max) weight per asset, passed to ``EfficientFrontier``.

    Returns
    -------
    tuple
        ``(mu_sample, cov_sample, ef2)``: the sample means and covariance that
        were used, and the optimiser on which ``min_volatility()`` was solved.
    """
    # Companies of this industry that appear in both the description data and
    # the return data. Sorted so the asset order is reproducible: iteration
    # order of a set of strings can change between kernel sessions.
    names = sorted(set(df[df.SIC == sic].index) & set(r_selected.index))

    # Sample means, with outliers (mean return above the threshold) removed.
    mu_sample = mu[names]
    outlier = mu_sample[mu_sample > max_mean_return].index
    mu_sample = mu_sample.drop(outlier)

    # Covariance restricted to the kept companies on both axes (cov is the
    # symmetric company-by-company matrix, so one .loc selection suffices).
    kept = mu_sample.index
    cov_sample = cov.loc[kept, kept]

    # Draw the frontier and the individual assets on one optimiser instance.
    ef1 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=weight_bounds)
    fig, ax = plt.subplots()
    plotting.plot_efficient_frontier(ef1, ax=ax, show_assets=True)

    # Solve the minimum-volatility portfolio on a second instance built from
    # the same inputs and mark it on the plot.
    ef2 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=weight_bounds)
    ef2.min_volatility()
    ret_min_vol, std_min_vol, _ = ef2.portfolio_performance()
    ax.scatter(std_min_vol, ret_min_vol, marker="*", s=100, c="r", label="Min Volatility")

    # Format, save and show the figure.
    ax.set_title("Efficient Frontier - %s \n Sample Covariance Estimates" % industry_name)
    ax.legend()
    plt.tight_layout()
    plt.savefig('images/Efficient_Frontier_Sample_Covariance_Estimates' + str(industry_name) + '.png', dpi=200, bbox_inches='tight')
    plt.show()

    return mu_sample, cov_sample, ef2


SIC = SIC_list[0]
industry_name = SIC_desc[SIC_list.index(SIC)]

# ef2 is kept at notebook level because the following cells report on it.
mu_sample, cov_sample, ef2 = plot_min_vol_frontier(SIC, industry_name)

NameError: name 'EfficientFrontier' is not defined

##### Min Volatility Portfolio

###### Performance

In [18]:
# Print the performance figures of the solved minimum-volatility portfolio
# (verbose=True); the trailing semicolon keeps the notebook from also echoing
# the returned tuple.
ef2.portfolio_performance(verbose=True);

NameError: name 'ef2' is not defined

###### Weights

In [19]:
# Show the saved minimum-volatility weights. The CSV was written with its row
# index, so read the first column back as the index instead of letting it
# surface as a spurious "Unnamed: 0" data column.
pd.read_csv("data/min_vol_sample_Prepackaged_Software.csv", index_col=0)

Unnamed: 0,Company_Name,Weight
0,MAJESCO,0.01906
1,"NUANCE COMMUNICATIONS, INC.",0.08608
2,AWARE INC /MA/,0.2
3,MICROSTRATEGY INC,0.0216
4,QUMU CORP,0.05153
5,ROSETTA STONE INC,0.01647
6,COMMVAULT SYSTEMS INC,0.07381
7,"ENDURANCE INTERNATIONAL GROUP HOLDINGS, INC.",0.02554
8,"QUALYS, INC.",0.06668
9,LIVEPERSON INC,0.01519


#### Crude Petroleum and Natural Gas
When we conduct the same analysis for this industry, no weights are produced: the efficient frontier cannot be found.

#### Real Estate Investment Trusts

In [20]:
# Industry handled in this section, with its description for titles/file names.
SIC = SIC_list[2]
industry_name = SIC_desc[SIC_list.index(SIC)]

# Companies filed under this SIC code; df is indexed by company name.
Companies = df.loc[df['SIC'] == SIC]
Company_list = Companies.index

# Names present in both the description data and the return data.
SET = set(Company_list) & set(r_selected.index)
LIST = list(SET)

# Sample means, after dropping outliers whose mean return exceeds 1.
mu_sample = mu.loc[LIST]
outlier = mu_sample.loc[mu_sample > 1].index
mu_sample = mu_sample.drop(outlier)
LIST = mu_sample.index

# Covariance restricted to the remaining companies on both axes.
cov_sample = cov[LIST].transpose()[LIST]

# Draw the efficient frontier and the individual assets; each asset weight is
# bounded to the range [0, 0.2].
fig, ax = plt.subplots()
ef1 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=(0, 0.2))
plotting.plot_efficient_frontier(ef1, ax=ax, show_assets=True)

# Solve the minimum-volatility portfolio on a second optimiser built from the
# same inputs, then mark its (volatility, return) point on the plot.
ef2 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=(0, 0.2))
ef2.min_volatility()
min_vol_return, min_vol_std, _ = ef2.portfolio_performance()
ax.scatter(min_vol_std, min_vol_return, marker="*", s=100, c="r", label="Min Volatility")

# Title and legend, then save the figure to disk and display it.
ax.set_title("Efficient Frontier - %s \n Sample Covariance Estimates" % (industry_name,))
ax.legend()
plt.tight_layout()
figure_path = 'images/Efficient_Frontier_Sample_Covariance_Estimates' + str(industry_name) + '.png'
plt.savefig(figure_path, dpi=200, bbox_inches='tight')
plt.show()

NameError: name 'EfficientFrontier' is not defined

##### Min Volatility Portfolio

###### Performance

In [21]:
# Print the performance figures of the solved minimum-volatility portfolio
# (verbose=True); the trailing semicolon keeps the notebook from also echoing
# the returned tuple.
ef2.portfolio_performance(verbose=True);

NameError: name 'ef2' is not defined

###### Weights

In [22]:
# Show the saved minimum-volatility weights. The CSV was written with its row
# index, so read the first column back as the index instead of letting it
# surface as a spurious "Unnamed: 0" data column.
pd.read_csv("data/min_vol_sample_Real_Estate_Investment_Trusts.csv", index_col=0)

Unnamed: 0,Company_Name,Weight
0,BRT APARTMENTS CORP.,0.01724
1,PUBLIC STORAGE,0.10938
2,GREAT AJAX CORP.,0.2
3,ALEXANDERS INC,0.02285
4,LADDER CAPITAL CORP,0.0442
5,CIM COMMERCIAL TRUST CORP,0.05461
6,ARES COMMERCIAL REAL ESTATE CORP,0.09107
7,"MANHATTAN BRIDGE CAPITAL, INC",0.01483
8,HMG COURTLAND PROPERTIES INC,0.12513
9,NEW YORK MORTGAGE TRUST INC,0.02084


#### State Commercial Banks (commercial banking)

In [23]:
# Industry handled in this section, with its description for titles/file names.
SIC = SIC_list[3]
industry_name = SIC_desc[SIC_list.index(SIC)]

# Companies filed under this SIC code; df is indexed by company name.
Companies = df.loc[df['SIC'] == SIC]
Company_list = Companies.index

# Names present in both the description data and the return data.
SET = set(Company_list) & set(r_selected.index)
LIST = list(SET)

# Sample means, after dropping outliers whose mean return exceeds 1.
mu_sample = mu.loc[LIST]
outlier = mu_sample.loc[mu_sample > 1].index
mu_sample = mu_sample.drop(outlier)
LIST = mu_sample.index

# Covariance restricted to the remaining companies on both axes.
cov_sample = cov[LIST].transpose()[LIST]

# Draw the efficient frontier and the individual assets; each asset weight is
# bounded to the range [0, 0.2].
fig, ax = plt.subplots()
ef1 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=(0, 0.2))
plotting.plot_efficient_frontier(ef1, ax=ax, show_assets=True)

# Solve the minimum-volatility portfolio on a second optimiser built from the
# same inputs, then mark its (volatility, return) point on the plot.
ef2 = EfficientFrontier(mu_sample, cov_sample, weight_bounds=(0, 0.2))
ef2.min_volatility()
min_vol_return, min_vol_std, _ = ef2.portfolio_performance()
ax.scatter(min_vol_std, min_vol_return, marker="*", s=100, c="r", label="Min Volatility")

# Title and legend, then save the figure to disk and display it.
ax.set_title("Efficient Frontier - %s \n Sample Covariance Estimates" % (industry_name,))
ax.legend()
plt.tight_layout()
figure_path = 'images/Efficient_Frontier_Sample_Covariance_Estimates' + str(industry_name) + '.png'
plt.savefig(figure_path, dpi=200, bbox_inches='tight')
plt.show()

NameError: name 'EfficientFrontier' is not defined

##### Min Volatility Portfolio

###### Performance

In [24]:
# Print the performance figures of the solved minimum-volatility portfolio
# (verbose=True); the trailing semicolon keeps the notebook from also echoing
# the returned tuple.
ef2.portfolio_performance(verbose=True);

NameError: name 'ef2' is not defined

###### Weights

In [25]:
# Show the saved minimum-volatility weights. The CSV was written with its row
# index, so read the first column back as the index instead of letting it
# surface as a spurious "Unnamed: 0" data column.
pd.read_csv("data/min_vol_sample_State_Commercial_Banks.csv", index_col=0)

Unnamed: 0,Company_Name,Weight
0,"RELIANT BANCORP, INC.",0.12273
1,"CAROLINA TRUST BANCSHARES, INC.",0.11786
2,INVESTAR HOLDING CORP,0.1944
3,FIRST COMMUNITY CORP /SC/,0.05076
4,VILLAGE BANK & TRUST FINANCIAL CORP.,0.13994
5,BANK OF THE JAMES FINANCIAL GROUP INC,0.00178
6,CITIZENS & NORTHERN CORP,0.05375
7,BANK OF NEW YORK MELLON CORP,0.09533
8,"FAUQUIER BANKSHARES, INC.",0.02143
9,MACKINAC FINANCIAL CORP /MI/,0.02478
