In [1]:
import sys
sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 2
%aimport std_func

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Mean-Variance Analysis - Minimum Variance

Mean-variance analysis is a mathematical framework that examplifies the trade-off between return and risk. It is used to create diversified portfolios based on investors’ expectation. There are one main approach used in this report. We have the minimum volatility portfolio that concentrates on minimizing the risk of the portfolio. Mimimum variance portfolio can help us compare the correlation of simple sample covariance, covariance generated using cosine similarity distances and covariance generated using factor model in Sent-LDA.

### Monthly Returns
Since we will generate cosine similarity estimates in the next notebook using business description of companies from 2016 to 2018. We only consider monthly returns before 2019.

In [3]:
#returns_data = pd.read_csv('../data/returns_2018_top5_SIC.csv', 
#                      parse_dates = ['DATE'], 
#                      usecols = ["trt1m", "CIK", "name", "DATE"])

In [4]:
# TAKE OUT THE DATA before 2019 and reset index
#end_date = pd.to_datetime('2019-01-01')
#returns = returns_data.loc[returns_data['DATE'] < end_date]

# drop the missing value
#returns = returns.dropna()
# group the data by name and date
#group_returns = returns.groupby(['name', 'DATE'])
# calculate the percentage return
#group_returns_pct = group_returns.agg({"trt1m": (lambda x:  x/100)})
#r_selected = group_returns_pct.unstack()
# drop the first index
#r_selected.columns = r_selected.columns.droplevel(0)

In [5]:
# there are some rows containing more than one float number (array of multiple returns)
#for i in range(len(r_selected)):
#    if type(r_selected.iloc[i,1]) == np.ndarray:
#        r_selected = r_selected.drop(index = r_selected.index[i])
# convert object to float
#r_selected = r_selected.astype(float)
# drop missing values rows
#r_selected = r_selected.dropna()
#r_selected.to_csv("filtered_r.csv")
# get the mean of all 
#mu = r_selected.mean(axis = 1)
# compute the covariance matrix 
#cov = r_selected.T.cov()

In [6]:
r_selected = pd.read_csv("filtered_r.csv")
# get the mean of all 
r_selected.set_index("name", inplace = True)
mu = r_selected.mean(axis = 1)
# compute the covariance matrix 
cov = r_selected.T.cov()

In [7]:
## Cosine Similarity

In [8]:
df = pd.read_csv('../data/preprocessed.csv',
                 usecols = ['reportingDate', 'name', 'CIK',
                           'coDescription_stopwords', 'SIC', 'SIC_desc'])
df = df.set_index(df.name)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

Vectorizer = CountVectorizer(ngram_range = (2,4), 
                             max_features = 600)

count_data = Vectorizer.fit_transform(df['coDescription_stopwords'])
wordsCount = pd.DataFrame(count_data.toarray(),columns=Vectorizer.get_feature_names_out())
wordsCount = wordsCount.set_index(df['name'])

In [10]:
#### Cosine Similarity Computation

In [11]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = pd.DataFrame(cosine_similarity(wordsCount, wordsCount))
cosine_sim = cosine_sim.set_index(df['name'])
cosine_sim.columns = df['name']
#cosine_sim

### Perform Mean-Variance Analysis

In [12]:
from pypfopt import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import objective_functions
from pypfopt import plotting

In [13]:
# get the names of the companies in the pharmaceutical preparations industry
Pharm = df[df.SIC == 2834]
Pharm_list = Pharm.index

In [14]:
# get the companies name that match return data and business description data
SET = (set(Pharm_list) & set(r_selected.index))
LIST = [*SET, ]

#### Mean and Covariance for Returns in the Pharmaceutical Preparations Industry

In [15]:
mu_Pharm = mu[LIST]
mu_Pharm

name
EYEGATE PHARMACEUTICALS INC      -0.034348
IMMUNOGEN INC                     0.021513
PULMATRIX, INC.                   0.009480
XENCOR INC                        0.040986
HORIZON PHARMA PLC                0.011321
                                    ...   
AQUINOX PHARMACEUTICALS, INC     -0.004622
FIVE PRIME THERAPEUTICS INC      -0.038194
ULTRAGENYX PHARMACEUTICAL INC.   -0.000979
ENDO INTERNATIONAL PLC           -0.006443
RECRO PHARMA, INC.                0.008656
Length: 124, dtype: float64

In [16]:
tmp = cov[LIST].T
cov_Pharm = tmp[LIST]
cov_Pharm

name,EYEGATE PHARMACEUTICALS INC,IMMUNOGEN INC,"PULMATRIX, INC.",XENCOR INC,HORIZON PHARMA PLC,LILLY ELI & CO,"ALNYLAM PHARMACEUTICALS, INC.","CHEMBIO DIAGNOSTICS, INC.","IMPRIMIS PHARMACEUTICALS, INC.",LIGAND PHARMACEUTICALS INC,...,"HERON THERAPEUTICS, INC. /DE/",CELGENE CORP /DE/,IRONWOOD PHARMACEUTICALS INC,XENON PHARMACEUTICALS INC.,OPHTHOTECH CORP.,"AQUINOX PHARMACEUTICALS, INC",FIVE PRIME THERAPEUTICS INC,ULTRAGENYX PHARMACEUTICAL INC.,ENDO INTERNATIONAL PLC,"RECRO PHARMA, INC."
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EYEGATE PHARMACEUTICALS INC,0.067765,0.008265,0.036116,0.003707,0.000763,0.005505,0.014804,0.001487,0.014236,-0.003018,...,0.006248,0.004158,0.010305,-0.005188,-0.011197,-0.002083,0.002977,0.012935,0.001302,0.000540
IMMUNOGEN INC,0.008265,0.054333,0.031164,0.002613,0.005660,0.001418,0.017972,0.002480,-0.000511,0.010364,...,0.009876,0.008285,0.008481,0.000785,0.005605,0.016301,0.003954,0.016139,-0.003940,0.001424
"PULMATRIX, INC.",0.036116,0.031164,0.306222,-0.011031,-0.001549,0.009467,0.020250,-0.012933,-0.007801,0.005065,...,-0.002842,0.003889,-0.000049,0.002778,0.001751,0.008470,0.002333,0.014681,-0.009912,-0.012627
XENCOR INC,0.003707,0.002613,-0.011031,0.022963,0.007579,0.001555,0.002949,0.004722,-0.001048,0.004592,...,0.002171,0.002003,0.006714,-0.000965,0.002433,0.011319,0.005985,0.011092,0.001781,-0.003559
HORIZON PHARMA PLC,0.000763,0.005660,-0.001549,0.007579,0.014189,0.000764,0.006950,0.000456,0.001518,0.007058,...,0.004239,0.003863,0.001926,0.004863,0.013270,0.006688,0.007463,0.013208,-0.000178,-0.000387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"AQUINOX PHARMACEUTICALS, INC",-0.002083,0.016301,0.008470,0.011319,0.006688,0.000891,0.009848,-0.001749,-0.000662,0.003048,...,-0.002289,0.006112,0.004297,-0.008879,-0.000505,0.044662,0.009508,0.015377,-0.008580,0.002959
FIVE PRIME THERAPEUTICS INC,0.002977,0.003954,0.002333,0.005985,0.007463,0.000214,0.007634,-0.002594,-0.010837,0.007070,...,-0.003428,0.003060,0.004857,0.006868,0.003760,0.009508,0.022705,0.009038,-0.004608,0.000236
ULTRAGENYX PHARMACEUTICAL INC.,0.012935,0.016139,0.014681,0.011092,0.013208,0.001649,0.015535,0.007680,-0.004799,0.012549,...,0.010569,0.008968,0.012150,0.012548,0.009574,0.015377,0.009038,0.031959,0.006636,-0.001879
ENDO INTERNATIONAL PLC,0.001302,-0.003940,-0.009912,0.001781,-0.000178,0.003087,0.006411,0.010691,0.000449,0.002724,...,0.004021,0.004700,0.004758,0.021544,0.004510,-0.008580,-0.004608,0.006636,0.037921,-0.000460


In [17]:
#### Cosine Similarity Distances in the Pharmaceutical Preparations Industry

In [18]:
tmp = cosine_sim[LIST].drop_duplicates().T
Pharm_cos_sim = tmp[LIST].drop_duplicates()

In [19]:
#### Covariance for Cosine Similarity

In [20]:
cos_sim_sd = pd.DataFrame(np.sqrt(np.diag(np.diagonal(cov_Pharm))))
cos_sim_sd = cos_sim_sd.set_index(cov_Pharm.index)
cos_sim_sd.columns = cov_Pharm.index

In [21]:
cos_sim_cov = pd.DataFrame((np.dot(np.dot(cos_sim_sd, Pharm_cos_sim),cos_sim_sd)))