# Universal Sentence Encoder
The Universal Sentence Encoder encodes text into high-dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language tasks. It is trained on a variety of data sources to learn for a wide variety of tasks. The sources are Wikipedia, web news, web question-answer pages, and discussion forums. The input is a variable-length English text and the output is a 512-dimensional vector.

In [41]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import re
import string
import nltk

In [42]:
df = pd.read_json('bq_2018_top5SIC.json', lines=True)
df.drop_duplicates(subset = "name", keep=False, inplace=True)
df

Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc
0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,Item 1. BusinessOverviewMongoDB is the leading...,1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...
1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,ITEM 1. BUSINESSOverviewSalesforce is a global...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
2,0001564590-18-006986,2018-03-28 21:27:30 UTC,2018-02-01,financialEntities/params;cik=1385867,https://www.sec.gov/Archives/edgar/data/138586...,"Item 1.Business1<p style=""margin-bottom:0pt;ma...",1385867,COUPA SOFTWARE INC,US,SAN MATEO,7372,Prepackaged Software (mass reproduction of sof...
3,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,Item 1. BusinessOverviewSplunk provides innov...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
4,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,Item 1. BusinessOverview Okta is the leading i...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
...,...,...,...,...,...,...,...,...,...,...,...,...
1122,0001047469-19-001136,2019-03-12 20:29:52 UTC,2019-01-01,financialEntities/params;cik=1261249,https://www.sec.gov/Archives/edgar/data/126124...,Item 1. Business 3 <FONT SIZE=2,1261249,AGILE THERAPEUTICS INC,US,PRINCETON,2834,Pharmaceutical Preparations
1123,0001628280-19-001771,2019-02-22 22:02:40 UTC,2019-01-01,financialEntities/params;cik=1101239,https://www.sec.gov/Archives/edgar/data/110123...,"ITEM 1.BUSINESSThe words ""Equinix"", ""we"", ""our...",1101239,EQUINIX INC,US,REDWOOD CITY,6798,Real Estate Investment Trusts
1124,0001558370-19-001346,2019-02-28 22:04:41 UTC,2019-01-01,financialEntities/params;cik=1583107,https://www.sec.gov/Archives/edgar/data/158310...,"Item 1. Business 4<font style=""display:inline;""",1583107,"THERAVANCE BIOPHARMA, INC.",KY,"GEORGE TOWN, GRAND CAYMAN",2834,Pharmaceutical Preparations
1125,0001601545-19-000016,2019-03-08 14:53:38 UTC,2019-01-01,financialEntities/params;cik=1601545,https://www.sec.gov/Archives/edgar/data/160154...,"Item 1.BUSINESS3<font style=""font-family:inher...",1601545,"BLUE HILLS BANCORP, INC.",US,HYDE PARK,6022,State Commercial Banks (commercial banking)


## Data Cleaning
#### - Normalization
#### - Remove Stopwords
#### - Lemmatization

In [43]:
#strip any left over html code
def clean_data_fn(insrt_data):
    
    clean_data = []
    
    for idx, ele in insrt_data.iterrows():
        if "https://www.sec.gov/Archives/edgar/data/" in ele["coDescription"]:
            pass
        else:
            clean_txt = re.compile('<.*?>')
            desc = re.sub(clean_txt,'',ele["coDescription"]).replace(u'\xa0', u' ').replace("   ", "").replace("'", "").replace('"','')
            if re.search('<', desc):
                pos = re.search('<', desc).start()
                desc = desc[:pos].lower()
            if (desc.find("business") >= 20): # didnt find it in the first 20 characters then look for next
                desc = desc[6 : ( desc.rfind("<") )] # remove the "Item 1." stuff only
            else: # found "business", remove everything before it
                desc =  desc[( desc.find("business") + 8 ) : ( desc.rfind("<") ) ]
            if (desc.find("overview") <= 20): # didnt find it in the first 20 characters then look for next
                desc =  desc[( desc.find("overview") + 8 ) :]
            # remove leading white space and periods
            desc = re.sub(r"^\.", "", desc).strip()            
            new_data = ele.copy()
            new_data["coDescription"] = desc
            # remove any filings with a description less than 250 characters (not enough information for us)
            if len(desc)<250:
                pass
            else:
                clean_data.append(new_data)
                
    return(pd.DataFrame(clean_data))

df = clean_data_fn(df)#.rename(columns = {"financialEntity":"CIK"})
df["CIK"] = df["CIK"].astype(int)
df.head()

Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc
0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,"mongodb is the leading modern, general purpose...",1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...
1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,salesforce is a global leader in customer rela...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
3,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,splunk provides innovative software solutions ...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
4,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,okta is the leading independent provider of id...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
5,0001564590-18-007164,2018-03-29 21:34:05 UTC,2018-02-01,financialEntities/params;cik=1393052,https://www.sec.gov/Archives/edgar/data/139305...,veeva is a leading provider of industry cloud ...,1393052,VEEVA SYSTEMS INC,,PLEASANTON,7372,Prepackaged Software (mass reproduction of sof...


In [44]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
    
def lemmatize_sentence(sentence):
    lemmatized_output = [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)]
    return " ".join(lemmatized_output)

lemma_desc = df["coDescription"].apply(lemmatize_sentence)
df["coDescription_lemmatized"] = lemma_desc
df["coDescription_lemmatized"].head()

0    mongodb is the leading modern , general purpos...
1    salesforce is a global leader in customer rela...
3    splunk provides innovative software solution t...
4    okta is the leading independent provider of id...
5    veeva is a leading provider of industry cloud ...
Name: coDescription_lemmatized, dtype: object

In [45]:
from nltk.corpus import stopwords

# remove all numbers so they don't show up as dimensions
def remove_nums(x):
    text = x.lower()
    text = re.sub(r'\d+', '', text)
    return text

def remove_stopwords(x):
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(x)

    filtered_sentence = ' '.join([w for w in word_tokens if not w.lower() in stop_words and w.isalnum()])

    return(filtered_sentence)

rm_num_stopwords = df["coDescription_lemmatized"].apply(remove_nums).apply(remove_stopwords)
df["coDescription_stopwords"] = rm_num_stopwords
rm_num_stopwords.head()

0    mongodb leading modern general purpose databas...
1    salesforce global leader customer relationship...
3    splunk provides innovative software solution e...
4    okta leading independent provider identity ent...
5    veeva leading provider industry cloud solution...
Name: coDescription_lemmatized, dtype: object

In [46]:
r_corr = pd.read_csv('corr_matrix.csv', index_col=0)
r_corr = r_corr.groupby(r_corr.index).first()
r_corr = r_corr.filter(items = list(r_corr.index), axis=1)
r_corr

Unnamed: 0_level_0,1ST SOURCE CORP,"2U, INC.",3D SYSTEMS CORP,ABBVIE INC.,ABRAXAS PETROLEUM CORP,ACADIA PHARMACEUTICALS INC,ACADIA REALTY TRUST,ACELRX PHARMACEUTICALS INC,ACER THERAPEUTICS INC.,ACHAOGEN INC,...,WORKIVA INC,"WPX ENERGY, INC.",XENCOR INC,XENON PHARMACEUTICALS INC.,XOMA CORP,"ZEDGE, INC.",ZOETIS INC.,ZOMEDICA PHARMACEUTICALS CORP.,ZOSANO PHARMA CORP,"ZYNERBA PHARMACEUTICALS, INC."
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1ST SOURCE CORP,1.000000,0.445620,0.456612,0.426005,0.573052,-0.008397,0.447781,0.211480,0.349187,0.062904,...,0.296387,0.674714,0.403629,0.450671,0.549631,0.447259,0.713262,0.654885,0.481596,0.504739
"2U, INC.",0.445620,1.000000,0.729139,0.366627,0.073196,-0.065312,0.241134,0.347920,0.553948,0.246458,...,0.644985,0.417311,0.688344,0.251203,-0.234525,0.141046,0.582820,0.444045,0.028632,0.525276
3D SYSTEMS CORP,0.456612,0.729139,1.000000,0.230479,0.098493,0.073149,0.460555,0.316575,0.845823,-0.067410,...,0.866401,0.393633,0.381451,0.503728,-0.197060,0.056334,0.515294,0.497355,0.245695,0.662672
ABBVIE INC.,0.426005,0.366627,0.230479,1.000000,0.020591,0.076372,-0.154133,-0.291458,0.323191,-0.498499,...,0.346075,0.120014,0.545121,-0.141870,0.580677,0.575760,0.397269,0.034364,0.205675,0.337844
ABRAXAS PETROLEUM CORP,0.573052,0.073196,0.098493,0.020591,1.000000,-0.070128,0.291224,0.489595,0.170491,0.368918,...,-0.104898,0.817680,-0.024715,0.601745,0.419803,0.234896,0.364759,0.376640,0.242583,0.655034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"ZEDGE, INC.",0.447259,0.141046,0.056334,0.575760,0.234896,-0.332752,0.182316,-0.163950,0.131286,-0.272621,...,0.043221,-0.060208,0.443667,-0.028123,0.498482,1.000000,0.437769,0.242696,0.336963,0.285559
ZOETIS INC.,0.713262,0.582820,0.515294,0.397269,0.364759,0.078082,0.424855,0.192885,0.511341,0.060666,...,0.409413,0.428090,0.497782,0.480238,0.105692,0.437769,1.000000,0.760082,0.457153,0.468017
ZOMEDICA PHARMACEUTICALS CORP.,0.654885,0.444045,0.497355,0.034364,0.376640,-0.100401,0.633695,0.258257,0.393568,0.103966,...,0.155137,0.341592,0.323606,0.708180,-0.028635,0.242696,0.760082,1.000000,0.386716,0.294650
ZOSANO PHARMA CORP,0.481596,0.028632,0.245695,0.205675,0.242583,0.312049,0.800948,0.275537,0.049354,-0.513189,...,0.390910,0.221335,0.454503,0.210043,0.426209,0.336963,0.457153,0.386716,1.000000,0.341455


In [47]:
name = pd.DataFrame();
name["name"] = r_corr.index.tolist()
df = df.merge(name, on="name")

In [49]:
df = df.set_index(df["name"])
df = df.sort_index(key=lambda x: x.str.lower())
df

Unnamed: 0_level_0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc,coDescription_lemmatized,coDescription_stopwords
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1ST SOURCE CORP,0000034782-19-000039,2019-02-22 20:21:37 UTC,2019-01-01,financialEntities/params;cik=34782,https://www.sec.gov/Archives/edgar/data/34782/...,"urce corporation1st source corporation, an ind...",34782,1ST SOURCE CORP,US,SOUTH BEND,6022,State Commercial Banks (commercial banking),"urce corporation1st source corporation , an in...",urce corporationst source corporation indiana ...
"2U, INC.",0001104659-19-010491,2019-02-26 12:32:03 UTC,2019-01-01,financialEntities/params;cik=1459417,https://www.sec.gov/Archives/edgar/data/145941...,our missionour mission is to improve lives by ...,1459417,"2U, INC.",US,LANHAM,7372,Prepackaged Software (mass reproduction of sof...,our missionour mission is to improve life by e...,missionour mission improve life eliminating ba...
3D SYSTEMS CORP,0000910638-19-000004,2019-02-28 21:03:35 UTC,2019-01-01,financialEntities/params;cik=910638,https://www.sec.gov/Archives/edgar/data/910638...,3d systems corporation (“3d systems” or the “c...,910638,3D SYSTEMS CORP,US,ROCK HILL,7372,Prepackaged Software (mass reproduction of sof...,3d system corporation ( “ 3d system ” or the “...,system corporation system company u holding co...
ABBVIE INC.,0001551152-19-000008,2019-02-27 21:45:37 UTC,2019-01-01,financialEntities/params;cik=1551152,https://www.sec.gov/Archives/edgar/data/155115...,"abbvie(1) is a global, research-based biopharm...",1551152,ABBVIE INC.,US,NORTH CHICAGO,2834,Pharmaceutical Preparations,"abbvie ( 1 ) is a global , research-based biop...",abbvie global biopharmaceutical company abbvie...
ABRAXAS PETROLEUM CORP,0001437749-19-005085,2019-03-15 21:25:12 UTC,2019-01-01,financialEntities/params;cik=867665,https://www.sec.gov/Archives/edgar/data/867665...,l we are an independent energy company primari...,867665,ABRAXAS PETROLEUM CORP,US,SAN ANTONIO,1311,Crude Petroleum and Natural Gas,l we are an independent energy company primari...,l independent energy company primarily engaged...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOMA CORP,0001564590-19-006621,2019-03-07 12:17:14 UTC,2019-01-01,financialEntities/params;cik=791908,https://www.sec.gov/Archives/edgar/data/791908...,"and strategyxoma corporation (“xoma”), a delaw...",791908,XOMA CORP,,EMERYVILLE,2834,Pharmaceutical Preparations,"and strategyxoma corporation ( “ xoma ” ) , a ...",strategyxoma corporation xoma delaware corpora...
"ZEDGE, INC.",0001213900-18-014592,2018-10-29 20:18:30 UTC,2018-08-01,financialEntities/params;cik=1667313,https://www.sec.gov/Archives/edgar/data/166731...,ive compensation theinformation required by th...,1667313,"ZEDGE, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...,ive compensation theinformation required by th...,ive compensation theinformation required item ...
ZOETIS INC.,0001555280-19-000041,2019-02-14 22:08:33 UTC,2019-01-01,financialEntities/params;cik=1555280,https://www.sec.gov/Archives/edgar/data/155528...,ts.costs and expenses costs of sales consist p...,1555280,ZOETIS INC.,,PARSIPPANY,2834,Pharmaceutical Preparations,ts.costs and expense cost of sale consist prim...,expense cost sale consist primarily cost mater...
ZOSANO PHARMA CORP,0001587221-19-000006,2019-03-25 12:59:56 UTC,2019-01-01,financialEntities/params;cik=1587221,https://www.sec.gov/Archives/edgar/data/158722...,zosano pharma corporation is a clinical stage ...,1587221,ZOSANO PHARMA CORP,US,FREMONT,2834,Pharmaceutical Preparations,zosano pharma corporation is a clinical stage ...,zosano pharma corporation clinical stage bioph...


Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc,coDescription_lemmatized,coDescription_stopwords
0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,"mongodb is the leading modern, general purpose...",1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...,"mongodb is the leading modern , general purpos...",mongodb leading modern general purpose databas...
1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,salesforce is a global leader in customer rela...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,salesforce is a global leader in customer rela...,salesforce global leader customer relationship...
2,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,splunk provides innovative software solutions ...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,splunk provides innovative software solution t...,splunk provides innovative software solution e...
3,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,okta is the leading independent provider of id...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,okta is the leading independent provider of id...,okta leading independent provider identity ent...
4,0001564590-18-007164,2018-03-29 21:34:05 UTC,2018-02-01,financialEntities/params;cik=1393052,https://www.sec.gov/Archives/edgar/data/139305...,veeva is a leading provider of industry cloud ...,1393052,VEEVA SYSTEMS INC,,PLEASANTON,7372,Prepackaged Software (mass reproduction of sof...,veeva is a leading provider of industry cloud ...,veeva leading provider industry cloud solution...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,0001144204-19-015290,2019-03-21 17:18:30 UTC,2019-01-01,financialEntities/params;cik=1487782,https://www.sec.gov/Archives/edgar/data/148778...,"in this report,the terms “we,” “our,” “us” and...",1487782,"REVEN HOUSING REIT, INC.",US,LA JOLLA,6798,Real Estate Investment Trusts,"in this report , the term “ we , ” “ our , ” “...",report term u company refer reven housing reit...
482,0001144204-19-016652,2019-03-28 20:28:30 UTC,2019-01-01,financialEntities/params;cik=1130166,https://www.sec.gov/Archives/edgar/data/113016...,llowing business section contains forward-look...,1130166,"CYCLACEL PHARMACEUTICALS, INC.",US,BERKELEY HEIGHTS,2834,Pharmaceutical Preparations,llowing business section contains forward-look...,llowing business section contains statement ac...
483,0001555280-19-000041,2019-02-14 22:08:33 UTC,2019-01-01,financialEntities/params;cik=1555280,https://www.sec.gov/Archives/edgar/data/155528...,ts.costs and expenses costs of sales consist p...,1555280,ZOETIS INC.,,PARSIPPANY,2834,Pharmaceutical Preparations,ts.costs and expense cost of sale consist prim...,expense cost sale consist primarily cost mater...
484,0001479094-19-000006,2019-02-13 21:22:54 UTC,2019-01-01,financialEntities/params;cik=1479094,https://www.sec.gov/Archives/edgar/data/147909...,certain definitionsin this report:we define ga...,1479094,"STAG INDUSTRIAL, INC.",,BOSTON,6798,Real Estate Investment Trusts,certain definitionsin this report : we define ...,certain definitionsin report define gaap gener...


### Training

In [50]:
import tensorflow_hub as hub
import seaborn as sns
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [51]:
embeddings = embed(df["coDescription_stopwords"])
embeddings

<tf.Tensor: shape=(486, 512), dtype=float32, numpy=
array([[ 0.04458071, -0.04458071,  0.04458071, ..., -0.04458071,
        -0.04458071,  0.04458071],
       [ 0.04485003, -0.04485003, -0.04485003, ..., -0.04485003,
        -0.04485003,  0.04485003],
       [ 0.04465589, -0.04465589,  0.04465589, ..., -0.04465589,
        -0.04465589,  0.04465589],
       ...,
       [ 0.0444294 , -0.0444294 ,  0.0444294 , ..., -0.0444294 ,
        -0.0444294 ,  0.0444294 ],
       [ 0.0449831 , -0.0449831 ,  0.04498303, ..., -0.0449831 ,
        -0.0449831 ,  0.0449831 ],
       [ 0.04508528, -0.04378994, -0.03905651, ..., -0.04508528,
        -0.04508528,  0.04390895]], dtype=float32)>

## Results

In [53]:
embedding_matrix = pd.DataFrame(embeddings)
embedding_matrix.index = df["name"]
embedding_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1ST SOURCE CORP,0.044581,-0.044581,0.044581,0.044581,-0.044581,-0.044581,0.044581,-0.044581,-0.044581,0.044581,...,-0.044581,-0.044581,-0.044581,0.044581,0.044581,0.044581,0.043473,-0.044581,-0.044581,0.044581
"2U, INC.",0.044850,-0.044850,-0.044850,0.043343,-0.044850,-0.044850,0.044850,-0.044850,-0.044850,0.044850,...,-0.044850,-0.044850,-0.044850,-0.006612,0.044850,0.044850,0.044850,-0.044850,-0.044850,0.044850
3D SYSTEMS CORP,0.044656,-0.044656,0.044656,0.044656,-0.044656,-0.044656,0.044656,-0.044656,-0.044656,0.044656,...,-0.044656,-0.044656,-0.044656,0.026355,0.044656,0.044656,0.044656,-0.044656,-0.044656,0.044656
ABBVIE INC.,0.044726,-0.044726,-0.044726,0.044726,-0.044726,-0.044726,0.044726,-0.044726,-0.044726,0.044726,...,-0.044726,-0.044726,-0.044726,-0.044726,0.044726,-0.040617,0.044724,-0.044726,-0.044726,0.044726
ABRAXAS PETROLEUM CORP,0.044369,-0.044369,-0.044359,-0.044369,-0.044369,-0.044369,0.044369,-0.044369,-0.044369,0.044369,...,-0.044369,-0.044369,-0.044369,0.044369,0.044369,-0.044369,0.044369,-0.044369,-0.044369,0.044369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOMA CORP,0.044750,-0.044750,-0.044736,0.044750,-0.044750,-0.044750,0.044750,-0.044750,-0.044750,0.044750,...,-0.044750,-0.044750,0.040934,-0.044750,0.044750,0.044750,0.044750,-0.044750,-0.044750,0.044750
"ZEDGE, INC.",0.011260,-0.072204,-0.055123,0.037938,0.050115,0.010251,-0.024823,0.037015,-0.012099,-0.074498,...,-0.053491,-0.064837,0.031643,-0.004472,0.068950,0.021088,0.051598,-0.023273,0.031190,-0.022904
ZOETIS INC.,0.044429,-0.044429,0.044429,0.044429,-0.044429,-0.044429,0.044429,-0.044429,-0.044429,0.044429,...,-0.044429,-0.044429,-0.044429,0.044429,0.044429,0.044429,0.044429,-0.044429,-0.044429,0.044429
ZOSANO PHARMA CORP,0.044983,-0.044983,0.044983,-0.041795,-0.044983,-0.044983,0.044983,-0.044983,-0.044983,0.044983,...,-0.044983,-0.044983,0.044983,-0.044983,0.044983,-0.044983,0.044983,-0.044983,-0.044983,0.044983


In [54]:
dot_product = np.matmul(embedding_matrix, embedding_matrix.T)

  dot_product = np.matmul(embedding_matrix, embedding_matrix.T)


##  Performance Evaluation 
USE gives normalized embeddings, so the inner product of encodings can be treated as a similarity matrix.

In [16]:
r_corr = pd.read_csv('corr_matrix.csv', index_col=0)
r_corr = r_corr.groupby(r_corr.index).first()
r_corr = r_corr.filter(items = list(r_corr.index), axis=1)
r_corr

Unnamed: 0_level_0,1ST SOURCE CORP,"2U, INC.",3D SYSTEMS CORP,ABBVIE INC.,ABRAXAS PETROLEUM CORP,ACADIA PHARMACEUTICALS INC,ACADIA REALTY TRUST,ACELRX PHARMACEUTICALS INC,ACER THERAPEUTICS INC.,ACHAOGEN INC,...,WORKIVA INC,"WPX ENERGY, INC.",XENCOR INC,XENON PHARMACEUTICALS INC.,XOMA CORP,"ZEDGE, INC.",ZOETIS INC.,ZOMEDICA PHARMACEUTICALS CORP.,ZOSANO PHARMA CORP,"ZYNERBA PHARMACEUTICALS, INC."
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1ST SOURCE CORP,1.000000,0.445620,0.456612,0.426005,0.573052,-0.008397,0.447781,0.211480,0.349187,0.062904,...,0.296387,0.674714,0.403629,0.450671,0.549631,0.447259,0.713262,0.654885,0.481596,0.504739
"2U, INC.",0.445620,1.000000,0.729139,0.366627,0.073196,-0.065312,0.241134,0.347920,0.553948,0.246458,...,0.644985,0.417311,0.688344,0.251203,-0.234525,0.141046,0.582820,0.444045,0.028632,0.525276
3D SYSTEMS CORP,0.456612,0.729139,1.000000,0.230479,0.098493,0.073149,0.460555,0.316575,0.845823,-0.067410,...,0.866401,0.393633,0.381451,0.503728,-0.197060,0.056334,0.515294,0.497355,0.245695,0.662672
ABBVIE INC.,0.426005,0.366627,0.230479,1.000000,0.020591,0.076372,-0.154133,-0.291458,0.323191,-0.498499,...,0.346075,0.120014,0.545121,-0.141870,0.580677,0.575760,0.397269,0.034364,0.205675,0.337844
ABRAXAS PETROLEUM CORP,0.573052,0.073196,0.098493,0.020591,1.000000,-0.070128,0.291224,0.489595,0.170491,0.368918,...,-0.104898,0.817680,-0.024715,0.601745,0.419803,0.234896,0.364759,0.376640,0.242583,0.655034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"ZEDGE, INC.",0.447259,0.141046,0.056334,0.575760,0.234896,-0.332752,0.182316,-0.163950,0.131286,-0.272621,...,0.043221,-0.060208,0.443667,-0.028123,0.498482,1.000000,0.437769,0.242696,0.336963,0.285559
ZOETIS INC.,0.713262,0.582820,0.515294,0.397269,0.364759,0.078082,0.424855,0.192885,0.511341,0.060666,...,0.409413,0.428090,0.497782,0.480238,0.105692,0.437769,1.000000,0.760082,0.457153,0.468017
ZOMEDICA PHARMACEUTICALS CORP.,0.654885,0.444045,0.497355,0.034364,0.376640,-0.100401,0.633695,0.258257,0.393568,0.103966,...,0.155137,0.341592,0.323606,0.708180,-0.028635,0.242696,0.760082,1.000000,0.386716,0.294650
ZOSANO PHARMA CORP,0.481596,0.028632,0.245695,0.205675,0.242583,0.312049,0.800948,0.275537,0.049354,-0.513189,...,0.390910,0.221335,0.454503,0.210043,0.426209,0.336963,0.457153,0.386716,1.000000,0.341455


In [84]:
r_corr = r_corr.filter(items = list(dot_product.index), axis=0)
r_corr = r_corr.filter(items = list(dot_product.index), axis=1)
r_corr

Unnamed: 0_level_0,1ST SOURCE CORP,"2U, INC.",3D SYSTEMS CORP,ABBVIE INC.,ABRAXAS PETROLEUM CORP,ACADIA PHARMACEUTICALS INC,ACADIA REALTY TRUST,ACELRX PHARMACEUTICALS INC,ACER THERAPEUTICS INC.,ACHAOGEN INC,...,WILDHORSE RESOURCE DEVELOPMENT CORP,WORKIVA INC,"WPX ENERGY, INC.",XENCOR INC,XENON PHARMACEUTICALS INC.,XOMA CORP,"ZEDGE, INC.",ZOETIS INC.,ZOSANO PHARMA CORP,"ZYNERBA PHARMACEUTICALS, INC."
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1ST SOURCE CORP,1.000000,0.445620,0.456612,0.426005,0.573052,-0.008397,0.447781,0.211480,0.349187,0.062904,...,0.367879,0.296387,0.674714,0.403629,0.450671,0.549631,0.447259,0.713262,0.481596,0.504739
"2U, INC.",0.445620,1.000000,0.729139,0.366627,0.073196,-0.065312,0.241134,0.347920,0.553948,0.246458,...,0.190081,0.644985,0.417311,0.688344,0.251203,-0.234525,0.141046,0.582820,0.028632,0.525276
3D SYSTEMS CORP,0.456612,0.729139,1.000000,0.230479,0.098493,0.073149,0.460555,0.316575,0.845823,-0.067410,...,0.133323,0.866401,0.393633,0.381451,0.503728,-0.197060,0.056334,0.515294,0.245695,0.662672
ABBVIE INC.,0.426005,0.366627,0.230479,1.000000,0.020591,0.076372,-0.154133,-0.291458,0.323191,-0.498499,...,-0.106556,0.346075,0.120014,0.545121,-0.141870,0.580677,0.575760,0.397269,0.205675,0.337844
ABRAXAS PETROLEUM CORP,0.573052,0.073196,0.098493,0.020591,1.000000,-0.070128,0.291224,0.489595,0.170491,0.368918,...,0.887783,-0.104898,0.817680,-0.024715,0.601745,0.419803,0.234896,0.364759,0.242583,0.655034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOMA CORP,0.549631,-0.234525,-0.197060,0.580677,0.419803,-0.042336,0.005358,-0.258159,-0.128038,-0.385952,...,0.161650,-0.117924,0.311856,0.146026,0.011761,1.000000,0.498482,0.105692,0.426209,0.171061
"ZEDGE, INC.",0.447259,0.141046,0.056334,0.575760,0.234896,-0.332752,0.182316,-0.163950,0.131286,-0.272621,...,0.152788,0.043221,-0.060208,0.443667,-0.028123,0.498482,1.000000,0.437769,0.336963,0.285559
ZOETIS INC.,0.713262,0.582820,0.515294,0.397269,0.364759,0.078082,0.424855,0.192885,0.511341,0.060666,...,0.234112,0.409413,0.428090,0.497782,0.480238,0.105692,0.437769,1.000000,0.457153,0.468017
ZOSANO PHARMA CORP,0.481596,0.028632,0.245695,0.205675,0.242583,0.312049,0.800948,0.275537,0.049354,-0.513189,...,0.086327,0.390910,0.221335,0.454503,0.210043,0.426209,0.336963,0.457153,1.000000,0.341455


In [109]:
L_r_corr = []
for i in range(len(r_corr)):
    for j in range(len(r_corr)):
        L_r_corr.append(r_corr.iloc[i,j])
L_r_corr = [0 if pd.isna(x) else x for x in L_r_corr]

In [113]:
np.corrcoef(list(np.concatenate(dot_product).flat), L_r_corr)

array([[1.        , 0.06488972],
       [0.06488972, 1.        ]])

0.064 indicates a weak positive correlation between similarity and returns

#### Bartlett’s test of Sphericity
The Bartlett’s test of Sphericity is used to test the null hypothesis that the correlation matrix is an identity matrix. An identity correlation matrix means your variables are unrelated and not ideal for factor analysis/dimensionality reduction.