# Universal Sentence Encoder
The Universal Sentence Encoder encodes text into high-dimensional vectors that can be used for text classification, semantic similarity, clustering, and other natural language tasks. It is trained on a variety of data sources so that it generalizes across a wide range of tasks. The sources are Wikipedia, web news, web question-answer pages, and discussion forums. The input is variable-length English text and the output is a 512-dimensional vector.

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import re
import string
import nltk

In [2]:
# Load the raw filings dump; lines=True makes pandas parse one JSON object per line.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Expected object or value" -- presumably the file was not found in
# the working directory or is not line-delimited JSON; confirm the path and
# format before re-running.
data = pd.read_json('bq_2018_top5SIC.json', lines=True)

ValueError: Expected object or value

In [None]:
# Keep only the five columns used downstream: filing date, company name,
# business description, and the SIC code with its textual description.
df = data[['reportingDate', 'name', 'coDescription', 'SIC', 'SIC_desc']]

## Data Cleaning
#### - Normalization
#### - Remove Stopwords
#### - Lemmatization

In [None]:
#strip any left over html code
def clean_data_fn(insrt_data):
    """Clean the ``coDescription`` text of every row and drop unusable rows.

    Each description is stripped of HTML tags, lower-cased, cut at any
    leftover ``<``, trimmed of everything up to and including the first
    occurrence of "business" (or of its first six characters when that word
    is absent), stripped of punctuation, and finally reduced to ASCII
    letters, hyphens and spaces (quotes are removed earlier).

    A row is dropped when its description is not a string, contains an EDGAR
    archive URL, or is shorter than 250 characters after cleaning.

    Args:
        insrt_data: DataFrame with a ``coDescription`` column. All other
            columns are passed through unchanged.

    Returns:
        A new DataFrame with the same columns as ``insrt_data``, a fresh
        0..n-1 index, and ``coDescription`` replaced by the cleaned text.
        The columns are kept even when every row is dropped.
    """
    # Compile once instead of once per row.
    tag_pattern = re.compile('<.*?>')
    punctuation_pattern = re.compile(r'[\.\?\!\,\:\;\"]')
    allowed_chars = set(string.ascii_letters + '\'- ')
    edgar_url = "https://www.sec.gov/Archives/edgar/data/"
    heading = "business"
    min_length = 250

    cleaned_rows = []
    for _, row in insrt_data.iterrows():
        raw = row["coDescription"]
        # Missing descriptions (NaN/None) cannot be cleaned; skip them
        # instead of failing on the substring test below.
        if not isinstance(raw, str) or edgar_url in raw:
            continue

        desc = (
            tag_pattern.sub('', raw)
            .replace(u'\xa0', u' ')
            .replace("   ", "")
            .replace("'", "")
            .replace('"', '')
            .lower()
        )

        # A '<' that survived tag stripping marks markup the regex could not
        # match (e.g. a tag broken across lines): drop it and what follows.
        # When there is no such '<' the text is kept whole -- truncating to
        # position 0 here used to blank every clean description.
        leftover = desc.find('<')
        if leftover != -1:
            desc = desc[:leftover]

        heading_at = desc.find(heading)
        if heading_at == -1:
            # No "business" heading: drop the first six characters
            # (presumably a leading "item 1" label -- TODO confirm).
            desc = desc[6:]
        else:
            # Keep only what follows the heading; a trailing "." after it is
            # removed by the punctuation pass below, so "business." needs no
            # separate case.
            desc = desc[heading_at + len(heading):]

        # Remove punctuation and surrounding white space.
        desc = punctuation_pattern.sub('', desc).strip()

        # Remove everything that is not a letter, apostrophe, hyphen or space.
        desc = ''.join(ch for ch in desc if ch in allowed_chars)

        if len(desc) < min_length:
            continue

        record = row.to_dict()
        record["coDescription"] = desc
        cleaned_rows.append(record)

    return pd.DataFrame(cleaned_rows, columns=insrt_data.columns)

# Replace the raw frame with its cleaned version, then display it.
df = clean_data_fn(df)
df

In [None]:
## Remove stop words
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize

# Stop-word sets keyed by language, filled on first use so the corpus files
# are read once per language instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language=None):
    """Return the cached NLTK stop-word set for ``language`` (every language when None)."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def removeStopWords(description, language=None):
    """Return ``description`` with all stop-word tokens removed.

    The text is tokenized with ``word_tokenize`` and the surviving tokens are
    joined back together with single spaces.

    Args:
        description: Text to filter.
        language: Name of the NLTK stop-word list to use (e.g. ``"english"``).
            The default, ``None``, uses the union of every language's list,
            which is what a bare ``stopwords.words()`` call returns.

    Returns:
        The filtered text as a single space-separated string.
    """
    stop_words = _stopword_set(language)
    kept_tokens = [
        token for token in word_tokenize(description) if token not in stop_words
    ]
    return " ".join(kept_tokens)

# Overwrite each description with its stop-word-free version, then display the column.
df["coDescription"] = df["coDescription"].apply(removeStopWords)
df["coDescription"]

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

# Single shared lemmatizer instance, used by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag yields
    ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    return None
    
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` and join the results with spaces.

    The sentence is tokenized and part-of-speech tagged with NLTK. Tokens
    whose tag maps to a WordNet part of speech (see ``nltk2wn_tag``) are
    lemmatized with that part of speech; all other tokens are kept as they
    are.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wn_pos = nltk2wn_tag(pos_tag)
        lemmas.append(
            token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)
        )
    return " ".join(lemmas)

# Lemmatize every description and store the result in its own column,
# leaving "coDescription" itself untouched; then display the new column.
lemma_desc = df["coDescription"].apply(lemmatize_sentence)
df["coDescription_lemmatized"] = lemma_desc
df["coDescription_lemmatized"]

### Training

In [3]:
# Load the preprocessed filings. keep=False removes every row whose company
# name occurs more than once, rather than keeping one representative.
df = pd.read_csv('../data/preprocessed.csv').drop_duplicates(
    subset="name",
    keep=False,
)
df

Unnamed: 0.1,Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc,coDescription_lemmatized,coDescription_stopwords
0,0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,"mongodb is the leading modern, general purpose...",1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...,"mongodb is the leading modern , general purpos...",mongodb leading modern general purpose databas...
1,1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,salesforce is a global leader in customer rela...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,salesforce is a global leader in customer rela...,salesforce global leader customer relationship...
2,3,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,splunk provides innovative software solutions ...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,splunk provides innovative software solution t...,splunk provides innovative software solution e...
3,4,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,okta is the leading independent provider of id...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...,okta is the leading independent provider of id...,okta leading independent provider identity ent...
4,5,0001564590-18-007164,2018-03-29 21:34:05 UTC,2018-02-01,financialEntities/params;cik=1393052,https://www.sec.gov/Archives/edgar/data/139305...,veeva is a leading provider of industry cloud ...,1393052,VEEVA SYSTEMS INC,,PLEASANTON,7372,Prepackaged Software (mass reproduction of sof...,veeva is a leading provider of industry cloud ...,veeva leading provider industry cloud solution...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,1114,0001595527-19-000005,2019-03-15 12:45:38 UTC,2019-01-01,financialEntities/params;cik=1595527,https://www.sec.gov/Archives/edgar/data/159552...,"ationwe were incorporated on december 19, 2013...",1595527,"AMERICAN REALTY CAPITAL NEW YORK CITY REIT, INC.",US,NEW YORK,6798,Real Estate Investment Trusts,"ationwe were incorporated on december 19 , 201...",ationwe incorporated december maryland corpora...
671,1115,0001144204-19-016652,2019-03-28 20:28:30 UTC,2019-01-01,financialEntities/params;cik=1130166,https://www.sec.gov/Archives/edgar/data/113016...,llowing business section contains forward-look...,1130166,"CYCLACEL PHARMACEUTICALS, INC.",US,BERKELEY HEIGHTS,2834,Pharmaceutical Preparations,llowing business section contains forward-look...,llowing business section contains statement ac...
672,1117,0001555280-19-000041,2019-02-14 22:08:33 UTC,2019-01-01,financialEntities/params;cik=1555280,https://www.sec.gov/Archives/edgar/data/155528...,ts.costs and expenses costs of sales consist p...,1555280,ZOETIS INC.,,PARSIPPANY,2834,Pharmaceutical Preparations,ts.costs and expense cost of sale consist prim...,expense cost sale consist primarily cost mater...
673,1121,0001479094-19-000006,2019-02-13 21:22:54 UTC,2019-01-01,financialEntities/params;cik=1479094,https://www.sec.gov/Archives/edgar/data/147909...,certain definitionsin this report:we define ga...,1479094,"STAG INDUSTRIAL, INC.",,BOSTON,6798,Real Estate Investment Trusts,certain definitionsin this report : we define ...,certain definitionsin report define gaap gener...


In [4]:
import tensorflow_hub as hub
!pip3 install seaborn
import seaborn as sns
# Load version 4 of the Universal Sentence Encoder from TF Hub.
# NOTE(review): the recorded run failed with "OSError: SavedModel file does not
# exist" pointing into the temp "tfhub_modules" folder -- presumably an
# incomplete cached download; deleting that cache folder and re-running should
# fetch the module again (confirm).
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")



You should consider upgrading via the 'C:\Users\maryx\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


OSError: SavedModel file does not exist at: C:\Users\maryx\AppData\Local\Temp\tfhub_modules\063d866c06683311b44b4992fd46003be952409c\{saved_model.pbtxt|saved_model.pb}

In [None]:
# Encode every stop-word-filtered description with the loaded encoder,
# then display the result.
embeddings = embed(df["coDescription_stopwords"])
embeddings

## Plotting

In [None]:
def plot_similarity(labels, features, rotation):
    """Draw a heatmap of the pairwise inner products of ``features``.

    Args:
        labels: Tick labels used on both axes of the heatmap.
        features: Vectors whose pairwise inner products are plotted.
        rotation: Rotation applied to the x-axis tick labels.
    """
    similarity = np.inner(features, features)
    sns.set(font_scale=1.2)
    heatmap_options = dict(
        xticklabels=labels,
        yticklabels=labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd",
    )
    axes = sns.heatmap(similarity, **heatmap_options)
    axes.set_xticklabels(labels, rotation=rotation)
    axes.set_title("Semantic Textual Similarity")

# Heatmap for the first 20 companies only, with x tick labels rotated by 90.
plot_similarity(df["name"][:20],embeddings[:20], 90)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px

# Referenced from my CSCD25 course
def visualize_pca(vectors, index, n_components=50):
    """Project ``vectors`` with PCA and scatter-plot the first two components.

    Args:
        vectors: 2-D array-like of embeddings, one row per entry of ``index``.
        index: Two-column DataFrame; its columns become the "name" and
            "industry" levels of the result's index and drive the hover text
            and point colours.
        n_components: Number of principal components to request. PCA cannot
            produce more components than ``min(n_samples, n_features)``, so
            the value is clamped to that bound instead of failing on small
            inputs. The plot itself uses components 0 and 1.

    Returns:
        ``[pca, pca_embedding]``: the fitted PCA object and a DataFrame of the
        projected vectors indexed by (name, industry).
    """
    matrix = np.asarray(vectors)
    n_components = min((n_components, *matrix.shape))

    multi_index = pd.MultiIndex.from_frame(index, names=["name", "industry"])

    pca = PCA(n_components=n_components)
    pca_embedding = pd.DataFrame(pca.fit_transform(matrix), index=multi_index)

    names = pca_embedding.index.get_level_values(0)
    industries = pca_embedding.index.get_level_values(1)
    fig = px.scatter(
        pca_embedding,
        x=0,
        y=1,
        hover_data={"name": names, "industry": industries},
        color=industries,
        width=1200,
        height=600,
    )
    fig.show()

    return [pca, pca_embedding]

# Fit and plot the PCA; plot_pca[0] is the fitted PCA, plot_pca[1] the projected frame.
plot_pca = visualize_pca(embeddings, df.loc[:,["name","SIC_desc"]])

In [None]:
# 3-D scatter of the first three principal components, coloured by industry.
pca_frame = plot_pca[1]
company_names = pca_frame.index.get_level_values(0)
industries = pca_frame.index.get_level_values(1)
fig = px.scatter_3d(
    pca_frame,
    x=0,
    y=1,
    z=2,
    hover_data={"name": company_names, "industry": industries},
    color=industries,
    width=1200,
    height=700,
)
fig.show()

In [None]:
# Pairwise inner products of all embeddings; len(corr) is the number of rows.
# (An empty `for` header that preceded this statement was removed: a loop
# without a body is a syntax error and kept the cell from running.)
corr = np.inner(embeddings, embeddings)
len(corr)

In [None]:
# One row per company: wrap the embeddings in a DataFrame labelled by company
# name, then display it.
embedding_matrix = pd.DataFrame(embeddings)
embedding_matrix.index = df["name"]
embedding_matrix

## Similarity Matrix
USE gives normalized embeddings, so the inner product of encodings can be treated as a similarity matrix.

In [None]:
# Pairwise inner products between all company embeddings, computed on a plain
# NumPy array so the diagonal can be zeroed in place on memory we own.
# Writing through ``DataFrame.values`` relies on it returning a writable view
# of the frame, which pandas does not guarantee (it is read-only under
# copy-on-write).
similarity = embedding_matrix.to_numpy()
similarity = similarity @ similarity.T
# Zero the self-similarity so a row's maximum is never the company itself.
np.fill_diagonal(similarity, 0)
dot_product = pd.DataFrame(
    similarity,
    index=embedding_matrix.index,
    columns=embedding_matrix.index,
)

In [None]:
# Label both axes of the similarity matrix with each company's industry.
industry_labels = df["SIC_desc"]
dot_product.index = industry_labels
dot_product.columns = industry_labels
# For every row, take the column label (industry) holding the largest value,
# then pair it with the row's own industry: desc1 = own, desc2 = best match.
best_match = dot_product.idxmax(axis=1)
dot_product_df = pd.DataFrame(best_match).reset_index(level=0)
dot_product_df.columns = ["desc1", "desc2"]
dot_product_df

In [None]:
# Share of rows whose best-match industry equals their own industry.
# NOTE: the printed value is a fraction between 0 and 1, not a percentage.
print("Percentage of correct category predictions: ")
own_industry = dot_product_df.iloc[:, 0]
matched_industry = dot_product_df.iloc[:, 1]
hit_count = np.sum(np.where(matched_industry == own_industry, 1, 0))
print(hit_count / len(embeddings))