# Universal Sentence Encoder
The Universal Sentence Encoder encodes text into high-dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language tasks. It is trained on a variety of data sources to learn for a wide variety of tasks. The sources are Wikipedia, web news, web question-answer pages, and discussion forums. The input is a variable-length English text and the output is a 512-dimensional vector.

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import json
import re
import string
import nltk

In [2]:
data = pd.read_json('bq_2018_top5SIC.json', lines=True)

In [3]:
df = data[['reportingDate', 'name', 'coDescription', 'SIC', 'SIC_desc']]

## Data Cleaning
#### - Normalization
#### - Remove Stopwords
#### - Lemmatization

In [4]:
#strip any left over html code
def clean_data_fn(insrt_data):
    
    clean_data = []
    
    for idx, ele in insrt_data.iterrows():
        if "https://www.sec.gov/Archives/edgar/data/" in ele[2]:
            pass
        else:
            clean_txt = re.compile('<.*?>')
            
            pos = 0
            
            desc = re.sub(clean_txt,'',ele["coDescription"]).replace(u'\xa0', u' ').replace("   ", "").replace("'", "").replace('"','')
            
            desc = desc.lower()
            
            if re.search('<', desc):
                pos = re.search('<', desc).start()
            
            desc = desc[:pos]
            
            if (desc.find("business") == -1): # didnt find it then look for next
                if(desc.find("business.") == -1): # didnt find it then just remove anything after "<" if at all
                    desc = desc[6 : ( desc.rfind("<") )]

                else: # found "Business.", remove everything before it
                    desc =  desc[( desc.find("business.") + 9 ) : ( desc.rfind("<") ) ]
            else:
                desc = desc[( desc.find("business") + 8 ) : ( desc.rfind("<") ) ]
            
            # remove leading white space and punctuation
            desc = re.sub(r'[\.\?\!\,\:\;\"]', '', desc).strip()
            
            # remove the non-letters
            desc = ''.join([x for x in desc if x in string.ascii_letters + '\'- '])
                
            new_data = pd.Series([ele[0], ele[1], desc, ele[3], ele[4]], index = insrt_data.columns)
            
            if len(desc)<250:
                pass
            else:
                clean_data.append(new_data)
    return(pd.DataFrame(clean_data))

df = clean_data_fn(df)
df

Unnamed: 0,reportingDate,name,coDescription,SIC,SIC_desc
0,2018-02-01,"MONGODB, INC.",overviewmongodb is the leading modern general ...,7372,Prepackaged Software (mass reproduction of sof...
1,2018-02-01,SALESFORCE COM INC,overviewsalesforce is a global leader in custo...,7372,Prepackaged Software (mass reproduction of sof...
2,2018-02-01,SPLUNK INC,overviewsplunk provides innovative software so...,7372,Prepackaged Software (mass reproduction of sof...
3,2018-02-01,"OKTA, INC.",overview okta is the leading independent provi...,7372,Prepackaged Software (mass reproduction of sof...
4,2018-02-01,VEEVA SYSTEMS INC,overview veeva is a leading provider of indust...,7372,Prepackaged Software (mass reproduction of sof...
...,...,...,...,...,...
668,2019-01-01,"AMERICAN REALTY CAPITAL NEW YORK CITY REIT, INC.",organizationwe were incorporated on december ...,6798,Real Estate Investment Trusts
669,2019-01-01,"CYCLACEL PHARMACEUTICALS, INC.",the following business section contains forwar...,2834,Pharmaceutical Preparations
670,2019-01-01,ZOETIS INC.,productscosts and expenses costs of sales cons...,2834,Pharmaceutical Preparations
671,2019-01-01,"STAG INDUSTRIAL, INC.",certain definitionsin this reportwe define gaa...,6798,Real Estate Investment Trusts


In [None]:
## Remove stop words
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize

def removeStopWords(description):
    text_tokens = word_tokenize(description)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    print("in")
    return (" ").join(tokens_without_sw)

df["coDescription"] = df["coDescription"].apply(removeStopWords)
df["coDescription"]

in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in
in


In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:                    
        return None
    
def lemmatize_sentence(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))    
    wn_tagged = map(lambda x: (x[0], nltk2wn_tag(x[1])), nltk_tagged)
    res_words = []
    for word, tag in wn_tagged:
        if tag is None:                        
            res_words.append(word)
        else:
            res_words.append(lemmatizer.lemmatize(word, tag))
    return " ".join(res_words)

lemma_desc = df["coDescription"].apply(lemmatize_sentence)
df["coDescription_lemmatized"] = lemma_desc
df["coDescription_lemmatized"]

### Training

In [None]:
import tensorflow_hub as hub
!pip3 install seaborn
import seaborn as sns
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
embeddings = embed(df["coDescription_lemmatized"])
embeddings

## Plotting

In [None]:
def plot_similarity(labels, features, rotation):
    corr = np.inner(features, features)
    sns.set(font_scale=1.2)
    g = sns.heatmap(
      corr,
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,
      vmax=1,
      cmap="YlOrRd")
    g.set_xticklabels(labels, rotation=rotation)
    g.set_title("Semantic Textual Similarity")

plot_similarity(df["name"][:20],embeddings[:20], 90)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px

# Referenced from my CSCD25 course
def visualize_pca(vectors, index):
    multi_index = pd.MultiIndex.from_frame(index, names=["name", "industry"])
    
    pca = PCA(n_components = 50)
    pca_embedding = pca.fit_transform(vectors)
    pca_embedding = pd.DataFrame(pca_embedding, index = multi_index)
    
    fig = px.scatter(pca_embedding, x =0 , y = 1, hover_data={"name": pca_embedding.index.get_level_values(0),
                                                              "industry": pca_embedding.index.get_level_values(1)},
                     color = pca_embedding.index.get_level_values(1), width=1200, height=600)
    fig.show()
    
    return [pca, pca_embedding]

plot_pca = visualize_pca(embeddings, df.loc[:,["name","SIC_desc"]])

In [None]:
fig = px.scatter_3d(plot_pca[1], x =0 , y = 1, z = 2, hover_data={"name": plot_pca[1].index.get_level_values(0),
                                                              "industry": plot_pca[1].index.get_level_values(1)},
                    color = plot_pca[1].index.get_level_values(1), width=1200, height=700)
fig.show()

In [None]:
for i in range(len(embeddings)):
    
corr = np.inner(embeddings, embeddings)
len(corr)

In [None]:
embedding_matrix = pd.DataFrame(embeddings)
embedding_matrix.index = df["name"]
embedding_matrix

## Similarity Matrix
USE gives normalized embeddings, so the inner product of encodings can be treated as a similarity matrix.

In [None]:
dot_product = np.matmul(embedding_matrix, embedding_matrix.T)
np.fill_diagonal(dot_product.values, 0)

In [None]:
dot_product.index = df["SIC_desc"]
dot_product.columns = df["SIC_desc"]
dot_product_df = pd.DataFrame(dot_product.idxmax(axis=1))
dot_product_df.reset_index(level=0, inplace=True)
dot_product_df.columns = ["desc1","desc2"]
dot_product_df

In [None]:
print("Percentage of correct category predictions: ")
print(np.sum(np.where(dot_product_df.iloc[:,1] == dot_product_df.iloc[:,0], 1, 0))/len(embeddings))