In [12]:
from google.cloud import bigquery, storage, aiplatform
client = bigquery.Client(location="US", project="ubineerpoc")

query = """
SELECT MAX(reportingDate), financialEntity, coDescription FROM `ubineerpoc.10KData.10KBusinessDesc`
GROUP BY financialEntity, coDescription
LIMIT 800
"""
query_job = client.query(query)  # API request - starts the query

df = query_job.to_dataframe().rename(columns = {'f0_':'report_date'})
df

Unnamed: 0,report_date,financialEntity,coDescription
0,2018-12-29 05:00:00+00:00,financialEntities/params;cik=33488,"Item 1.Business3<FONT STYLE=""font-family: Time..."
1,2008-11-30 05:00:00+00:00,financialEntities/params;cik=811779,ITEM10. EXECUTIVE COMPENSATIONCOMPENSATIONDISC...
2,2015-06-30 04:00:00+00:00,financialEntities/params;cik=1157408,ITEM 1. Business 5 <FONT SIZE=2
3,2010-10-31 04:00:00+00:00,financialEntities/params;cik=1430523,ITEM1. BUSINESSOverviewof our Business and its...
4,2015-03-30 04:00:00+00:00,financialEntities/params;cik=1563166,Item 1. Business.Omitted.<p
...,...,...,...
795,2008-12-31 05:00:00+00:00,financialEntities/params;cik=1102432,Item1A. Risk FactorsThereare several material...
796,2008-12-31 05:00:00+00:00,financialEntities/params;cik=1420108,Item 1.Business.Our Background. We wereincorpo...
797,2008-12-31 05:00:00+00:00,financialEntities/params;cik=1445742,Item 1.Business. Business Development History...
798,2008-12-31 05:00:00+00:00,financialEntities/params;cik=1279643,https://www.sec.gov/Archives/edgar/data/127964...


In [1]:
# from google.cloud import bigquery
import os
import json
import pandas as pd
import numpy as np
import re

In [2]:
# df = pd.read_csv("514_companies.csv")
# df
df = pd.read_json("bq_2018_top5SIC.json", lines = True)
df.head()

Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc
0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,Item 1. BusinessOverviewMongoDB is the leading...,1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...
1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,ITEM 1. BUSINESSOverviewSalesforce is a global...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
2,0001564590-18-006986,2018-03-28 21:27:30 UTC,2018-02-01,financialEntities/params;cik=1385867,https://www.sec.gov/Archives/edgar/data/138586...,"Item 1.Business1<p style=""margin-bottom:0pt;ma...",1385867,COUPA SOFTWARE INC,US,SAN MATEO,7372,Prepackaged Software (mass reproduction of sof...
3,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,Item 1. BusinessOverviewSplunk provides innov...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
4,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,Item 1. BusinessOverview Okta is the leading i...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...


In [44]:
df.columns.tolist()

['accessionNumber',
 'filingDate',
 'reportingDate',
 'financialEntity',
 'htmlFile',
 'coDescription',
 'CIK',
 'name',
 'countryinc',
 'cityma',
 'SIC',
 'SIC_desc']

## Standard Cleaning

In [3]:
#strip any left over html code
def clean_data_fn(insrt_data):
    
    clean_data = []
    
    for idx, ele in insrt_data.iterrows():
        if "https://www.sec.gov/Archives/edgar/data/" in ele["coDescription"]:
            pass
        else:
            clean_txt = re.compile('<.*?>')
            desc = re.sub(clean_txt,'',ele["coDescription"]).replace(u'\xa0', u' ').replace("   ", "").replace("'", "").replace('"','')
            if re.search('<', desc):
                pos = re.search('<', desc).start()
            desc = desc[:pos].lower()
            if (desc.find("business") >= 20): # didnt find it in the first 20 characters then look for next
                desc = desc[6 : ( desc.rfind("<") )] # remove the "Item 1." stuff only
            else: # found "business", remove everything before it
                desc =  desc[( desc.find("business") + 8 ) : ( desc.rfind("<") ) ]
            if (desc.find("overview") <= 20): # didnt find it in the first 20 characters then look for next
                desc =  desc[( desc.find("overview") + 8 ) :]
            # remove leading white space and periods
            desc = re.sub(r"^\.", "", desc).strip()            
            new_data = ele.copy()
            new_data["coDescription"] = desc
            # remove any filings with a description less than 250 characters (not enough information for us)
            if len(desc)<250:
                pass
            else:
                clean_data.append(new_data)
                
    return(pd.DataFrame(clean_data))

non_html_data = clean_data_fn(df)#.rename(columns = {"financialEntity":"CIK"})
non_html_data["CIK"] = non_html_data["CIK"].astype(int)
non_html_data.head()

Unnamed: 0,accessionNumber,filingDate,reportingDate,financialEntity,htmlFile,coDescription,CIK,name,countryinc,cityma,SIC,SIC_desc
0,0001441816-18-000028,2018-03-30 20:12:23 UTC,2018-02-01,financialEntities/params;cik=1441816,https://www.sec.gov/Archives/edgar/data/144181...,"mongodb is the leading modern, general purpose...",1441816,"MONGODB, INC.",US,NEW YORK,7372,Prepackaged Software (mass reproduction of sof...
1,0001108524-18-000011,2018-03-09 22:01:46 UTC,2018-02-01,financialEntities/params;cik=1108524,https://www.sec.gov/Archives/edgar/data/110852...,salesforce is a global leader in customer rela...,1108524,SALESFORCE COM INC,US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
3,0001353283-18-000004,2018-03-30 21:21:46 UTC,2018-02-01,financialEntities/params;cik=1353283,https://www.sec.gov/Archives/edgar/data/135328...,splunk provides innovative software solutions ...,1353283,SPLUNK INC,,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
4,0001660134-18-000007,2018-03-12 20:45:43 UTC,2018-02-01,financialEntities/params;cik=1660134,https://www.sec.gov/Archives/edgar/data/166013...,okta is the leading independent provider of id...,1660134,"OKTA, INC.",US,SAN FRANCISCO,7372,Prepackaged Software (mass reproduction of sof...
5,0001564590-18-007164,2018-03-29 21:34:05 UTC,2018-02-01,financialEntities/params;cik=1393052,https://www.sec.gov/Archives/edgar/data/139305...,veeva is a leading provider of industry cloud ...,1393052,VEEVA SYSTEMS INC,,PLEASANTON,7372,Prepackaged Software (mass reproduction of sof...


## Lemmatization (optional)

In [33]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
    
def lemmatize_sentence(sentence):
    lemmatized_output = [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)]
    return " ".join(lemmatized_output)

lemma_desc = non_html_data["coDescription"].apply(lemmatize_sentence)
non_html_data["coDescription_lemmatized"] = lemma_desc
non_html_data["coDescription_lemmatized"].head()

0       mongodb is the leading modern , general purpos...
1       salesforce is a global leader in customer rela...
3       splunk provides innovative software solution t...
4       okta is the leading independent provider of id...
5       veeva is a leading provider of industry cloud ...
                              ...                        
1114    ationwe were incorporated on december 19 , 201...
1115    llowing business section contains forward-look...
1117    ts.costs and expense cost of sale consist prim...
1121    certain definitionsin this report : we define ...
1123    the word equinix , we , our , ours , u and the...
Name: coDescription_lemmatized, Length: 675, dtype: object

In [None]:

# def nltk2wn_tag(nltk_tag):
#     if nltk_tag.startswith('J'):
#         return wordnet.ADJ
#     elif nltk_tag.startswith('V'):
#         return wordnet.VERB
#     elif nltk_tag.startswith('N'):
#         return wordnet.NOUN
#     elif nltk_tag.startswith('R'):
#         return wordnet.ADV
#     else:                    
#         return None
    
# def weird_lemmatize_sentence(sentence):
#     nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))    
#     wn_tagged = map(lambda x: (x[0], nltk2wn_tag(x[1])), nltk_tagged)
#     res_words = []
#     for word, tag in wn_tagged:
#         if tag is None:                        
#             res_words.append(word)
#         elif tag.startswith('N'):
#             res_words.append(lemmatizer.lemmatize(word, tag))


# Removing stop words and numbers

In [35]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# remove all numbers so they don't show up as dimensions
def remove_nums(x):
    text = x.lower()
    text = re.sub(r'\d+', '', text)
    return text

def remove_stopwords(x):
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(x)

    filtered_sentence = ' '.join([w for w in word_tokens if not w.lower() in stop_words and w.isalnum()])

    return(filtered_sentence)

rm_num_stopwords = non_html_data["coDescription_lemmatized"].apply(remove_nums).apply(remove_stopwords)
non_html_data["coDescription_stopwords"] = rm_num_stopwords
rm_num_stopwords.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
non_html_data.to_csv("preprocessed.csv")

# Plotting

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px

def pca_visualize_2d(vectors, index):
    multi_index = pd.MultiIndex.from_frame(index, names=["name", "industry"])
    
    pca = PCA(n_components = min(10, vectors.shape[1]))
    pca_embedding = pca.fit_transform(vectors)
    pca_embedding = pd.DataFrame(pca_embedding, index = multi_index)
    
    fig = px.scatter(pca_embedding, x =0 , y = 1, hover_data={"name": pca_embedding.index.get_level_values(0),
                                                              "industry": pca_embedding.index.get_level_values(1)},
                     color = pca_embedding.index.get_level_values(1), width=1200, height=600)
    fig.show()
    
    return [pca, pca_embedding]

def pca_visualize_3d(plot):
    fig = px.scatter_3d(plot[1], x =0 , y = 1, z = 2, hover_data={"name": plot[1].index.get_level_values(0),
                                                              "industry": plot[1].index.get_level_values(1)},
                    color = plot[1].index.get_level_values(1), width=1200, height=600)
    fig.show()

In [None]:
# From the explained variance ratio, we see that the top three dimensions don't actually explain that much of the variation that exists within our data/companies.
plot_pca[0].explained_variance_ratio_

In [None]:
# feature_names = pd.DataFrame(pipe['count'].get_feature_names_out())
# feature_names

components = pd.DataFrame(plot_pca[0].components_, columns = feature_names)
components.reindex(components.abs().sort_values(0, axis = 1).index)
# components[0:2]

In [None]:
components.abs().sort_values(2, axis = 1, ascending = False)

## Accuracy Score

### Dot product

In [1]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def dot_product(embedding_matrix):
    """calculates percentage of correct category predictions based on 1-NN using dot product
    
    args: embedding matrix of size nxm (n companies each with an embedding of size m). NOTE: embeddings should be normalized.
    
    returns: float representation of percentage of correct category predictions
    """
    dot_product = np.matmul(embedding_matrix, embedding_matrix.T)
    np.fill_diagonal(dot_product.values, 0)
    dot_product.index = df["SIC_desc"]
    dot_product.columns = df["SIC_desc"]
    dot_product_df = pd.DataFrame(dot_product.idxmax(axis=1))
    dot_product_df.reset_index(level=0, inplace=True)
    dot_product_df.columns = ["y_true","y_pred"]
    return dot_product_df, np.sum(np.where(dot_product_df.iloc[:,1] == dot_product_df.iloc[:,0], 1, 0))/len(embedding_matrix), confusion_matrix(dot_product_df["y_true"], dot_product_df["y_pred"], labels=None, sample_weight=None, normalize='true')

dot_product_df, accuracy, cm = dot_produt(embedding_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=df["SIC_desc"].unique())
disp.plot(xticks_rotation='vertical')

Percentage of correct category predictions: 



### Euclidean distance

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def euclidean_distance(embedding_matrix):
    """calculates percentage of correct category predictions based on 1-NN using euclidean_distance
    
    args: embedding matrix of size nxm (n companies each with an embedding of size m)
    
    returns: float representation of percentage of correct category predictions
    """
    Edist_matrix = pd.DataFrame(columns = data["SIC_desc"], index = data["SIC_desc"])

    for i in range(len(data)):
        row = []
        for j in range(len(data)):
            row.append(np.linalg.norm(embedding_matrix.iloc[i] - embedding_matrix.iloc[j]))
        Edist_matrix.iloc[i] = row

    Edist_matrix = Edist_matrix.apply(pd.to_numeric)
    np.fill_diagonal(Edist_matrix.values, float('inf'))
    Edist_matrix = pd.DataFrame(Edist_matrix.idxmin(axis=1))
    Edist_matrix.reset_index(level=0, inplace=True)
    Edist_matrix.columns = ["y_true","y_pred"]
    return Edist_matrix, np.sum(np.where(Edist_matrix.iloc[:,1] == Edist_matrix.iloc[:,0], 1, 0))/len(embedding_matrix), confusion_matrix(Edist_matrix["y_true"], Edist_matrix["y_pred"], labels=None, sample_weight=None, normalize='true')

Edist_df, accuracy, cm = euclidean_distance(embedding_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=df["SIC_desc"].unique())
disp.plot(xticks_rotation='vertical')

### Cosine Distance

In [None]:
from scipy import spatial
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def cosine_distance(embedding_matrix):
    """calculates percentage of correct category predictions based on 1-NN using euclidean_distance
    
    args: embedding matrix of size nxm (n companies each with an embedding of size m)
    
    returns: float representation of percentage of correct category predictions
    """
    CosDist_matrix = pd.DataFrame(columns = data["SIC_desc"], index = data["SIC_desc"])
    for i in range(len(data)):
        row = []
        for j in range(len(data)):
            row.append(spatial.distance.cosine(embedding_matrix.iloc[i], embedding_matrix.iloc[j]))
        CosDist_matrix.iloc[i] = row
    CosDist_matrix = CosDist_matrix.apply(pd.to_numeric)
    np.fill_diagonal(CosDist_matrix.values, float('inf'))
    CosDist_matrix = pd.DataFrame(CosDist_matrix.idxmin(axis=1))
    CosDist_matrix.reset_index(level=0, inplace=True)
    CosDist_matrix.columns = ["y_true","y_pred"]
    return CosDist_matrix, np.sum(np.where(CosDist_matrix.iloc[:,1] == CosDist_matrix.iloc[:,0], 1, 0))/len(embedding_matrix), confusion_matrix(CosDist_matrix["y_true"], CosDist_matrix["y_pred"], labels=None, sample_weight=None, normalize='true')

CosDist_df, accuracy, cm = cosine_distance(embedding_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=df["SIC_desc"].unique())
disp.plot(xticks_rotation='vertical')

## Plot ROC Curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
def show_ROC_curves(df, similarity_matrix):
    for i in df["SIC_desc"].unique():
        y_true = similarity_matrix["y_true"] == i
        y_pred = similarity_matrix["y_pred"] == i
        fpr,tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)

        plt.plot(
            fpr,
            tpr,
            label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc),
        )
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Some extension of Receiver operating characteristic to multiclass")
    plt.legend(loc="lower right")
    plt.show()

show_ROC_curves(df, dot_product_df)