In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Helpers import load, textpreprocess
import pandas as pd
import re

# Load Data:

In [3]:
# SEC EDGAR filing documents available for analysis, one URL per entry.
# The ticker of each filer is the prefix of the document file name.
edgar_filing_urls = [
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1093691/000155837021010611/plug-20210630x10q.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1655210/000165521021000181/bynd-20210703.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1800667/000095017021001008/frog-20210630.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1517413/000151741321000154/fsly-20210630.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1691421/000169142121000081/lmnd-20210630.htm",
    # NOTE: this is the only entry that links to the archive directly (no "ix?doc=" viewer prefix).
    "https://www.sec.gov/Archives/edgar/data/1617640/000161764021000055/z-20210630.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1679688/000167968821000085/dbrg-20210630.htm",
    "https://www.sec.gov/ix?doc=/Archives/edgar/data/1734722/000173472221000009/path-20210731.htm",
]

# The filing analysed by the rest of the notebook: the last entry of the list.
edgar_filing_url = edgar_filing_urls[len(edgar_filing_urls) - 1]

In [4]:
# Fetch the selected filing and reduce it to plain text.
entry_soup = load.load_soup_from_file_or_edgar(edgar_filing_url)
full_text = textpreprocess.get_text_from_soup(entry_soup)

# Split the text into sentences and keep only those with more than 7
# space-separated tokens; shorter fragments are discarded.
sentences = [
    candidate
    for candidate in textpreprocess.split_into_sentences(full_text)
    if len(candidate.split(" ")) > 7
]

# Normalised copy of the sentences used for vectorisation (digits removed, the
# em dash treated as an extra stop word).
# NOTE(review): later cells index `sentences` with row positions derived from
# `cleaned_sentences`, so this presumably returns exactly one output per input,
# in the same order - confirm against the helper.
cleaned_sentences = textpreprocess.lemmatize_sentences_and_remove_stop_words(
    sentences,
    remove_digits=True,
    additional_stop_words=["—"],
)

# Fit Model

In [5]:
# Fit a TF-IDF model (default settings) on the cleaned sentences.
vectorizer = TfidfVectorizer()
vectorizer.fit(cleaned_sentences)

# Dense matrix with one TF-IDF row per cleaned sentence.
sentence_vectors = vectorizer.transform(cleaned_sentences).toarray()

# Use pandas to find similar sentences via their cosine similarity

In [6]:
# Long-format table of pairwise cosine similarities between sentence vectors:
# one row per ordered (TextIndex, ComparedToIndex) pair of sentence positions,
# including each sentence compared with itself.
df = (
    pd.DataFrame(cosine_similarity(sentence_vectors))
    .reset_index()
    .melt(id_vars="index", var_name="ComparedToIndex", value_name="Similarity")
    .rename(columns={"index": "TextIndex"})
)

In [12]:
# Candidate pairs: similarity strictly between 0.8 and 0.96, excluding every
# sentence's comparison with itself.
similar_sentences = df.query(".8 < Similarity < .96 and TextIndex != ComparedToIndex")

# The similarity table lists every pair twice, as (a, b) and as (b, a). Keeping
# only the orientation with TextIndex > ComparedToIndex makes the pairs unique
# without comparing floating-point similarities for equality, and keeps the row
# order deterministic (it follows `df`) instead of depending on set iteration.
unique_pairs = (
    similar_sentences[["TextIndex", "ComparedToIndex", "Similarity"]]
    # Plain integer positions so they can be used to index `sentences`.
    .astype({"TextIndex": int, "ComparedToIndex": int})
    .query("TextIndex > ComparedToIndex")
    .reset_index(drop=True)
)

# Same pairs as a set of (TextIndex, ComparedToIndex, Similarity) tuples; kept
# only so that code relying on this name continues to work.
similar_text_tuple = set(unique_pairs.itertuples(index=False, name=None))


def _text_column(indices, render):
    """Return ``render(i)`` for every sentence index in ``indices``.

    The result is aligned with ``indices`` and always has object dtype, so the
    ``.str`` accessor keeps working even when there are no rows at all.
    """
    return pd.Series([render(i) for i in indices], index=indices.index, dtype=object)


similar_text_df = unique_pairs.copy()
similar_text_df["TextOne"] = _text_column(
    similar_text_df["TextIndex"], lambda i: sentences[i]
)
similar_text_df["TextTwo"] = _text_column(
    similar_text_df["ComparedToIndex"], lambda i: sentences[i]
)
# TextOne together with its neighbouring sentences. The start is clamped at 0:
# an unclamped `i - 1` becomes -1 for the first sentence, which turns the slice
# into `sentences[-1:2]` and yields an empty context.
similar_text_df["TextOneContext"] = _text_column(
    similar_text_df["TextIndex"],
    lambda i: " ".join(sentences[max(i - 1, 0): i + 2]),
)

# Drop pairs whose first sentence mentions a table or is a "see note" reference.
# NOTE(review): only TextOne is checked, so TextTwo may still contain such text
# (e.g. "Table of Contents" page headers) - confirm whether that is intended.
text_one_lower = similar_text_df["TextOne"].str.lower()
is_boilerplate = (
    text_one_lower.str.contains("table", regex=False)
    | text_one_lower.str.contains("see note", regex=False)
)
similar_text_df = similar_text_df[~is_boilerplate]

# With more than 80 remaining pairs, keep a single pair per TextIndex (the first
# one in table order, i.e. the one with the smallest ComparedToIndex).
if len(similar_text_df) > 80:
    similar_text_df = similar_text_df.drop_duplicates(keep="first", subset=["TextIndex"])

# To ensure that similar sentences are not in the same section, require the two
# sentences to be more than 3 positions apart; show the least similar pairs first.
similar_text_df = similar_text_df.query("TextIndex - ComparedToIndex > 3").sort_values("Similarity")

In [13]:
# Show the filtered sentence pairs (sorted by ascending Similarity in the cell above).
similar_text_df

Unnamed: 0,TextIndex,ComparedToIndex,Similarity,TextOne,TextTwo
82,672,384,0.801297,"Our annualized renewal run-rate, or ARR, was $...",We generated revenue of $195.5 million and $13...
71,461,365,0.802081,Although these investments may adversely affec...,We generate revenue from the sale of licenses ...
87,465,45,0.804006,"In addition, we offer a managed, multi-tenant,...","Additionally, we offer maintenance and support..."
20,951,459,0.806133,We cannot be certain when or if our operations...,We have a history of introducing successful ne...
69,342,8,0.80674,These equity awards have an approximate aggreg...,Management’s Discussion and Analysis of Financ...
50,785,385,0.813276,We have experienced net losses in each fiscal ...,We generated revenue of $381.7 million and $25...
75,672,385,0.813531,"Our annualized renewal run-rate, or ARR, was $...",We generated revenue of $381.7 million and $25...
83,409,393,0.816916,"Moreover, ARR may differ from similarly titled...","For further information, see the section title..."
43,540,516,0.818031,Maintenance and support revenue is recognized ...,38 Table of Contents Cost of Revenue and Gross...
0,551,526,0.831068,These increases were partially offset by a dec...,Research and Development 39 Table of Contents ...


In [14]:
# Copy the result table to the system clipboard, leaving out the index column,
# so it can be pasted elsewhere.
similar_text_df.to_clipboard(index=False)