In [None]:
import os

In [None]:
# Root of the project checkout. Override it by setting the WHITEPAPER_BASEDIR
# environment variable; the historical hard-coded location is kept as the
# fallback so existing setups keep working unchanged.
BASEDIR = os.environ.get('WHITEPAPER_BASEDIR', '/home/sabine/git/whitepaper')
# Directory that holds the whitepaper PDFs to analyse.
WHITEPAPERDIR = os.path.join(BASEDIR, 'whitepaper_pdfs')

## Scraper

## Preprocessing

In [None]:
from whitepaper.preprocessing import Preprocess

In [None]:
# Single Preprocess instance shared by the cells below, which call its
# read(), tokenize() and create_tagged_documents() methods.
processor = Preprocess()

In [None]:
# Build a mapping of company name -> tokenized whitepaper text, one entry per
# PDF found in WHITEPAPERDIR. The company name is the file name without its
# extension.
whitepaper_library = dict()
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the later matrices) stable between runs.
for pdf in sorted(os.listdir(WHITEPAPERDIR)):
    company, extension = os.path.splitext(pdf)
    # Skip stray non-PDF entries (hidden files, checkpoints, sub-directories)
    # instead of handing them to the PDF reader under a truncated name.
    if extension.lower() != '.pdf':
        continue
    # read() yields a two-element result; only the second element (the
    # whitepaper text) is needed here, the first is discarded.
    _, whitepaper = processor.read(os.path.join(WHITEPAPERDIR, pdf))
    whitepaper_library[company] = processor.tokenize(whitepaper)

In [None]:
# Convert the company -> tokens mapping into the document collection that is
# passed to Doc2Vec below.
# NOTE(review): presumably each document is tagged with its company name (the
# later per-company vector lookup relies on that) — confirm in Preprocess.
tagged_documents = processor.create_tagged_documents(whitepaper_library)

## Learn Document Vectors

In [None]:
import gensim

In [None]:
# Train a Doc2Vec model on the tagged whitepapers. The hyperparameters are
# gathered in one mapping so they can be inspected or tuned in a single place.
# NOTE(review): no seed is passed and several workers are used, so the learned
# vectors may differ from run to run — confirm whether that is acceptable.
doc2vec_params = dict(
    dm=1,             # training algorithm selector (1 = distributed memory)
    vector_size=300,  # dimensionality of the learned vectors
    window=5,         # context window size
    min_count=3,      # ignore tokens with a lower total frequency
    workers=4,        # number of training threads
    sample=1e-3,      # down-sampling threshold for frequent tokens
    hs=0,             # hierarchical softmax disabled ...
    negative=5,       # ... negative sampling with 5 noise words instead
    epochs=10,        # passes over the corpus
)
doc2vec_model = gensim.models.doc2vec.Doc2Vec(tagged_documents, **doc2vec_params)

## Compute Similarity

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, euclidean_distances

In [None]:
# Collect the learned document vector of every company, keyed by company name.
#
# The lookup goes to the document-vector store explicitly. Indexing the model
# itself (doc2vec_model[company]) consults the word vocabulary first, so a
# company whose name also occurs as a token in the corpus would silently
# yield its *word* vector instead of its document vector.
# `dv` is the gensim >= 4 attribute name, `docvecs` the gensim 3.x one.
doc_vectors = doc2vec_model.dv if hasattr(doc2vec_model, 'dv') else doc2vec_model.docvecs
# NOTE(review): assumes every document was tagged with its company name (the
# original lookup relied on the same assumption) — confirm in Preprocess.
whitepaper_vectors = {company: doc_vectors[company] for company in whitepaper_library}

In [None]:
# Built from a dict keyed by company, so each company becomes one column.
# The cells below transpose the frame before handing it to the pairwise
# metric functions.
whitepaper_frame = pd.DataFrame(whitepaper_vectors)

### Cosine Similarity

In [None]:
# Pairwise cosine similarity between the company vectors. The metric function
# returns an unlabelled array, so the company names (the columns of
# whitepaper_frame) are re-attached as row and column labels to keep the
# matrix readable.
pd.DataFrame(
    cosine_similarity(whitepaper_frame.transpose()),
    index=whitepaper_frame.columns,
    columns=whitepaper_frame.columns,
)

### Cosine Distance

In [None]:
# Pairwise cosine distance between the company vectors. The metric function
# returns an unlabelled array, so the company names (the columns of
# whitepaper_frame) are re-attached as row and column labels to keep the
# matrix readable.
pd.DataFrame(
    cosine_distances(whitepaper_frame.transpose()),
    index=whitepaper_frame.columns,
    columns=whitepaper_frame.columns,
)

### Euclidean Distances

In [None]:
# Pairwise Euclidean distance between the company vectors. The metric function
# returns an unlabelled array, so the company names (the columns of
# whitepaper_frame) are re-attached as row and column labels to keep the
# matrix readable.
pd.DataFrame(
    euclidean_distances(whitepaper_frame.transpose()),
    index=whitepaper_frame.columns,
    columns=whitepaper_frame.columns,
)