In [9]:
import re
import textract
import numpy as np

from pandas import DataFrame

In [10]:
# Path of the source PDF. Note that `text` holds the file name, not the
# document's contents.
text = '200309-sustainable-finance-teg-final-report-taxonomy-annexes_en.pdf'

# Extract raw text from PDF file via textract's pdfminer method.
# The result is treated as bytes further down (it is `.decode()`d before splitting).
raw_text = textract.process(text, method='pdfminer')

In [11]:
# Preview only the first 1000 bytes: echoing the whole extracted document
# floods the notebook output and bloats the saved file.
raw_text[:1000]



In [12]:
# Decode the extracted bytes, break the text at blank lines (one chunk per
# paragraph), then trim the surrounding whitespace from every chunk.
text_split = raw_text.decode().split('\n\n')
stripped = list(map(str.strip, text_split))

In [13]:
# Peek at the first 20 paragraphs instead of echoing the entire list
# (it holds one entry per paragraph of the whole report).
stripped[:20]

['',
 '',
 '',
 'Updated methodology & Updated Technical Screening Criteria',
 '',
 '-  1 -',
 'March 2020',
 '',
 '',
 '',
 'About this report',
 'This document includes an updated Part B: Methodology from the June 2019 report and an updated Part \nF: Full list of technical screening criteria. The other original sections from the June 2019 report can be \nfound as labelled in the June 2019 report.',
 'PART A  Explanation of the Taxonomy approach. This section sets out the role and importance of \nsustainable finance in Europe from a policy and investment perspective, the rationale for \nthe development of an EU Taxonomy, the daft regulation and the mandate of the TEG.',
 'PART B  Methodology. This explains the methodologies for developing technical screening \ncriteria for climate change mitigation objectives, adaptation objectives and ‘do no \nsignificant harm’ to other environmental objectives in the legislative proposal. \nThis has been updated since 2019.',
 'PART C  Taxonomy user

In [14]:
# Build a one-column DataFrame holding one paragraph per row
df = DataFrame({'paragraph': stripped})

In [15]:
# Inspect the paragraph DataFrame (blank paragraphs are still present at this point)
df

Unnamed: 0,paragraph
0,
1,
2,
3,Updated methodology & Updated Technical Screen...
4,
...,...
18920,
18921,
18922,
18923,


In [16]:
# Convert the empty strings to NaN so that we can drop them.
# The results are assigned back rather than using inplace=True: an in-place
# replace on the column selection df['paragraph'] is chained assignment, which
# pandas 2.x warns about and which leaves df unchanged under copy-on-write.
# Assigning back also keeps this cell safe to re-run.
df['paragraph'] = df['paragraph'].replace('', np.nan)
df = df.dropna(subset=['paragraph'])

In [None]:
# Save the cleaned paragraphs to csv (the DataFrame index is written as the
# first column). The file name previously read 'paragrapsh.csv', a typo.
df.to_csv('paragraphs.csv')

#### Vectorise corpus

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Learn the vocabulary and IDF weights from the paragraphs, then encode those
# same paragraphs as a sparse TF-IDF matrix.
vectorizer = TfidfVectorizer().fit(df['paragraph'])
X = vectorizer.transform(df['paragraph'])

In [28]:
# Show the summary of the sparse TF-IDF matrix (one row per paragraph, one column per term)
X

<12937x6808 sparse matrix of type '<class 'numpy.float64'>'
	with 150506 stored elements in Compressed Sparse Row format>

In [29]:
# vectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. The fitted `vocabulary_` mapping (term -> column index) exists in
# every version, so ordering its terms by column index yields the same list.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))



In [31]:
# (number of paragraphs, number of vocabulary terms)
print(X.shape)

(12937, 6808)


In [33]:
from sklearn.metrics.pairwise import linear_kernel

In [34]:
# Pairwise dot products between every pair of paragraph TF-IDF vectors, as a
# dense (n_paragraphs x n_paragraphs) array. Bind the result to a name so later
# cells can reuse it without recomputing or relying on `_` / `Out[...]`.
# NOTE(review): with TfidfVectorizer's default L2 normalisation these values are
# cosine similarities; a 0 on the diagonal then marks a paragraph that produced
# no vocabulary terms — confirm if the vectorizer settings change.
similarities = linear_kernel(X)
similarities

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])