In [1]:
import os
import pdfplumber
from keybert import KeyBERT

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdfs_with_pdfplumber(pdf_folder_path):
    """Extract text from all PDF files in a folder using pdfplumber."""
    documents_with_names = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            with pdfplumber.open(pdf_path) as pdf:
                text = ""
                for page in pdf.pages:
                    text += page.extract_text() or ""  # Ensure empty strings for missing text
                documents_with_names.append((file, text))  # Append as tuple (file_name, text)
    return documents_with_names

# Provide the folder containing your PDFs
pdf_folder_path = "/Users/rohanpersonal/git_projs/echr_freedom_expression_analysis/factsheets/"
documents_with_names = extract_text_from_pdfs_with_pdfplumber(pdf_folder_path)

# Display the text extracted from each document
for file_name, text in documents_with_names:
    print(f"\n{'='*40}\nExtracted Text from: {file_name}\n{'='*40}")
    print(text[:200])  # Print the first 1000 characters for readability
    print(f"... (truncated, total {len(text)} characters)\n{'='*40}")

# Separate the text content for further analysis
documents = [text for _, text in documents_with_names]


Extracted Text from: FS_Access_Internet_ENG.pdf
Factsheet – Access to Internet and
freedom to receive and impart information and ideas
June 2024
This Factsheet does not bind the Court and is not exhaustive
Access to Internet and freedom to receive

... (truncated, total 22642 characters)

Extracted Text from: FS_Journalistic_sources_ENG.pdf
Factsheet – Protection of journalistic sources
January 2024
This Factsheet does not bind the Court and is not exhaustive
Protection of journalistic sources
Article 10 (freedom of expression) of the Eu
... (truncated, total 30688 characters)

Extracted Text from: FS_Hate_speech_ENG.pdf
Factsheet – Hate speech
November 2023
This factsheet does not bind the Court and is not exhaustive
Hate speech
“Freedom of expression constitutes one of the essential foundations of [a democratic]
soc
... (truncated, total 112856 characters)

Extracted Text from: FS_Whistleblowers_ENG.pdf
Factsheet – Whistleblowers and
freedom to impart and to receive information
Sept

In [4]:
from keybert import KeyBERT

# Initialize KeyBERT model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

individual_keywords = []
for i, doc in enumerate(documents):
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 3),  # Allow unigrams to trigrams
        top_n=20,                   # Extract top 20 keyphrases
         stop_words=None
    )
    individual_keywords.append((f"Document {i + 1}", keywords))

# Display results
for doc_name, keywords in individual_keywords:
    print(f"\nKeyphrases for {doc_name}:")
    for phrase, score in keywords:
        print(f"- {phrase} (Score: {score:.2f})")


Keyphrases for Document 1:
- freedom of expression (Score: 0.63)
- rights of internet (Score: 0.59)
- internet and freedom (Score: 0.57)
- article freedom (Score: 0.50)
- of article freedom (Score: 0.49)
- internet users rights (Score: 0.48)
- article freedom of (Score: 0.47)
- internet ahmet yıldırım (Score: 0.47)
- restrictions on rights (Score: 0.46)
- information the internet (Score: 0.45)
- of europe web (Score: 0.45)
- users rights media (Score: 0.44)
- article 10 freedom (Score: 0.44)
- against websites (Score: 0.44)
- access to information (Score: 0.44)
- concerning freedom of (Score: 0.44)
- concerning freedom (Score: 0.44)
- human rights 4factsheet (Score: 0.44)
- expression and information (Score: 0.44)
- websites containing legal (Score: 0.43)

Keyphrases for Document 2:
- journalistic sources protection (Score: 0.74)
- protect journalistic sources (Score: 0.70)
- protection of journalistic (Score: 0.69)
- confidentiality of journalistic (Score: 0.69)
- disclose journalist