In [1]:
import os
from docx import Document

In [2]:
def extract_text_from_docx_files(docx_folder_path):
    """Extract text from all valid DOCX files in a folder."""
    documents_with_names = []
    for file in os.listdir(docx_folder_path):
        # Exclude temporary or invalid files
        if file.endswith('.docx') and not file.startswith('~$'):
            docx_path = os.path.join(docx_folder_path, file)
            document = Document(docx_path)
            text = "\n".join([paragraph.text for paragraph in document.paragraphs])  # Join all paragraphs
            documents_with_names.append((file, text))  # Append as tuple (file_name, text)
    return documents_with_names

# Provide the folder containing your DOCX files
docx_folder_path = "/Users/rohanpersonal/git_projs/echr_freedom_expression_analysis/word_docs_facts/"
documents_with_names = extract_text_from_docx_files(docx_folder_path)

# Display the text extracted from each document
for file_name, text in documents_with_names:
    print(f"\n{'='*40}\nExtracted Text from: {file_name}\n{'='*40}")
    print(text[:1000])  # Print the first 1000 characters for readability
    print(f"... (truncated, total {len(text)} characters)\n{'='*40}")

# Separate the text content for further analysis
documents = [text for _, text in documents_with_names]



Extracted Text from: hate speech factsheet - facts.docx
Hate speech
Threat to the democratic order
As a rule, the Court will declare inadmissible, on grounds of incompatibility with the values of the Convention, applications which are inspired by totalitarian doctrine or which express ideas that represent a threat to the democratic order and are liable to lead to the restoration of a totalitarian regime.
Racial hate
Glimmerveen and Haqenbeek v. the Netherlands 
11 October 1979 (decision of the European Commission of Human Rights4) 
In this case, the applicants had been convicted for possessing leaflets addressed to “White Dutch people”, which tended to make sure notably that everyone who was not white left the Netherlands.
Negationism and revisionism
Garaudy v. France 
24 June 2003 (decision on the admissibility) 
The applicant, the author of a book entitled The Founding Myths of Modern Israel, was convicted of the offences of disputing the existence of crimes against humanity, defama

In [3]:
from keybert import KeyBERT

# Initialize KeyBERT model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

individual_keywords = []
for file_name, doc in documents_with_names:
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 2),  # Allow unigrams to trigrams
        top_n=35,                      # Extract top 35 keyphrases
        stop_words=None                # Use default stop words or specify your own
    )
    individual_keywords.append((file_name, keywords))  # Use real file names

# Display results
for file_name, keywords in individual_keywords:
    print(f"\nKeyphrases for {file_name}:")
    for phrase, score in keywords:
        print(f"- {phrase} (Score: {score:.2f})")


  from .autonotebook import tqdm as notebook_tqdm



Keyphrases for hate speech factsheet - facts.docx:
- netherlands negationism (Score: 0.54)
- discrimination hatred (Score: 0.50)
- hatred jersild (Score: 0.48)
- condemning the (Score: 0.47)
- inciting discrimination (Score: 0.46)
- discrimination incitement (Score: 0.45)
- condemning (Score: 0.45)
- humanity defamation (Score: 0.45)
- defamation (Score: 0.45)
- inciting hatred (Score: 0.45)
- hatred incitement (Score: 0.45)
- defamation in (Score: 0.45)
- incited hatred (Score: 0.44)
- ethnic hatred (Score: 0.44)
- slander against (Score: 0.43)
- violated zemmour (Score: 0.43)
- discrimination against (Score: 0.43)
- racial hatred (Score: 0.42)
- extremism legislation (Score: 0.42)
- the holocaust (Score: 0.42)
- germans extremism (Score: 0.42)
- hatred among (Score: 0.42)
- extremism on (Score: 0.42)
- vehemently condemning (Score: 0.42)
- disseminating extremist (Score: 0.41)
- provoking enmity (Score: 0.41)
- hatred violence (Score: 0.41)
- national hatred (Score: 0.41)
- were ext

In [7]:
import pke
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk



def extract_keywords_textrank(documents_with_names, top_n=30):
    """Extract keywords using TextRank for each document."""
    keyphrases_by_document = []
    
    # Load stop words for English
    stoplist = stopwords.words('english')
    
    for file_name, doc in documents_with_names:
        # Initialize the TextRank extractor
        extractor = pke.unsupervised.TextRank()
        
        # Load the document content with stop words applied
        extractor.load_document(input=doc, language='en', normalization='lower', stoplist=stoplist)
        
        # Candidate selection: Use words as keyphrase candidates
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})  # Focus on nouns, proper nouns, adjectives
        
        # Weight candidates using the TextRank algorithm
        extractor.candidate_weighting(window=2)
        
        # Extract top N keyphrases
        keyphrases = extractor.get_n_best(n=top_n)
        keyphrases_by_document.append((file_name, keyphrases))
    
    return keyphrases_by_document

# Extract keyphrases
keyphrases_by_document = extract_keywords_textrank(documents_with_names, top_n=30)

# Display results
for file_name, keyphrases in keyphrases_by_document:
    print(f"\nKeyphrases for {file_name}:")
    for phrase, score in keyphrases:
        print(f"- {phrase} (Score: {score:.4f})")



Keyphrases for hate speech factsheet - facts.docx:
- political party front national (Score: 0.0185)
- national courts (Score: 0.0132)
- youth human rights group v. russia10 (Score: 0.0128)
- daily newspaper ülkede özgür gündem (Score: 0.0128)
- left-wing basque separatist parliamentary group (Score: 0.0128)
- terrorist group action directe (Score: 0.0125)
- british national party (Score: 0.0122)
- national court decisions (Score: 0.0121)
- daily newspaper le monde (Score: 0.0118)
- le monde daily newspaper (Score: 0.0118)
- basque daily newspaper (Score: 0.0116)
- basque weekly newspaper (Score: 0.0112)
- online hate speech (Score: 0.0108)
- popular online social network (Score: 0.0106)
- national association (Score: 0.0105)
- bilingual turkish-armenian weekly newspaper (Score: 0.0105)
- online news articles (Score: 0.0105)
- well-known turkish muslim theologian (Score: 0.0104)
- public speech (Score: 0.0102)
- specific ethnic group (Score: 0.0102)
- national front (Score: 0.0102)
- i