# OvP project

## Imports and data loading

In [1]:
# importing libraries
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import yake
# import rake
# from rake import RAKE
from rake_nltk import Rake
from keybert import KeyBERT
from langdetect import DetectorFactory
from langdetect import detect
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import spacy
import keyword_spacy
from keyword_spacy import KeywordExtractor

# import openai
# from keybert.llm import OpenAI
# from keybert import KeyLLM

# nltk.download('stopwords')
# nltk.download('punkt_tab')
# nltk.download('punkt')

In [2]:
# Read the raw course export from the Excel workbook and preview its first two rows.
data = pd.read_excel(io="raw_data.xlsx")

print(data.head(n=2))

        CURSUS             LANGE_NAAM_NL  \
0   ASB-ACT-EN  ACT - Leef zoals je wilt   
1  ASB-ACTLEEF  ACT - Leef zoals je wilt   

                                          DOCENT_ROL  \
0  Brasjen DOCENT; Bruinsma DOCENT; Buiks DOCENT;...   
1  Brasjen DOCENT; Bruinsma DOCENT; Buiks DOCENT;...   

  OSS_ADF_UTILITY.HTML_TO_TEXT(H.INHOUD)  
0                                    NaN  
1                                    NaN  


### Exploring data

In [3]:

# Show the available columns, then pull the DOCENT_ROL column out as a plain
# Python list and peek at its first five entries.
print(data.columns)
teacher_list_raw = data['DOCENT_ROL'].to_list()
print(teacher_list_raw[0:5])


Index(['CURSUS', 'LANGE_NAAM_NL', 'DOCENT_ROL',
       'OSS_ADF_UTILITY.HTML_TO_TEXT(H.INHOUD)'],
      dtype='object')
['Brasjen DOCENT; Bruinsma DOCENT; Buiks DOCENT; Burgh DOCENT; Galip√≤ DOCENT; Gronden DOCENT; Kaathoven CONTACTPERSOON; Verhagen DOCENT', 'Brasjen DOCENT; Bruinsma DOCENT; Buiks DOCENT; Burgh DOCENT; Galip√≤ DOCENT; Gronden DOCENT; Kaathoven CONTACTPERSOON; Verhagen DOCENT', 'Bertens DOCENT; Brasjen DOCENT; Galip√≤ DOCENT', 'Baars DOCENT; Brasjen DOCENT; Breemen DOCENT; Bruinsma DOCENT; Buiks DOCENT; Burgh DOCENT; Galip√≤ DOCENT; Gronden DOCENT; Kaathoven CONTACTPERSOON; Schrijen DOCENT; Spreeuwenberg DOCENT; Verhagen DOCENT; Winters DOCENT', 'Baars DOCENT; Brasjen DOCENT; Breemen DOCENT; Bruinsma DOCENT; Buiks DOCENT; Burgh DOCENT; Galip√≤ DOCENT; Gronden DOCENT; Schrijen DOCENT; Spanbroek CONTACTPERSOON; Spreeuwenberg DOCENT; Verhagen DOCENT']


In [4]:
# Flatten the ';'-separated entries of teacher_list_raw into one flat list of
# whitespace-trimmed items (duplicates are kept).
teacher_list = [
    item.strip()
    for entry in teacher_list_raw
    if isinstance(entry, str)   # a missing Excel cell arrives as float NaN and has no .split
    for item in entry.split(';')
    if item.strip()             # drop empty fragments, e.g. after a trailing ';'
]
print(teacher_list[:10])

['Brasjen DOCENT', 'Bruinsma DOCENT', 'Buiks DOCENT', 'Burgh DOCENT', 'Galip√≤ DOCENT', 'Gronden DOCENT', 'Kaathoven CONTACTPERSOON', 'Verhagen DOCENT', 'Brasjen DOCENT', 'Bruinsma DOCENT']


### Choose keyword model by example

In [5]:
# NLTK stop-word lists keyed by language code. English is registered under
# 'eng' (the default `lang` of the extractor functions in this notebook) and
# under 'en' (the code langdetect's detect() prints for the English sample),
# so a detected language can be used as a key directly.
_english_stop_words = stopwords.words('english')
stop_words_dict = {
    'eng': _english_stop_words,
    'en': _english_stop_words,
    'nl': stopwords.words('dutch'),
}

In [6]:
example_text_nl = "Deze cursus gaat over de visuele cultuur van de Nederlandse zeventiende-eeuwse Republiek. De ongekende commerci√´le groei van de Verenigde Nederlanden kwam voort uit expansiedrang en kolonialisme, wat onder meer resulteerde in een bloeiende kunst- en rariteitenhandel en een explosie van artistieke creativiteit. Deze cursus onderzoekt de impact van de steeds groter wordende wereld op de beeldende kunsten. Welke rol speelden kunstwerken en andere exotische kunstobjecten die in de huiselijke sfeer werden verzameld en gekoesterd? Wat werd er afgebeeld, waarom en voor wie? Wat onthullen schilderijen over de houding van de Nederlanders ten opzichte van de koloni√´n, van andere Europese landen en van zichzelf? Wat vertellen rariteitenverzamelingen ons over de verhouding tussen het zelf en de wereld, het vertrouwde en het vreemde, het lokale en het internationale? Maar ook zal er stil worden gestaan bij ontdekkingen die op wetenschappelijk gebied worden gedaan en hoe deze al vlug hun weg weten te vinden in de beeldende kunst. [NB: de cursusinhoud is onder voorbehoud en kan wijzigen]"


example_text_en = "Content What the course is about: This course is centered around the data analysis pipelines typically found in brain-computer interfaces: the processing of multi-channel recordings of neuronal signals, the extraction of informative features (‚Äúneural markers‚Äù) from these signals, and the decoding of brain states by regression or classification models. As informative features are subject-dependent, and as neuronal signals typically show large dimensionality, high noise, and low signal amplitudes, machine learning methods play an important role in all steps of the pipeline. Thus we have to assume from course participants three important skills: - sound mathematical background to understand the statistical methods and machine learning algorithms dealt with in the course (mainly probability theory and linear algebra). Please expect, that this will go into the math foundations of methods and not stay superficial. - familiarity with machine learning concepts, data handling, and model evaluation strategies. - the ability to implement these in Python. Word of caution: please note that if a participant lacks these assumed skills / background, the successful completion of the course may be very difficult and exceed the expected workload for a 6 ECTS course substantially.  What the course is not about: The course will be able to cover and train the ability to perform offline analyses. We try to include hands-on experience with an online / closed-loop BCI system. How feasible this is will depend on the size of the course. The course covers a selection of data processing methods relevant for the most widely used BCI applications. However, the course can neither cover the full spectrum of neurotechnological applications nor the large number of different processing algorithms proposed in the literature. 
To cover the curriculum, familiarity with mathematical concepts like dot products, matrix/vector calculations, matrix inversion, covariance matrix, eigenvalue decomposition, Bayes theorem, basic probability theory, etc. needs to be assumed and can not be taught in this course. The course will also assume, that students are familiar with basic signal processing methods taught in BKI316 (Fourier transform, Hilbert transform, filtering of time series data in the time- and frequency domain) and their mathematical background. The course will focus on processing methods applicable for invasive mesoscopic (local field potentials ‚ÄúLFP‚Äù, electrocorticogram ‚ÄúECoG‚Äù) and non-invasive macroscopic (electroencephalogram ‚ÄúEEG‚Äù, magnetoencephalogram ‚ÄúMEG‚Äù) electromagnetic signals, while the processing of action potentials / single-unit activity and the analysis of functional MRI and functional near-infrared signal data cannot be covered in this course. While neurotechnological systems may combine neural recordings with signals of non-neural origin (electromyogram ‚ÄúEMG‚Äù, body tracking, behavioral performance recordings by accelerometers, etc.), the course will have to focus on neural signals. While developing a successful BCI / neurotechnological system is a highly interdisciplinary endeavor that profits from the collaboration of neuroscientists, psychologists, signal processing- and artificial intelligence specialists, this course will put the most emphasis on the latter two aspects."





# langdetect's algorithm is non-deterministic unless seeded; fix the seed so a
# Restart & Run All always yields the same language codes.
DetectorFactory.seed = 0

# Detect the language of the Dutch sample text.
lang_nl = detect(example_text_nl)
print(f"Language: {lang_nl}")

# Detect the language of the English sample text.
lang_en = detect(example_text_en)
print(f"Language: {lang_en}")

Language: nl
Language: en


In [7]:
# yake function
def yake_extract_keywords(text, lang='eng', top=15, max_ngram=3):
    """Return the keyphrases YAKE extracts from ``text``, without their scores.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from.
    lang : str, default 'eng'
        Language code handed to YAKE as ``lan``.
        NOTE(review): YAKE documents two-letter codes such as 'en' / 'nl';
        confirm that the 'eng' default selects the English stop words.
    top : int, default 15
        Number of keyphrases to request from YAKE.
    max_ngram : int, default 3
        Maximum number of words per keyphrase.

    Returns
    -------
    list of str
        The extracted keyphrases, in the order YAKE returns them.
    """
    kw_extractor = yake.KeywordExtractor(
        lan=lang,
        n=max_ngram,        # max n-gram size (try 2-3)
        dedupLim=0.8,       # make deduplication stricter
        dedupFunc='levs',   # try 'levs' or 'jaro' instead of default 'seqm'
        windowsSize=2,      # context window size
        top=top,            # how many keywords you want
    )
    # extract_keywords yields (keyphrase, score) pairs; only the phrase is kept.
    return [phrase for phrase, _score in kw_extractor.extract_keywords(text)]

# Run YAKE on the Dutch and then the English sample, each with its detected language code.
for _sample_text, _sample_lang in ((example_text_nl, lang_nl), (example_text_en, lang_en)):
    yake_kw = yake_extract_keywords(_sample_text, _sample_lang)
    print("yake: ", yake_kw)

yake:  ['Nederlandse zeventiende-eeuwse Republiek', 'zeventiende-eeuwse Republiek', 'Nederlandse zeventiende-eeuwse', 'Verenigde Nederlanden kwam', 'visuele cultuur', 'cursus gaat', 'Verenigde Nederlanden', 'ongekende commerci√´le groei', 'Republiek', 'Nederlandse', 'Nederlanden kwam', 'cursus onderzoekt', 'groter wordende wereld', 'cursus', 'expansiedrang en kolonialisme']
yake:  ['pipelines typically found', 'neuronal signals typically', 'informative features', 'machine learning', 'brain-computer interfaces', 'found in brain-computer', 'decoding of brain', 'brain states', 'states by regression', 'regression or classification', 'machine learning methods', 'analysis pipelines typically', 'processing methods', 'neuronal signals', 'typically found']


In [8]:
def rake_extract_keywords(text, lang='eng'):
    """Return RAKE's ranked keyphrases for ``text``.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from.
    lang : str, default 'eng'
        Key into ``stop_words_dict`` selecting the stop-word list. The code
        'en' (as produced by langdetect) is accepted as an alias for 'eng'.

    Returns
    -------
    list of str
        Phrases as returned by ``Rake.get_ranked_phrases()``.

    Raises
    ------
    KeyError
        If ``lang`` has no entry in ``stop_words_dict``.
    """
    # langdetect reports English as 'en'; fall back to the 'eng' key when the
    # dictionary does not know 'en', instead of failing with a KeyError.
    if lang == 'en' and lang not in stop_words_dict:
        lang = 'eng'

    rake = Rake(stopwords=stop_words_dict[lang])
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

rake_kw = rake_extract_keywords(example_text_nl, lang_nl)
print("rake: ", rake_kw)
# The detected code can now be passed for English as well.
rake_kw = rake_extract_keywords(example_text_en, lang_en)
print("rake: ", rake_kw)

rake:  ['welke rol speelden kunstwerken', 'verenigde nederlanden kwam voort', 'huiselijke sfeer werden verzameld', 'steeds groter wordende wereld', 'ongekende commerci√´le groei', 'nederlanders ten opzichte', 'wetenschappelijk gebied', 'weg weten', 'visuele cultuur', 'vertellen rariteitenverzamelingen', 'verhouding tussen', 'onthullen schilderijen', 'nederlandse zeventiende', 'exotische kunstobjecten', 'europese landen', 'eeuwse republiek', 'cursus onderzoekt', 'cursus gaat', 'bloeiende kunst', 'beeldende kunsten', 'beeldende kunst', 'artistieke creativiteit', 'wereld', 'zichzelf', 'wijzigen', 'waarom', 'vreemde', 'voorbehoud', 'vlug', 'vinden', 'vertrouwde', 'stil', 'resulteerde', 'rariteitenhandel', 'ontdekkingen', 'nb', 'lokale', 'koloni√´n', 'kolonialisme', 'internationale', 'impact', 'houding', 'gestaan', 'gekoesterd', 'gedaan', 'explosie', 'expansiedrang', 'cursusinhoud', 'afgebeeld']
rake:  ['local field potentials ‚Äú lfp ‚Äù, electrocorticogram ‚Äú ecog ‚Äù)', 'electroencephal

In [9]:
def keybert_extract_keywords(text, lang='eng'):
    """Extract up to 20 keyphrases (1-4 words) from ``text`` with KeyBERT.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from.
    lang : str, default 'eng'
        Key into ``stop_words_dict`` selecting the stop-word list. The code
        'en' (as produced by langdetect) is accepted as an alias for 'eng'.

    Returns
    -------
    list
        Whatever ``KeyBERT.extract_keywords`` returns; in this notebook's
        output that is a list of ``(keyphrase, score)`` tuples.

    Raises
    ------
    KeyError
        If ``lang`` has no entry in ``stop_words_dict``.
    """
    # langdetect reports English as 'en'; fall back to the 'eng' key when the
    # dictionary does not know 'en', instead of failing with a KeyError.
    if lang == 'en' and lang not in stop_words_dict:
        lang = 'eng'

    # The embedding model is constructed on every call.
    kw_model = KeyBERT(model="paraphrase-multilingual-mpnet-base-v2")
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 4),
        stop_words=stop_words_dict[lang],
        # NOTE(review): use_maxsum and use_mmr are both enabled; presumably
        # only one of the two diversification strategies takes effect - confirm
        # against the KeyBERT documentation and drop the redundant flag.
        use_maxsum=True,
        top_n=20,
        use_mmr=True,
        diversity=0.9,
        highlight=True,
    )
    return keywords

keybert_kw = keybert_extract_keywords(example_text_nl, lang_nl)
print("keybert: ", keybert_kw)
# The detected code can now be passed for English as well.
keybert_kw = keybert_extract_keywords(example_text_en, lang_en)
print("keybert: ", keybert_kw)

keybert:  [('gaat visuele cultuur nederlandse', 0.8119), ('creativiteit cursus onderzoekt', 0.5374), ('expansiedrang kolonialisme resulteerde', 0.521), ('cursus onderzoekt impact', 0.3888), ('ongekende commerci√´le', 0.3813), ('gestaan ontdekkingen wetenschappelijk gebied', 0.3742), ('wereld', 0.3598), ('rariteitenverzamelingen verhouding tussen', 0.3348), ('nb cursusinhoud voorbehoud', 0.3217), ('waarom', 0.3204), ('zeventiende', 0.3129), ('republiek', 0.3059), ('rol speelden', 0.2658), ('huiselijke', 0.2581), ('groter', 0.2424), ('gebied gedaan vlug weg', 0.2353), ('commerci√´le groei verenigde', 0.2231), ('gestaan', 0.2215), ('vertrouwde vreemde', 0.1167), ('explosie', 0.1118)]


keybert:  [('content course course', 0.6805), ('neural origin', 0.3638), ('computer interfaces processing', 0.3511), ('multi channel', 0.2537), ('covers selection data', 0.2284), ('body tracking behavioral performance', 0.2245), ('noise low', 0.2104), ('pipelines typically found', 0.1914), ('mainly probability theory linear', 0.1606), ('steps', 0.1495), ('local', 0.135), ('fourier transform hilbert', 0.1322), ('python word', 0.1141), ('large number', 0.0916), ('stay', 0.0877), ('may', 0.079), ('interdisciplinary endeavor profits collaboration', 0.0723), ('invasive macroscopic', 0.0713), ('online closed', 0.06), ('infrared signal data cannot', 0.0051)]


In [10]:
# chatGPT generated

def extract_keywords_advanced(text, top_n=20, lang='eng'):
    """Collect multi-word keyphrases from ``text`` sentence by sentence with KeyBERT.

    The text is split with ``sent_tokenize``; for every sentence up to five
    MMR-diversified keyphrases of 1-3 words are requested. Each phrase is then
    lower-cased and stripped of every character that is not a-z, a Latin-1
    accented letter (U+00E0-U+00FF), a digit, whitespace or '-'. Only phrases
    of two or more words are kept, duplicates are dropped (first occurrence
    wins) and the result is cut off after ``top_n`` phrases.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from.
    top_n : int, default 20
        Maximum number of phrases in the result.
    lang : str, default 'eng'
        Key into ``stop_words_dict`` selecting the stop-word list. The code
        'en' (as produced by langdetect) is accepted as an alias for 'eng'.

    Returns
    -------
    dict
        ``{"language_detected": lang, "keywords": [...]}``. Note that
        ``language_detected`` simply echoes the ``lang`` argument; no
        detection happens inside this function.

    Raises
    ------
    KeyError
        If ``lang`` has no entry in ``stop_words_dict``.
    """
    # langdetect reports English as 'en'; fall back to the 'eng' key when the
    # dictionary does not know 'en', instead of failing with a KeyError.
    stop_key = lang
    if stop_key == 'en' and stop_key not in stop_words_dict:
        stop_key = 'eng'
    stop_words = stop_words_dict[stop_key]

    # Multilingual model (handles both Dutch and English).
    kw_model = KeyBERT(model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

    # Extract keyphrases per sentence and pool them in document order.
    all_keywords = []
    for sent in sent_tokenize(text):
        sentence_keywords = kw_model.extract_keywords(
            sent,
            keyphrase_ngram_range=(1, 3),
            stop_words=stop_words,
            use_mmr=True,
            diversity=0.7,
            top_n=5,
        )
        all_keywords.extend(phrase for phrase, _score in sentence_keywords)

    # Clean up and keep unique phrases. The accented range is written with
    # \u escapes so the pattern cannot be corrupted by the file's encoding.
    disallowed = re.compile(r'[^a-z\u00e0-\u00ff0-9\s\-]')
    cleaned = []
    seen = set()
    for phrase in all_keywords:
        phrase = disallowed.sub('', phrase.lower().strip())
        # Prefer multi-word phrases over single words.
        if len(phrase.split()) > 1 and phrase not in seen:
            seen.add(phrase)
            cleaned.append(phrase)

    return {
        "language_detected": lang,
        "keywords": cleaned[:top_n],
    }

# -------------------------------------------------
# Test: run the sentence-wise extractor on both samples
# -------------------------------------------------

# The Dutch sample passes its detected language; the English sample relies on
# the function's default language. The emoji in the messages were mis-encoded
# and are restored here.
for _sample_text, _lang_kwargs in ((example_text_nl, {"lang": lang_nl}), (example_text_en, {})):
    results = extract_keywords_advanced(_sample_text, top_n=20, **_lang_kwargs)

    print(f"🗣️ Detected language: {results['language_detected']}")
    print("🎯 Top keywords:")
    for kw in results["keywords"]:
        print("-", kw)


üó£Ô∏è Detected language: nl
üéØ Top keywords:
- visuele cultuur nederlandse
- zeventiende eeuwse republiek
- cursus gaat
- kolonialisme resulteerde bloeiende
- rariteitenhandel explosie
- artistieke creativiteit
- commerci√´le groei verenigde
- wereld beeldende kunsten
- cursus onderzoekt impact
- wordende wereld
- steeds groter wordende
- exotische kunstobjecten huiselijke
- rol speelden kunstwerken
- huiselijke sfeer werden
- werden verzameld
- welke rol
- afgebeeld waarom
- schilderijen houding nederlanders
- onthullen schilderijen
- europese landen
üó£Ô∏è Detected language: eng
üéØ Top keywords:
- content course course
- around data analysis
- neuronal signals extraction
- computer interfaces processing
- pipelines typically
- machine learning methods
- neuronal signals typically
- dimensionality high noise
- important role
- skills sound mathematical
- assume course participants
- machine learning
- probability theory linear
- math foundations
- go math
- stay superficial
- p

In [21]:
def spacy_extract_keywords(text, lang='en'):
    """Extract keywords from ``text`` with spaCy and the keyword_spacy pipe.

    Parameters
    ----------
    text : str
        Document to extract keywords from.
    lang : str, default 'en'
        'en' or 'eng' loads ``en_core_web_md``; any other value loads
        ``nl_core_news_md``.

    Returns
    -------
    list
        The value of ``doc._.keywords``; in this notebook's output each item
        is a ``(keyword, count, score)``-shaped tuple.
        NOTE(review): confirm the tuple layout against keyword_spacy.
    """
    # Accept both 'en' (langdetect) and 'eng' (the default of the other
    # extractors in this notebook); previously 'eng' silently selected the
    # Dutch model.
    model_name = "en_core_web_md" if lang in ('en', 'eng') else "nl_core_news_md"
    kw_model = spacy.load(model_name)

    # The component is added by its registered name with an explicit config;
    # the separately constructed (and never used) KeywordExtractor instance
    # has been removed.
    kw_model.add_pipe(
        "keyword_extractor",
        last=True,
        config={"top_n": 20, "min_ngram": 1, "max_ngram": 5, "strict": True, "top_n_sent": 3},
    )
    doc = kw_model(text)
    return doc._.keywords

spacy_kw = spacy_extract_keywords(example_text_nl, lang_nl)
print("spacy: ", spacy_kw)

spacy_kw = spacy_extract_keywords(example_text_en, lang_en)
print("spacy: ", spacy_kw)


spacy:  [('Nederlandse zeventiende-eeuwse', 1, np.float32(0.6647287)), ('Nederlandse zeventiende-eeuwse Republiek', 1, np.float32(0.645012)), ('visuele cultuur', 1, np.float32(0.62995076)), ('ongekende commerci√´le', 1, np.float32(0.74898523)), ('ongekende', 1, np.float32(0.7087786)), ('ongekende commerci√´le groei', 1, np.float32(0.696306)), ('wordende', 1, np.float32(0.6458637)), ('groter wordende wereld', 1, np.float32(0.6398926)), ('groter wordende', 1, np.float32(0.6291202)), ('exotische kunstobjecten', 1, np.float32(0.69749194)), ('huiselijke', 1, np.float32(0.6860458)), ('rol speelden kunstwerken', 1, np.float32(0.6110443)), ('afgebeeld', 1, np.float32(0.2851031)), ('Europese landen', 1, np.float32(0.53402615)), ('Europese', 1, np.float32(0.51365644)), ('Nederlanders', 1, np.float32(0.49109957)), ('verhouding', 1, np.float32(0.53279394)), ('internationale', 1, np.float32(0.48925865)), ('wereld', 1, np.float32(0.48640022)), ('ontdekkingen', 1, np.float32(0.56499314))]
spacy:  [('

## Get keywords per topic


In [None]:
# Number of courses (with a description) to process; None processes all of
# them. Kept at 1 to match the original debug behaviour of stopping after the
# first course.
MAX_COURSES = 1

# Build the model once: constructing KeyBERT inside the loop would reload the
# embedding model for every course.
kw_model = KeyBERT()

course_dict = {}

# Each row must hold exactly four columns: code, name, teachers, description.
for course_code, course_name, teachers, description in data.itertuples(index=False, name=None):
    if pd.isna(description):
        continue

    # "NAME ROLE; NAME ROLE; ..." -> unique names without the trailing role
    # token. A missing cell (NaN) yields no teachers; sorting makes the order
    # reproducible across runs.
    teacher_items = teachers.split(";") if isinstance(teachers, str) else []
    teacher_names = sorted(
        {' '.join(item.split()[:-1]) for item in teacher_items} - {''}
    )

    # using yake, keywords not great
    # TODO: the description language is not detected here, so the function's
    # default language and the English stop words below are applied to every
    # course, including Dutch ones.
    keywords_yake = yake_extract_keywords(description)

    # use keybert, slightly better but not perfect yet
    keywords_bert = kw_model.extract_keywords(
        description,
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=20,
        keyphrase_ngram_range=(1, 3),
        use_mmr=True,
        diversity=0.7,
    )

    # Key by the course code itself rather than positional row[0], which is
    # deprecated on a label-indexed Series.
    course_dict[course_code] = {
        "course_code": course_code,
        "course_name": course_name,
        "teachers": teacher_names,
        "keywords_yake": keywords_yake,
        "keywords_bert": keywords_bert,
    }

    if MAX_COURSES is not None and len(course_dict) >= MAX_COURSES:
        break


print("--------------------------------")
print(course_dict)

### Try using LLM API