In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from bertopic import BERTopic
import nltk
import nltk.corpus
from gensim.models import Word2Vec
import os
from wordcloud import WordCloud
import collections
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

In [10]:
# get words from the text corpus

from nltk.corpus import PlaintextCorpusReader
wordlists = PlaintextCorpusReader('da/processed', '.*')

da_words = []

for file in wordlists.fileids():
        da_words.append(wordlists.words(file))

In [11]:
# get the rosetta data

data_raw = pd.read_excel("rosetta_data_inputs_2023.xls")



In [12]:
# Get main columns and filter rows without headings
data = data_raw[["Overskrift", "Beskrivelse"]]
data = data[data["Overskrift"].notna()]

In [13]:
# remove danish words and prep data for use

def filter_danish(text):
       return " ".join(w.lower() for w in nltk.wordpunct_tokenize(text) 
                       if w.casefold() not in da_words and w.isalpha() and w != 'nan')

# combine heading and description
title_strings = data['Overskrift'].astype('str')
description_strings = data['Beskrivelse'].astype('str')
docs = title_strings + " \n\n" + description_strings

docs = docs.apply(filter_danish)

In [14]:
# tokenize text, keep short tokens and stopwords

def tokenize_text(text, tokenizer):
    tokens = tokenizer(text)  # Get tokens from text
    return tokens

tokens = docs.map(lambda x: tokenize_text(x, nltk.word_tokenize))

### BERT Topic Model

In [20]:
## BERT Topic Model

from sklearn.feature_extraction.text import CountVectorizer

# Remove stopwords after documents are assigned to topics
# Allows the transformer model to get the full context of the 
# data, while removing stopwords that are noise in the topics
vectorizer_model = CountVectorizer(stop_words="english")

# Define and train the model
topic_model = BERTopic(nr_topics=20, vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info() 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1266,-1_use_cup_water_make,"[use, cup, water, make, using, surface, robot,...",[rubberband making a flexible cup that can exp...
1,0,531,0_plastic_cups_glands_packaging,"[plastic, cups, glands, packaging, people, rec...",[user designed cups people dont throw ther cup...
2,1,529,1_heat_pumice_oven_wind,"[heat, pumice, oven, wind, pizza, energy, wast...",[use waste heat for desalination of water wast...
3,2,458,2_window_windows_weeds_glass,"[window, windows, weeds, glass, old, plants, f...",[big window made of old smaller windows the bi...
4,3,242,3_tank_random_tires_tanks,"[tank, random, tires, tanks, flywheel, terrain...",[flywheels on tank in theory it s possible to ...
5,4,241,4_detect_tobacco_urine_smell,"[detect, tobacco, urine, smell, sensors, analy...",[in car cleaning gas chamber targeting the iss...
6,5,225,5_esg_employees_employee_sustainability,"[esg, employees, employee, sustainability, fas...",[physical event planner platform pepp a platfo...
7,6,212,6_berries_potatoes_cutting_pulp,"[berries, potatoes, cutting, pulp, potato, sta...",[take berries home instead of cutting strawber...
8,7,208,7_fish_chickens_feed_dead,"[fish, chickens, feed, dead, chicken, water, f...",[ask chatgpt if fish should be fed using image...
9,8,199,8_board_gypsum_boards_wall,"[board, gypsum, boards, wall, holes, screws, m...",[magnet loosens the gypsum board a magnet make...


In [21]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed