In [36]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn as sk
from bertopic import BERTopic
import nltk
import nltk.corpus
from gensim.models import Word2Vec
import os

In [38]:
# get words from the text corpus

from nltk.corpus import PlaintextCorpusReader
wordlists = PlaintextCorpusReader('da/processed', '.*')

da_words = []

for file in wordlists.fileids():
        da_words.append(wordlists.words(file))

In [39]:
# get the rosetta data

data_raw = pd.read_excel("rosetta_data_inputs_2023.xls")



In [40]:
# Get main columns and filter rows without headings
data = data_raw[["Overskrift", "Beskrivelse"]]
data = data[data['Overskrift'].notna()]

In [45]:
# remove danish words and prep data for use

def only_english(text):
       return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() not in da_words and w.isalpha())

# combine heading and description
title_strings = data['Overskrift'].astype('str')
description_strings = data['Beskrivelse'].astype('str')
docs = title_strings + " \n\n" + description_strings

docs.apply(only_english)

1       Combi material design Use a combination of mat...
2       Sterilization with UV light UVC light can be u...
3       seperation by oscillation Protein powder is in...
4       high pressure expose the product to high enoug...
5       Filtration push the product through a membrane...
                              ...                        
5487    Weather resistance inspired by arctic animals ...
5488    Multiple options for power generation To provi...
5489    Adaptable To increase the maneuverability of t...
5490    Weather balloon for communication relay Satell...
5491    Adaptable sensor image frequency and resolutio...
Length: 5339, dtype: object

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

# Remove stopwords after documents are assigned to topics
# Allows the transformer model to get the full context of the 
# data, while removing stopwords that are noise in the topics
vectorizer_model = CountVectorizer(stop_words="english")

# Define and train the model
topic_model = BERTopic(nr_topics=20, vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(docs)

In [47]:
topic_model.get_topic_info()  

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1103,-1_use_water_nan_temperature,"[use, water, nan, temperature, heat, using, en...",[Control the water temperature \n\nHeating/coo...
1,0,927,0_nan_robot_blinds_ground,"[nan, robot, blinds, ground, robots, explosive...","[Use micro-robots \n\nnan, people \n\nnan, Win..."
2,1,685,1_weeds_toilet_protein_detect,"[weeds, toilet, protein, detect, use, kill, po...",[Piezoelectric sensors \n\nPiezoelectric senso...
3,2,495,2_cups_people_plastic_glands,"[cups, people, plastic, glands, cup, packaging...",[A triangle shaped cup made of grassseeds pres...
4,3,373,3_tank_fish_tanks_chickens,"[tank, fish, tanks, chickens, random, tires, m...","[flywheels on tank \n\nIn theory, it's possibl..."
5,4,345,4_heat_exchanger_wind_pumice,"[heat, exchanger, wind, pumice, turbine, turbi...",[Monitor conditions \n\nMonitor conditions to...
6,5,198,5_pizza_oven_buns_bread,"[pizza, oven, buns, bread, rolls, random, diff...",[Big pizza oven \n\nMake the pizza oven bigger...
7,6,174,6_berries_pulp_potatoes_starch,"[berries, pulp, potatoes, starch, potato, prod...",[Take berries home \n\nInstead of cutting stra...
8,7,160,7_chair_wheels_table_elderly,"[chair, wheels, table, elderly, chairs, citize...",[Assumption Future (No care workers) - Fully a...
9,8,153,8_esg_employees_employee_sustainability,"[esg, employees, employee, sustainability, ini...",[Physical Event Planner Platform (PEPP) \n\nA ...


In [48]:
# set up Word2Vec embeddings

w2v = Word2Vec(docs, window=5, workers=4, min_count=3)

In [49]:
# do PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_result = pca.fit_transform(w2v.wv.vectors)

In [50]:
import seaborn as sns

sns.scatterplot(data=pca_result, x=0, y=1)
plt.title("Transformed data")
plt.xlabel("First Eigenvector")
plt.ylabel("Second Eigenvector")
plt.show()

TypeError: Data source must be a DataFrame or Mapping, not <class 'numpy.ndarray'>.