In [1]:
import pandas as pd
from ast import literal_eval
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import spacy

# Load Datasets

We load the preprocessed datasets. Make sure to run the previous notebooks first, so the files are present.

## media articles

In [2]:
df_media = pd.read_csv('../data/cleantech-media.csv', converters={"token_content": literal_eval}, parse_dates=['date'])
df_media.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9588 entries, 0 to 9587
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   title          9588 non-null   object        
 1   date           9588 non-null   datetime64[ns]
 2   author         31 non-null     object        
 3   content        9588 non-null   object        
 4   domain         9588 non-null   object        
 5   url            9588 non-null   object        
 6   token_content  9588 non-null   object        
dtypes: datetime64[ns](1), object(6)
memory usage: 524.5+ KB


In [3]:
df_media.head(5)

Unnamed: 0,title,date,author,content,domain,url,token_content
0,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,Qatar Petroleum ( QP) is targeting aggressive ...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,"[qatar, petroleum, qp, target, aggress, cut, g..."
1,India Launches Its First 700 MW PHWR,2021-01-15,,Nuclear Power Corp. of India Ltd. ( NPCIL) syn...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,"[nuclear, power, corp, india, ltd, npcil, sync..."
2,New Chapter for US-China Energy Trade,2021-01-20,,New US President Joe Biden took office this we...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,"[new, us, presid, joe, biden, take, offic, wee..."
3,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,The slow pace of Japanese reactor restarts con...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,"[slow, pace, japanes, reactor, restart, contin..."
4,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,Two of New York City's largest pension funds s...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,"[two, new, york, citi, larg, pension, fund, sa..."


## patents

In [4]:
df_patents = pd.read_csv('../data/google_patents.csv', converters={"token_content": literal_eval}, parse_dates=['publication_date'])
df_patents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13297 entries, 0 to 13296
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   publication_number  13297 non-null  object        
 1   application_number  13297 non-null  object        
 2   country_code        13297 non-null  object        
 3   publication_date    13297 non-null  datetime64[ns]
 4   inventor            13297 non-null  object        
 5   title               13297 non-null  object        
 6   title_lang          13297 non-null  object        
 7   abstract            13297 non-null  object        
 8   abstract_lang       13297 non-null  object        
 9   token_content       13297 non-null  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 1.0+ MB


In [5]:
# TODO: remove this as soon as translated
df_patents = df_patents.loc[df_patents.abstract_lang == 'en', :]
df_patents.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6680 entries, 0 to 13295
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   publication_number  6680 non-null   object        
 1   application_number  6680 non-null   object        
 2   country_code        6680 non-null   object        
 3   publication_date    6680 non-null   datetime64[ns]
 4   inventor            6680 non-null   object        
 5   title               6680 non-null   object        
 6   title_lang          6680 non-null   object        
 7   abstract            6680 non-null   object        
 8   abstract_lang       6680 non-null   object        
 9   token_content       6680 non-null   object        
dtypes: datetime64[ns](1), object(9)
memory usage: 574.1+ KB


In [6]:
df_patents.head()

Unnamed: 0,publication_number,application_number,country_code,publication_date,inventor,title,title_lang,abstract,abstract_lang,token_content
0,US-2022239235-A1,US-202217717397-A,US,2022-07-28,[],Adaptable DC-AC Inverter Drive System and Oper...,en,Disclosed is an adaptable DC-AC inverter syste...,en,"[disclos, adapt, dc, ac, invert, system, oper,..."
1,US-2022239251-A1,US-202217580956-A,US,2022-07-28,[],System for providing the energy from a single ...,en,"In accordance with an example embodiment, a so...",en,"[accord, exampl, embodi, solar, energi, system..."
3,US-11396827-B2,US-202117606042-A,US,2022-07-26,[],Control method for optimizing solar-to-power e...,en,A control method for optimizing a solar-to-pow...,en,"[control, method, optim, solar, power, effici,..."
6,CN-114777546-A,CN-202210702520-A,CN,2022-07-22,[],Cold and hot medium energy storage hot water a...,en,The invention relates to the technical field o...,en,"[invent, relat, technic, field, energi, storag..."
11,CN-114771214-A,CN-202210621034-A,CN,2022-07-22,[],氢能源汽车空调滤清器,zh,The invention discloses a hydrogen energy auto...,en,"[invent, disclos, hydrogen, energi, automobil,..."


# Modeling with BERTopic

## Preprocessing

With BERTopic it is advised to work with the raw texts: "removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings." (<https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html>). Therefore we use the slightly cleaned, but original content- and abstract-column.

The same document also advises to split up long documents - that's our first step:

### Split up large chunks of text

In [34]:
text_corpus_media = df_media.content

sentences_media = [sent_tokenize(doc) for doc in text_corpus_media]
sentences_media = [sentence for doc in sentences_media for sentence in doc]

sentences_media[0:3]

['Qatar Petroleum ( QP) is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion.',
 "In its latest Sustainability Report published on Wednesday, QP said its goals include `` reducing the emissions intensity of Qatar's LNG facilities by 25% and of its upstream facilities by at least 15%. ''",
 'The company is also aiming to reduce gas flaring intensity across its upstream facilities by more than 75% and has raised its carbon capture and storage ambitions from 5 million tons/yr to 7 million tons/yr by 2027.']

In [35]:
len(sentences_media)

331630

In [37]:
text_corpus_patents = df_patents.abstract

sentences_patents = [sent_tokenize(doc) for doc in text_corpus_patents]
sentences_patents = [sentence for doc in sentences_patents for sentence in doc]

sentences_patents[0:3]

['Disclosed is an adaptable DC-AC inverter system and its operation.',
 'The system includes multiple DC input sources as input to provide a stable operation under various conditions.',
 'DC input sources may be added to the system or removed from the system without impacting the functionality of the system.']

In [39]:
len(sentences_patents)

14934

### Limit training samples

Depending on where you run this you might want to reduce training samples.

In [40]:
# Limit samples to constant MAX_SAMPLES
MAX_SAMPLES = 400_000

training_sentences_media = sentences_media[0:MAX_SAMPLES]
training_sentences_patents = sentences_patents[0:MAX_SAMPLES]

In [41]:
(len(training_sentences_media), len(training_sentences_patents))

(10000, 10000)

### Calculate Embeddings

Embeddings are the numerical representation of our text data, which are needed as an input to our model.

We pre-calculate the embeddings to feed them into BERT. This saves us time later when optimising our model, as we do not have to recalculate this step every time.

In [42]:
embedding_model_media = SentenceTransformer("all-MiniLM-L6-v2")

embeddings_media = embedding_model_media.encode(training_sentences_media, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [43]:
embeddings_media.shape

(10000, 384)

In [44]:
embedding_model_patents = SentenceTransformer("all-MiniLM-L6-v2")

embeddings_patents = embedding_model_patents.encode(training_sentences_patents, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [45]:
embeddings_patents.shape

(10000, 384)

### Dimensionality reduction (UMAP)

As seen above each document is represented in a 384-dimensional space. Per default BERTopic is using UMAP to reduce the dimensions, we use it here too. As UMAP introduces some randomness we initialise the model here and include a `random_state` in order to get reproducible results.

In [47]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

### Clustering (HDBSCAN)

We use a clustering algorithm to group similar texts together in order to find our topics. We use the default HDBSCAN with some custom parameters. Most important is probably `min_cluster_size` as it defines the minimum size of a cluster and therefore indirectly determines how many different topics there are (the lower the number, the more topics found in general).

In [50]:
hdbscan_model = HDBSCAN(min_cluster_size=75, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

### Vectorizer

After clustering our documents into different topic groups, we can use the Vectorizer to steer how topics are represented. Here we introduce a CountVectorizer with english stop word removal, to make sure we have no stop words in our topic definitions, as they do not really tell something about any topic in general. With `min_df` we also make sure only words which occur at least twice are considered.

In [51]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=2)

### Representation models

Topics can also be fine-tuned. Here we use two approaches:

In [52]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
}

## Train 

Let's finally train our models, for the media- and patent-dataset seperately.

In [54]:
topic_model_media = BERTopic(

  # Pipeline models
  embedding_model=embedding_model_media,
  umap_model=umap_model,
  vectorizer_model=vectorizer_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics_media, probs_media = topic_model_media.fit_transform(training_sentences_media, embeddings_media)

2024-04-26 23:30:21,881 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-26 23:30:34,307 - BERTopic - Dimensionality - Completed ✓
2024-04-26 23:30:34,308 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-26 23:30:34,722 - BERTopic - Cluster - Completed ✓
2024-04-26 23:30:34,726 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-26 23:30:39,308 - BERTopic - Representation - Completed ✓


In [55]:
topic_model_media.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,4274,-1_energy_gas_oil_said,"[energy, gas, oil, said, year, carbon, new, bi...","[renewables, fuels, renewable, demand, emissio...","[energy, gas, oil, said, year, carbon, new, bi...",[Capex Pertamina plans to allocate 9% or about...
1,0,1288,0_carbon_emissions_climate_capture,"[carbon, emissions, climate, capture, zero, co...","[emissions, carbon, co2, 2050, 2030, greenhous...","[carbon, emissions, climate, capture, zero, co...","[', ""Carbon capture capacity today stands at j..."
2,1,1213,1_wind_solar_energy_offshore,"[wind, solar, energy, offshore, power, renewab...","[renewables, renewable, turbines, offshore, wi...","[wind, solar, energy, offshore, power, renewab...",[We believe Scotland has the potential to buil...
3,2,899,2_said_think_need_companies,"[said, think, need, companies, supply, market,...","[future, say, demand, economic, risk, investme...","[said, think, need, companies, supply, market,...","[Because that, I think, is where most of the f..."
4,3,615,3_nuclear_reactors_power_reactor,"[nuclear, reactors, power, reactor, energy, pl...","[nuclear, reactors, reactor, energy, 2050, 203...","[nuclear, reactors, power, reactor, energy, pl...",[But he also refocused his pro-nuclear argumen...
5,4,429,4_hydrogen_green_ammonia_blue,"[hydrogen, green, ammonia, blue, production, t...","[hydrogen, renewable, fuels, fuel, green, elec...","[hydrogen, green, ammonia, blue, production, t...",[The Neom plant is the world’ s largest commer...
6,5,335,5_lng_santos_barossa_gas,"[lng, santos, barossa, gas, million, year, dar...","[lng, lngi, santos, imports, supply, cargoes, ...","[lng, santos, barossa, gas, million, year, dar...","[', 'Gasunie said that new and existing LNG si..."
7,6,192,6_china_saudi_arabia_xi,"[china, saudi, arabia, xi, energy, beijing, ch...","[china, renewable, shanghai, beijing, guangdon...","[china, saudi, arabia, xi, energy, beijing, ch...","[', 'While many in the West question China’ s ..."
8,7,159,7_oil_gas_prices_demand,"[oil, gas, prices, demand, industry, said, glo...","[oil, petroleum, fuels, gasoline, markets, eco...","[oil, gas, prices, demand, industry, said, glo...","[Oil prices were rising ahead of winter, mostl..."
9,8,146,8_million_barrels_000_day,"[million, barrels, 000, day, oil, 11, output, ...","[refinery, opec, oil, barrels, 2021, forecast,...","[million, barrels, 000, day, oil, 11, output, ...","[About 93.6% of oil output was still off line,..."


In [56]:
topic_model_patents = BERTopic(

  # Pipeline models
  embedding_model=embedding_model_patents,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics_patents, probs_patents = topic_model_patents.fit_transform(training_sentences_patents, embeddings_patents)

2024-04-26 23:31:58,363 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-26 23:32:10,999 - BERTopic - Dimensionality - Completed ✓
2024-04-26 23:32:11,000 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-26 23:32:11,367 - BERTopic - Cluster - Completed ✓
2024-04-26 23:32:11,369 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-26 23:32:14,037 - BERTopic - Representation - Completed ✓


In [57]:
topic_model_patents.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,4199,-1_energy_solar_power_device,"[energy, solar, power, device, arranged, heat,...","[solar, photovoltaic, energy, electricity, uti...","[energy, solar, power, device, arranged, heat,...",[The solar energy power generation device comp...
1,0,1701,0_hydropower_water_plate_station,"[hydropower, water, plate, station, conservanc...","[hydropower, hydroelectric, utility, hydraulic...","[hydropower, water, plate, station, conservanc...",[The utility model provides a water diversion ...
2,1,704,1_wind_power_energy_generator,"[wind, power, energy, generator, generation, b...","[generator, turbine, wind, utility, control, g...","[wind, power, energy, generator, generation, b...",[The invention belongs to the technical field ...
3,2,505,2_hydrogen_fuel_energy_gas,"[hydrogen, fuel, energy, gas, production, stor...","[hydrogen, hydrogenation, fuel, energy, electr...","[hydrogen, fuel, energy, gas, production, stor...",[The invention provides a hydrogen energy auto...
4,3,469,3_photovoltaic_panel_solar_plate,"[photovoltaic, panel, solar, plate, frame, rod...","[photovoltaic, solar, utility, panel, mechanis...","[photovoltaic, panel, solar, plate, frame, rod...",[The utility model discloses a high-efficiency...
5,4,447,4_heat_water_heating_solar,"[heat, water, heating, solar, tank, energy, pu...","[heating, heater, photovoltaic, solar, evapora...","[heat, water, heating, solar, tank, energy, pu...",[The invention belongs to the technical field ...
6,5,347,5_lamp_street_solar_led,"[lamp, street, solar, led, light, connected, a...","[lamp, lamps, solar, photovoltaic, lampshade, ...","[lamp, street, solar, led, light, connected, a...",[The utility model discloses a solar LED stree...
7,6,331,6_water_treatment_sewage_solar,"[water, treatment, sewage, solar, seawater, de...","[photovoltaic, desalination, solar, wastewater...","[water, treatment, sewage, solar, seawater, de...",[According to the outdoor water body pollution...
8,7,322,7_charging_solar_battery_energy,"[charging, solar, battery, energy, power, stor...","[photovoltaic, solar, charging, electricity, c...","[charging, solar, battery, energy, power, stor...",[The invention discloses a power transmission ...
9,8,236,8_geothermal_heat_pipe_exchange,"[geothermal, heat, pipe, exchange, energy, pum...","[geothermal, heating, thermal, heat, solar, en...","[geothermal, heat, pipe, exchange, energy, pum...",[The embodiment of the application discloses a...


### first short conclusion

In the media dataset much of the topics are separated along geopolitical dimensions like large projects in certain countries/continents (e.g. Africa, China, Saudi-Arabia), sanctions due to the Russia-Ukraine war, etc. Also many topics are dominated by technologies or ressources which are clearly not considered "clean" like oil and gas. But as these are part of the motivation and maybe reasons for cleaner technologies they seem to appear hear as media is reporting in a broader societal/political context.

The patent dataset topics however are clearly focused on different technologies. We see hydropower, wind, geothermal, solar and so on. This also makes sense as they are focused on exactly describing the technology used and not so much on the motivation behind it. 

### Experimental: get topic names with local LLM

As an experiment (and to save money on OpenAi-tokens) we wrote an experimental function to fine-tune the topic with a local LLM. In order to work this requires a local running Ollama-instance. See here for details: <https://ollama.com/>

Here we use `llama2` to generate names for our topics analogously to the ChatGPT example in the [BERTopic-documentation](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#openai).

In [25]:
import json
import requests

In [58]:
def get_ollama_topic(representative_docs, keywords, model='llama2'):

    # URL of the API
    url = "http://localhost:11434/api/generate"

    topic_prompt = f"""
    I have a topic that contains the following documents:
    {representative_docs}
    The topic is described by the following keywords: {keywords}
    
    Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
    topic: <topic label>
    """
    
    # Data to be sent
    data = {
        "model": model,
        "prompt": topic_prompt,
        "stream": False
    }
    
    # Send POST request with data
    response = requests.post(url, json=data)
    
    # Print response
    response_text = json.loads(response.text).get('response', None)
    
    return response_text.strip().replace('Topic: ', '').splitlines()[0]

In [59]:
# get topic names from LLM and prepare dict
topic_dict_media = {x['Topic']: get_ollama_topic(x['Representative_Docs'], x['Representation']) for x in topic_model_media.get_topic_info().to_dict(orient='records')}
topic_dict_media[-1] = 'Outlier Topic'
topic_dict_media

{-1: 'Outlier Topic',
 0: 'Carbon Capture & Emissions Reduction',
 1: 'Offshore Wind',
 2: 'Future Investment',
 3: 'Nuclear Power',
 4: 'Green Hydrogen Production',
 5: 'LNG Expansion',
 6: 'China-Saudi Energy Cooperation',
 7: 'Oil Prices',
 8: 'Oil Production',
 9: 'Uganda-Tanzania Oil Project',
 10: 'Electric Vehicles',
 11: 'Shell-BP Drilling',
 12: 'Russian Gas Imports'}

In [60]:
# get topic names from LLM and prepare dict
topic_dict_patents = {x['Topic']: get_ollama_topic(x['Representative_Docs'], x['Representation']) for x in topic_model_patents.get_topic_info().to_dict(orient='records')}
topic_dict_patents[-1] = 'Outlier Topic'
topic_dict_patents

{-1: 'Outlier Topic',
 0: 'Water Conservancy and Hydropower',
 1: 'Wind Power Generation',
 2: 'Hydrogen Energy System',
 3: 'Solar Power Generation',
 4: 'Solar Heating System',
 5: 'Solar LED Street Lamp',
 6: 'Water Treatment and Solar Energy',
 7: 'Solar Charging System',
 8: 'topic: Geothermal Energy Heating',
 9: 'Solar Cell Preparation Method and Device',
 10: 'Monitoring Device',
 11: 'Irrigation devices',
 12: 'Photocatalysts',
 13: 'Drying Solar Device',
 14: 'Simple Invention Advantages'}

In [61]:
# update model
topic_model_media.set_topic_labels(topic_dict_media)
topic_model_patents.set_topic_labels(topic_dict_patents)

In [62]:
# see new topic names
topic_model_media.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,Representative_Docs
0,-1,4274,-1_energy_gas_oil_said,Outlier Topic,"[energy, gas, oil, said, year, carbon, new, bi...","[renewables, fuels, renewable, demand, emissio...","[energy, gas, oil, said, year, carbon, new, bi...",[Capex Pertamina plans to allocate 9% or about...
1,0,1288,0_carbon_emissions_climate_capture,Carbon Capture & Emissions Reduction,"[carbon, emissions, climate, capture, zero, co...","[emissions, carbon, co2, 2050, 2030, greenhous...","[carbon, emissions, climate, capture, zero, co...","[', ""Carbon capture capacity today stands at j..."
2,1,1213,1_wind_solar_energy_offshore,Offshore Wind,"[wind, solar, energy, offshore, power, renewab...","[renewables, renewable, turbines, offshore, wi...","[wind, solar, energy, offshore, power, renewab...",[We believe Scotland has the potential to buil...
3,2,899,2_said_think_need_companies,Future Investment,"[said, think, need, companies, supply, market,...","[future, say, demand, economic, risk, investme...","[said, think, need, companies, supply, market,...","[Because that, I think, is where most of the f..."
4,3,615,3_nuclear_reactors_power_reactor,Nuclear Power,"[nuclear, reactors, power, reactor, energy, pl...","[nuclear, reactors, reactor, energy, 2050, 203...","[nuclear, reactors, power, reactor, energy, pl...",[But he also refocused his pro-nuclear argumen...
5,4,429,4_hydrogen_green_ammonia_blue,Green Hydrogen Production,"[hydrogen, green, ammonia, blue, production, t...","[hydrogen, renewable, fuels, fuel, green, elec...","[hydrogen, green, ammonia, blue, production, t...",[The Neom plant is the world’ s largest commer...
6,5,335,5_lng_santos_barossa_gas,LNG Expansion,"[lng, santos, barossa, gas, million, year, dar...","[lng, lngi, santos, imports, supply, cargoes, ...","[lng, santos, barossa, gas, million, year, dar...","[', 'Gasunie said that new and existing LNG si..."
7,6,192,6_china_saudi_arabia_xi,China-Saudi Energy Cooperation,"[china, saudi, arabia, xi, energy, beijing, ch...","[china, renewable, shanghai, beijing, guangdon...","[china, saudi, arabia, xi, energy, beijing, ch...","[', 'While many in the West question China’ s ..."
8,7,159,7_oil_gas_prices_demand,Oil Prices,"[oil, gas, prices, demand, industry, said, glo...","[oil, petroleum, fuels, gasoline, markets, eco...","[oil, gas, prices, demand, industry, said, glo...","[Oil prices were rising ahead of winter, mostl..."
9,8,146,8_million_barrels_000_day,Oil Production,"[million, barrels, 000, day, oil, 11, output, ...","[refinery, opec, oil, barrels, 2021, forecast,...","[million, barrels, 000, day, oil, 11, output, ...","[About 93.6% of oil output was still off line,..."


In [63]:
topic_model_patents.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,Representative_Docs
0,-1,4199,-1_energy_solar_power_device,Outlier Topic,"[energy, solar, power, device, arranged, heat,...","[solar, photovoltaic, energy, electricity, uti...","[energy, solar, power, device, arranged, heat,...",[The solar energy power generation device comp...
1,0,1701,0_hydropower_water_plate_station,Water Conservancy and Hydropower,"[hydropower, water, plate, station, conservanc...","[hydropower, hydroelectric, utility, hydraulic...","[hydropower, water, plate, station, conservanc...",[The utility model provides a water diversion ...
2,1,704,1_wind_power_energy_generator,Wind Power Generation,"[wind, power, energy, generator, generation, b...","[generator, turbine, wind, utility, control, g...","[wind, power, energy, generator, generation, b...",[The invention belongs to the technical field ...
3,2,505,2_hydrogen_fuel_energy_gas,Hydrogen Energy System,"[hydrogen, fuel, energy, gas, production, stor...","[hydrogen, hydrogenation, fuel, energy, electr...","[hydrogen, fuel, energy, gas, production, stor...",[The invention provides a hydrogen energy auto...
4,3,469,3_photovoltaic_panel_solar_plate,Solar Power Generation,"[photovoltaic, panel, solar, plate, frame, rod...","[photovoltaic, solar, utility, panel, mechanis...","[photovoltaic, panel, solar, plate, frame, rod...",[The utility model discloses a high-efficiency...
5,4,447,4_heat_water_heating_solar,Solar Heating System,"[heat, water, heating, solar, tank, energy, pu...","[heating, heater, photovoltaic, solar, evapora...","[heat, water, heating, solar, tank, energy, pu...",[The invention belongs to the technical field ...
6,5,347,5_lamp_street_solar_led,Solar LED Street Lamp,"[lamp, street, solar, led, light, connected, a...","[lamp, lamps, solar, photovoltaic, lampshade, ...","[lamp, street, solar, led, light, connected, a...",[The utility model discloses a solar LED stree...
7,6,331,6_water_treatment_sewage_solar,Water Treatment and Solar Energy,"[water, treatment, sewage, solar, seawater, de...","[photovoltaic, desalination, solar, wastewater...","[water, treatment, sewage, solar, seawater, de...",[According to the outdoor water body pollution...
8,7,322,7_charging_solar_battery_energy,Solar Charging System,"[charging, solar, battery, energy, power, stor...","[photovoltaic, solar, charging, electricity, c...","[charging, solar, battery, energy, power, stor...",[The invention discloses a power transmission ...
9,8,236,8_geothermal_heat_pipe_exchange,topic: Geothermal Energy Heating,"[geothermal, heat, pipe, exchange, energy, pum...","[geothermal, heating, thermal, heat, solar, en...","[geothermal, heat, pipe, exchange, energy, pum...",[The embodiment of the application discloses a...


## Visualise the topics

In [32]:
# TODO