# Media coverage of climate change: BERTopic

In this notebook, we apply BERTopic (https://maartengr.github.io/BERTopic/index.html) to the selected articles. We only have access to the titles and excerpts of the articles, so we concatenate the two before topic modelling.

**Warning!**
Because BERTopic has stochastic components, re-running this notebook will not reproduce exactly the results we obtained. However, our resulting dataframes are available directly in the GitHub repository.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
from datetime import datetime

In [None]:
# Colour theme: Plotly Express' qualitative "Prism" palette, passed as the
# colour sequence of the line charts drawn later in this notebook.
colors=px.colors.qualitative.Prism

## Import data

In [None]:
# Load one table per newspaper; each holds the articles whose main topic is
# climate change. Paths are relative to the current working directory.

lemonde, lesechos, libé, lacroix, lefigaro = (
    pd.read_csv(csv_path)
    for csv_path in ('lemonde.csv', 'lesechos.csv', 'libé.csv', 'lacroix.csv', 'lefigaro.csv')
)

"\nlemonde = pd.read_csv('lemonde.csv')\nlesechos = pd.read_csv('lesechos.csv')\nlibé = pd.read_csv('libé.csv')\nlacroix = pd.read_csv('lacroix.csv')\nlefigaro = pd.read_csv('lefigaro.csv')\n"

In [None]:
lemonde.head()

Unnamed: 0,date,publication,words,author,title,intro
0,2013-01-05,Le Monde,401mots,Pierre Le Hir,"Avec le réchauffement, les tourbières se trans...",... pourraient se comporter non plus en pui...
1,2013-01-11,Le Monde,1066mots,"Propos recueillis par Marie-Béatrice Baudet, D...",Christophe de Margerie : « Le changement clima...,"... population mondiale, 2 % des énergies p..."
2,2013-01-16,Le Monde,324mots,P. L. H.,Transition énergétique : le clair-obscur de l'...,... la transition énergétique n'a de vrai s...


In [None]:
# number of articles per journal
len(lemonde), len(lesechos), len(libé), len(lacroix), len(lefigaro)

(2659, 1825, 1048, 983, 1242)

In [None]:
# Parse the 'date' column of every newspaper table into pandas datetimes

for journal_df in (lemonde, lesechos, libé, lacroix, lefigaro):
    journal_df['date'] = pd.to_datetime(journal_df['date'])

### Build final dataframe

In [None]:
# Build the 'text' column that is fed to the topic model: "<title>.<intro>"
# for each article. A missing title or intro is treated as an empty string,
# because joining a NaN with str.join would raise a TypeError.

for journal_df in (lemonde, lesechos, lefigaro, lacroix, libé):
    journal_df['text'] = journal_df['title'].fillna('') + '.' + journal_df['intro'].fillna('')

In [None]:
# Stack all newspapers into one dataframe (BERTopic is fitted on all articles).
# ignore_index=True gives every article a unique 0..n-1 row label. Without it,
# each newspaper keeps its own 0-based labels, and any later index-aligned
# column assignment attaches values to the wrong articles.

all_docs = pd.concat([lemonde, lesechos, lefigaro, lacroix, libé], ignore_index=True)

# 'YYYY-MM' key, used further down to count articles per month
all_docs.insert(loc=1, column='YearMonth', value=pd.to_datetime(all_docs['date']).dt.strftime('%Y-%m'))

In [None]:
all_docs.head(3)

Unnamed: 0,date,YearMonth,publication,words,author,title,intro,text
0,2013-01-05,2013-01,Le Monde,401mots,Pierre Le Hir,"Avec le réchauffement, les tourbières se trans...",... pourraient se comporter non plus en pui...,"Avec le réchauffement, les tourbières se trans..."
1,2013-01-11,2013-01,Le Monde,1066mots,"Propos recueillis par Marie-Béatrice Baudet, D...",Christophe de Margerie : « Le changement clima...,"... population mondiale, 2 % des énergies p...",Christophe de Margerie : « Le changement clima...
2,2013-01-16,2013-01,Le Monde,324mots,P. L. H.,Transition énergétique : le clair-obscur de l'...,... la transition énergétique n'a de vrai s...,Transition énergétique : le clair-obscur de l'...


In [None]:
len(all_docs)

7757

In [None]:
# Materialise the text column as a plain Python list (the form in which the
# documents are handed to the topic model below)
text_list = all_docs['text'].tolist()

## Bertopic on concatenation of title and intro

In [None]:
!pip install bertopic

In [None]:
! pip install bertopic[visualization]

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from umap import UMAP

# Model 

In [None]:
# Configure the topic model

# number of topics requested from BERTopic's topic reduction
nr_topics = 40

# Dimensionality reduction is the stochastic step of the pipeline. The values
# below mirror the UMAP settings BERTopic uses when no umap_model is given;
# fixing random_state makes successive runs of this notebook return the same
# topics (per the UMAP documentation, a fixed seed can make fitting slower).
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                  low_memory=False, random_state=42)

# to reduce the weight of frequent words ("de", "la", "les", etc)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# to encourage word diversity among topics: 0 being not at all diverse and 1 being completely diverse
representation_model = MaximalMarginalRelevance(diversity=0.8)

# assemble the model
topic_model = BERTopic(nr_topics=nr_topics,
                       umap_model=umap_model,  # seeded, see above
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model,
                       language="multilingual",  # as we deal with French newspapers
                       n_gram_range=(1, 3),  # unigrams, bigrams, and trigrams are included in the model
                       top_n_words=5,  # only the top five most representative words are included
                       calculate_probabilities=False,  # to save time as we will not need probabilities
                       verbose=True)

In [None]:
# apply model to all articles
# `topics` is the per-document topic assignment that is passed to the outlier
# reduction below; `probabilities` is not referenced again in the visible part
# of this notebook.
topics, probabilities = topic_model.fit_transform(text_list)

In [None]:
# get topics info
topic_model.get_topic_info()

In [None]:
# Reassign the documents that BERTopic left as outliers to one of the topics.
# `new_topics` is the updated per-document topic list, in `text_list` order.
new_topics = topic_model.reduce_outliers(text_list, topics)
# NOTE(review): the documented way to register new assignments is
# topic_model.update_topics(text_list, topics=new_topics), which also recomputes
# the topic representations. The private helper below is used instead;
# confirm which behaviour is intended before switching.

# Hand the reassigned topics to the model through a private BERTopic helper, so
# that the topic frequencies shown by get_topic_info() reflect the reduction.
documents = pd.DataFrame({"Document": text_list, "Topic": new_topics})
topic_model._update_topic_size(documents)

100%|██████████| 4/4 [00:01<00:00,  2.27it/s]


In [None]:
# get updated topics info
topic_model.get_topic_info()

## Add topics to initial dataframe

In [None]:
# Per-document table from the fitted model for the documents in text_list; it
# provides the 'Topic' (id) and 'Name' (label) columns that are used below.
topic_df = topic_model.get_document_info(text_list)

In [None]:
# Work on a copy of the article table, so that all_docs itself stays untouched
# when topic columns are added and raw columns are dropped in the next cell.
all_docs_topics = all_docs.copy()

In [None]:
# Attach each article's topic id and topic label.
# topic_df was built from text_list, i.e. from the rows of all_docs in order,
# so the values are assigned by position (.to_numpy()). A plain Series
# assignment aligns on index labels instead, which is only correct when the
# article table has unique 0..n-1 labels; after concatenating the newspapers,
# each one may still carry its own 0-based labels, and articles would then
# silently receive the topic of another newspaper's row.
assert len(topic_df) == len(all_docs_topics), "topic_df and all_docs_topics must describe the same articles"
all_docs_topics['topic_nr'] = topic_df['Topic'].to_numpy()
all_docs_topics['topic_name'] = topic_df['Name'].to_numpy()

# Keep only the columns needed for the time series
all_docs_topics = all_docs_topics.drop(columns=['date', 'words', 'author', 'title', 'intro'])

In [None]:
all_docs_topics.head(2)

Unnamed: 0,YearMonth,publication,text,topic_nr,topic_name
0,2013-01,Le Monde,"Avec le réchauffement, les tourbières se trans...",1,1_carbone_france_gaz effet de_émissions de
1,2013-01,Le Monde,Christophe de Margerie : « Le changement clima...,2,2_europe_union_bruxelles_émissions de


## Create time series

In [None]:
# Topic labels, taken from the 'Name' column of the topic overview table
topics_names = topic_model.get_topic_info()['Name']

In [None]:
# Number of articles per (topic, month) pair, as a long table with a 'count' column
month_df = (
    all_docs_topics
    .groupby(['topic_name', 'YearMonth'])['YearMonth']
    .count()
    .reset_index(name="count")
)

In [None]:
month_df.head(2)

In [None]:
# One monthly-count dataframe per topic, keyed 'month_df_<topic name>'.
# .copy() makes every entry an independent frame, so modifying it afterwards
# does not trigger pandas' SettingWithCopyWarning.
month_dict = {
    'month_df_{0}'.format(topic_name): month_df[month_df['topic_name'] == topic_name].copy()
    for topic_name in topics_names
}

In [None]:
# Dictionary keys as a list, in insertion order
key_list = [*month_dict]

In [None]:
# For each per-topic dataframe: order the rows chronologically, keep
# 'YearMonth' as a 'YYYY-MM' string, and rename the 'count' column to the topic
# name. Each frame is rebuilt and stored back rather than mutated in place,
# which avoids SettingWithCopyWarning on frames obtained by filtering.

for position, key in enumerate(key_list):
    monthly = month_dict[key]
    month_dict[key] = (
        monthly
        # temporary datetime version of the month, used only for sorting
        .assign(YearMonth=pd.to_datetime(monthly['YearMonth']))
        .sort_values(by='YearMonth')
        # back to the 'YYYY-MM' string form used as the merge key later on
        .assign(YearMonth=lambda frame: frame['YearMonth'].dt.strftime('%Y-%m'))
        # key_list and topics_names are walked with the same position
        .rename(columns={'count': topics_names[position]})
    )

In [None]:
# Every month from January 2013 to December 2022, formatted as 'YYYY-MM'
alldates = [
    month_start.strftime("%Y-%m")
    for month_start in pd.date_range(start="2013-01", end="2022-12", freq='MS')
]

In [None]:
# Skeleton of the final table: a single 'YearMonth' column listing every month
all_themes = pd.DataFrame(alldates, columns=['YearMonth'])

In [None]:
# Add one column per topic holding its monthly article count. The left join
# keeps every month of the skeleton table; months without any article for a
# topic come out as NaN and are set to 0 afterwards.
for position, key in enumerate(key_list):
    topic_column = topics_names[position]
    monthly_counts = month_dict[key][['YearMonth', topic_column]]
    all_themes = all_themes.merge(monthly_counts, on='YearMonth', how='left')

all_themes = all_themes.fillna(0)

# Visualization

From here on, we load our saved results (see the 'data' folder on GitHub) and work with them.

In [None]:
# Load the saved topic time series (one row per month, one column per topic).
# This rebinds all_themes, replacing any dataframe built earlier in the notebook.
all_themes = pd.read_csv('all_themes.csv')

In [None]:
all_themes.head(2)

Unnamed: 0.1,Unnamed: 0,YearMonth,0_cop_paris_changement_accord,1_carbone_france_gaz effet de_émissions de,2_europe_union_bruxelles_émissions de,3_joe biden_américain_son_plan,4_macron_les villes_entreprises_dollars,5_bolsonaro_changement_brésil_ne,6_des glaciers_stations_les alpes_glaces,7_inondations_phénomènes_eaux_par le,...,29_justin trudeau_justin_harper_oléoduc géant,30_les vins_bordeaux_vigne et du_viticole française,31_barrière_barrière de corail_récifs_récifs coralliens,32_rénovation_rénovation énergétique_électricité_énergie en france,33_bangladesh_les réfugiés climatiques_climatique crée plus_utilise étranger comme,34_tourisme est responsable_crémation_hébergement_voyages,35_concentrations de gaz_ppm_record_année les concentrations,36_stables_les émissions mondiales_vont continuer grimper_37 ici,37_climatique hydrogène vert_électrolyse_être produit_azotés,38_climatique notre_droit reste identifié_mouvements migratoires dans_monde gouverné en
0,0,2013-01,0.0,13.0,5.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,2013-02,4.0,3.0,1.0,5.0,4.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0


In [None]:
# Topic columns of the saved table: everything after the first two columns.
# topics_names is rebound here to a plain list of column names.
topics_names = all_themes.columns[2:].tolist()

In [None]:
!pip install -U kaleido

In [None]:
import kaleido

### Visualise topics of your choice

In [None]:
# Monthly number of articles for the chosen topic (here: the first topic column)
selected_topic = topics_names[0]

fig_themes = px.line(
    all_themes,
    x='YearMonth',
    y=selected_topic,
    color_discrete_sequence=colors,
    markers=True,
)

# Axis and legend titles, plus a fixed 2013-2022 window on the x axis
fig_themes.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of articles",
    legend_title="Theme",
    xaxis_range=['2013-01', '2022-12'],
)
fig_themes.update_xaxes(ticklabelmode="period")

# Static export; for an interactive (dynamic) view use instead:
# fig_themes.write_html("fig_themes_non_aggregated.html")
fig_themes.write_image("fig_themes_non_aggregated.png")

fig_themes

# Aggregate topics

In [None]:
topics_names

['0_cop_paris_changement_accord',
 '1_carbone_france_gaz effet de_émissions de',
 '2_europe_union_bruxelles_émissions de',
 '3_joe biden_américain_son_plan',
 '4_macron_les villes_entreprises_dollars',
 '5_bolsonaro_changement_brésil_ne',
 '6_des glaciers_stations_les alpes_glaces',
 '7_inondations_phénomènes_eaux_par le',
 '8_bce_banque centrale_christine lagarde_politique monétaire',
 '9_chinois_jinping_pic_la chine et',
 '10_émissions de_de gaz_effet de serre_gaz effet de',
 '11_déforestation_les forêts_pompiers_tropicales',
 '12_angela merkel_ses_chancelière_respecter ses engagements',
 '13_oiseaux_invasives_exotiques_pollution',
 '14_climat australie_incendies_abbott_par habitant',
 '15_énergie nucléaire_atome_nucléaire qui_japon',
 '16_premières victimes_enfants_santé mentale_pandémie',
 '17_vaches_de élevage_14 des_production de viande',
 '18_navires_le diesel_volkswagen_constructeurs',
 '19_au moins 40_un esprit_40 les émissions_de justice sociale',
 '20_la plus chaude_copernic

In [None]:
# Manual grouping of the topics into broader themes.
# Each list holds the exact topic column names (as found in all_themes) that
# belong to the theme; every topic appears in exactly one list.

Causes_of_Solutions_to_Climate_Change = [
    '17_vaches_de élevage_14 des_production de viande',
    '18_navires_le diesel_volkswagen_constructeurs',
    '36_stables_les émissions mondiales_vont continuer grimper_37 ici',
    '37_climatique hydrogène vert_électrolyse_être produit_azotés',
    '10_émissions de_de gaz_effet de serre_gaz effet de',
    '15_énergie nucléaire_atome_nucléaire qui_japon',
    '32_rénovation_rénovation énergétique_électricité_énergie en france',
]

COPs = [
    '0_cop_paris_changement_accord',
]

Climate_Politics = [
    '1_carbone_france_gaz effet de_émissions de',
    '2_europe_union_bruxelles_émissions de',
    '3_joe biden_américain_son_plan',
    '5_bolsonaro_changement_brésil_ne',
    '8_bce_banque centrale_christine lagarde_politique monétaire',
    '9_chinois_jinping_pic_la chine et',
    '12_angela merkel_ses_chancelière_respecter ses engagements',
    '29_justin trudeau_justin_harper_oléoduc géant',
    '33_bangladesh_les réfugiés climatiques_climatique crée plus_utilise étranger comme',
    '19_au moins 40_un esprit_40 les émissions_de justice sociale',
]

Awareness = [
    '27_communautés locales agissent_climatique vid_en croissance_économie circulaire est',
]

Impacts_on_humans = [
    '7_inondations_phénomènes_eaux_par le',
    '14_climat australie_incendies_abbott_par habitant',
    '16_premières victimes_enfants_santé mentale_pandémie',
    '22_vanuatu_qui subit_petites îles_continent africain',
    '25_famine_madagascar_kenya_milliards habitants',
    '28_océans en_île_courants_du niveau des',
    '30_les vins_bordeaux_vigne et du_viticole française',
    '38_climatique notre_droit reste identifié_mouvements migratoires dans_monde gouverné en',
]

Economic_Impacts = [
    '4_macron_les villes_entreprises_dollars',
    '6_des glaciers_stations_les alpes_glaces',
    '23_transport aérien_les avions_le kérosène_aéroport de',
    '24_banques_les banques centrales_centrales_risques financiers',
    '26_actionnaires_exxonmobil_pétrolier_majors',
    '34_tourisme est responsable_crémation_hébergement_voyages',
]

Climate_Science_Impacts_on_the_Ecosystem = [
    '11_déforestation_les forêts_pompiers_tropicales',
    '13_oiseaux_invasives_exotiques_pollution',
    '20_la plus chaude_copernicus_record_européen copernicus',
    '21_glaciologie_température_surface_années 1990 aux',
    '31_barrière_barrière de corail_récifs_récifs coralliens',
    '35_concentrations de gaz_ppm_record_année les concentrations',
]

In [None]:
# Start from a copy, so that the theme columns added below do not alter all_themes
aggregated_themes = all_themes.copy()

In [None]:
# Monthly article counts per theme: for each theme, sum row by row the columns
# of the topics that belong to it.

theme_members = {
    'Causes of & Solutions to Climate Change': Causes_of_Solutions_to_Climate_Change,
    'COPs': COPs,
    'Climate Politics': Climate_Politics,
    'Awareness': Awareness,
    'Impacts on humans': Impacts_on_humans,
    'Economic Impacts': Economic_Impacts,
    'Climate Science & Impacts on the Ecosystem': Climate_Science_Impacts_on_the_Ecosystem,
}

for theme_label, member_topics in theme_members.items():
    aggregated_themes[theme_label] = aggregated_themes[member_topics].sum(axis=1)

In [None]:
# Drop the individual topic columns: keep the month and the seven themes only
theme_columns = [
    'Causes of & Solutions to Climate Change',
    'COPs',
    'Climate Politics',
    'Awareness',
    'Impacts on humans',
    'Economic Impacts',
    'Climate Science & Impacts on the Ecosystem',
]
aggregated_themes = aggregated_themes[['YearMonth'] + theme_columns]

In [None]:
# Save the theme-level time series. to_csv keeps its default index=True here,
# so the row index is written as a leading unnamed column of the file.
aggregated_themes.to_csv('aggregated_themes.csv')

In [None]:
# Names of the theme columns, in the order they are drawn on the chart below
agg_topics = pd.Series([
    'Causes of & Solutions to Climate Change',
    'COPs',
    'Climate Politics',
    'Awareness',
    'Impacts on humans',
    'Economic Impacts',
    'Climate Science & Impacts on the Ecosystem',
])

### Visualization

In [None]:
# Evolution of themes: one line per aggregated theme, all newspapers pooled

fig_agg_themes = px.line(
    aggregated_themes,
    x='YearMonth',
    y=agg_topics,
    title="Number of articles per theme, all journals",
    color_discrete_sequence=colors,
    markers=True,
)

# Axis titles, no legend title, and a fixed 2013-2022 window on the x axis
fig_agg_themes.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of articles",
    legend_title=None,
    xaxis_range=['2013-01', '2022-12'],
)
fig_agg_themes.update_xaxes(ticklabelmode="period")

# Anchor the legend at the top-left of the plotting area, with a larger font
legend_style = dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01,
    font=dict(size=16),
)
fig_agg_themes.update_layout(legend=legend_style)

# Static export; for an interactive (dynamic) view use instead:
# fig_agg_themes.write_html("fig_themes_aggregated.html")
fig_agg_themes.write_image("fig_themes_aggregated.png")

fig_agg_themes

# A bit of statistics

In [None]:
# Share (in %) of all articles assigned to each topic.
# The topic columns are everything after the first two columns of all_themes,
# so the loop follows the actual number of topics instead of a hard-coded
# column range (which raises IndexError, or skips topics, if the count changes).
# NOTE: all_docs comes from the data-import cells at the top of the notebook,
# which must have been run even when working from the saved results.
n_articles = len(all_docs)
for topic_column in all_themes.columns[2:]:
    share = round(all_themes[topic_column].sum() / n_articles * 100, 2)
    print('{}:{} '.format(topic_column, share))

0_cop_paris_changement_accord:20.64 
1_carbone_france_gaz effet de_émissions de:8.75 
2_europe_union_bruxelles_émissions de:5.75 
3_joe biden_américain_son_plan:6.59 
4_macron_les villes_entreprises_dollars:4.13 
5_bolsonaro_changement_brésil_ne:3.21 
6_des glaciers_stations_les alpes_glaces:2.93 
7_inondations_phénomènes_eaux_par le:3.33 
8_bce_banque centrale_christine lagarde_politique monétaire:1.61 
9_chinois_jinping_pic_la chine et:3.62 
10_émissions de_de gaz_effet de serre_gaz effet de:3.25 
11_déforestation_les forêts_pompiers_tropicales:1.65 
12_angela merkel_ses_chancelière_respecter ses engagements:1.55 
13_oiseaux_invasives_exotiques_pollution:1.97 
14_climat australie_incendies_abbott_par habitant:1.75 
15_énergie nucléaire_atome_nucléaire qui_japon:1.31 
16_premières victimes_enfants_santé mentale_pandémie:2.62 
17_vaches_de élevage_14 des_production de viande:1.01 
18_navires_le diesel_volkswagen_constructeurs:0.84 
19_au moins 40_un esprit_40 les émissions_de justice s