# Media coverage of climate change: Le Monde

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
from datetime import datetime

In [None]:
# Qualitative Plotly palette (Prism) shared by the line charts below
colors = px.colors.qualitative.Prism

## Import data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the input CSV is read from there below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Le Monde article table from the mounted Drive
lemonde_csv_path = '/content/drive/MyDrive/lemonde_v2.csv'
lemonde_df = pd.read_csv(lemonde_csv_path)

In [None]:
# Convert the 'date' column to pandas datetimes (the column is replaced in lemonde_df)
lemonde_df['date'] = pd.to_datetime(lemonde_df['date'])

In [None]:
# Preview the first three rows
lemonde_df.head(3)

Unnamed: 0,date,publication,words,author,title,intro,topic,YearMonth
0,2013-01-05,Le Monde,401mots,Pierre Le Hir,"Avec le réchauffement, les tourbières se trans...",... pourraient se comporter non plus en pui...,21_co_émissions_co2_mondiales,2013-1
1,2013-01-11,Le Monde,1066mots,"Propos recueillis par Marie-Béatrice Baudet, D...",Christophe de Margerie : « Le changement clima...,"... population mondiale, 2 % des énergies p...",26_changement_climatique_adaptation_assureurs,2013-1
2,2013-01-16,Le Monde,324mots,P. L. H.,Transition énergétique : le clair-obscur de l'...,... la transition énergétique n'a de vrai s...,3_énergétique_transition_renouvelables_énergies,2013-1


## BERTopic: remove non-relevant articles and find themes

In [None]:
!pip install bertopic

In [None]:
! pip install bertopic[visualization]

In [None]:
from bertopic import BERTopic

### Bertopic on titles

Fit a standard BERTopic model on the article titles, then (optionally) reduce the number of topics.

In [None]:
# BERTopic model created with language="multilingual" and otherwise default arguments
topic_model = BERTopic(language="multilingual")
#nr_topic_model = BERTopic(language="multilingual", nr_topics = 10)
# warning: this takes a while (about 10 minutes)
# NOTE(review): no random seed is set here, so topic numbering may differ between runs — confirm
topics_title, probabilities_title = topic_model.fit_transform(lemonde_df.title)

In [None]:
# reduce the number of topics after training the model
#topic_model.reduce_topics(lemonde_df.title, nr_topics=5)

<bertopic._bertopic.BERTopic at 0x7f0e441e3370>

In [None]:
# Show the topic overview table of the fitted model (its `Name` column is used below)
topic_model.get_topic_info()
# 3, 4, 7, 12, 13, 17, 19, 24, 27, 28, 31, 34, 37
# with 30 topics, 886 articles are lost
# with 20 topics, 833+845 'climat'

In [None]:
# Keep the topic names: they are used later as column names and as the plotted series
topics_title_names = topic_model.get_topic_info()['Name']

In [None]:
# Display BERTopic's built-in bar-chart visualization for the fitted model
topic_model.visualize_barchart()

In [None]:
# Re-read the topics_ and probabilities_ attributes stored on the fitted model
# (this overwrites the values returned by fit_transform above)
topics_title = topic_model.topics_
probabilities_title = topic_model.probabilities_

In [None]:
# Persist the fitted model under the relative path "topic_model"
topic_model.save("topic_model")

In [None]:
# select by hand climate topics
#climate_topic = list(topic_model.get_topic_info().Name[i] for i in [3, 4, 7, 12, 13, 17, 19, 24, 27, 28, 31, 34, 37])
#climate_topic

In [None]:
# Display the document-level info table that the model returns for the titles
topic_model.get_document_info(lemonde_df.title)

In [None]:
# Topic name attached to each title, taken from the document-level info table
title_document_info = topic_model.get_document_info(lemonde_df['title'])
article_topic_name = title_document_info['Name']

In [None]:
# Add the topic names to the initial dataframe.
# This overwrites the 'topic' column loaded from the CSV; pandas aligns the
# assigned Series on the row index.
lemonde_df['topic'] = article_topic_name

In [None]:
# filter only climate topics
#lemonde_climate_title = lemonde_df[lemonde_df['topic'].isin(climate_topic)]

In [None]:
# how many articles are lost here?
#len(lemonde_df), len(lemonde_climate_title)

(18080, 1944)

In [None]:
#topic_model.visualize_topics()

### Bertopic on the intro

In [None]:
# Fit BERTopic on the article intros.
# NOTE(review): this reuses the `topic_model` instance already fitted on the titles above,
# so later calls on `topic_model` presumably reflect the intro fit, not the title fit — confirm
topics_intro, probabilities_intro = topic_model.fit_transform(lemonde_df.intro)

In [None]:
# Reduce the model to 40 topics, using the intros as documents
topic_model.reduce_topics(lemonde_df.intro, nr_topics=40)

<bertopic._bertopic.BERTopic at 0x7f8c1a662c10>

In [None]:
# Show the topic overview table after the reduction
topic_model.get_topic_info()

## Visualization

### By month

In [None]:
# The hand-picked climate-topic filter above is commented out, so every article is kept.
# Take a copy rather than an alias: the 'YearMonth' column written in the next cell
# would otherwise also modify lemonde_df.
lemonde_climate_title = lemonde_df.copy()

(2662, 8)

In [None]:
def _year_month_label(stamp):
  """Return 'year-month' for a timestamp, with the month not zero-padded."""
  return '{year}-{month}'.format(year=stamp.year, month=stamp.month)

# Label every article with the year and month of its publication date
article_dates = pd.to_datetime(lemonde_climate_title['date'])
lemonde_climate_title['YearMonth'] = article_dates.apply(_year_month_label)

In [None]:
# Number of articles per (topic, month) pair, as a flat table with a 'count' column
articles_by_topic_month = lemonde_climate_title.groupby(['topic', 'YearMonth'])
month_counts = articles_by_topic_month['YearMonth'].count()
month_df = month_counts.reset_index(name="count")

In [None]:
# Create a dictionary of dataframes, one for each topic (keys: 'month_df_<topic name>').
# .copy() gives every entry its own frame: the following cells modify these frames,
# and doing that on a filtered slice of month_df raises SettingWithCopyWarning.
month_dict = {
  'month_df_{0}'.format(topic_name): month_df[month_df['topic'] == topic_name].copy()
  for topic_name in topics_title_names
}

In [None]:
# Dictionary keys in insertion order, i.e. in the same order as topics_title_names
key_list = [key for key in month_dict]

In [None]:
# For each per-topic dataframe: sort chronologically, normalize 'YearMonth' to
# zero-padded 'YYYY-MM' strings and rename the 'count' column to the topic's name.
# A new frame is built and stored back in the dictionary instead of mutating the
# filtered slice in place, which avoids pandas' SettingWithCopyWarning.
for position, key in enumerate(key_list):
  topic_counts = month_dict[key]
  month_dict[key] = (
      topic_counts
      # parse first so that sorting is chronological rather than lexicographic
      .assign(YearMonth=pd.to_datetime(topic_counts['YearMonth']))
      .sort_values(by='YearMonth')
      .assign(YearMonth=lambda frame: frame['YearMonth'].dt.strftime('%Y-%m'))
      .rename(columns={'count': topics_title_names[position]})
  )

In [None]:
# Every month from January 2013 through December 2022, as 'YYYY-MM' labels
month_starts = pd.date_range(start="2013-01", end="2022-12", freq='MS')
alldates = [month_start.strftime("%Y-%m") for month_start in month_starts]

In [None]:
# Initialize a dataframe with one row per month ('YearMonth' column only);
# the per-topic counts are merged onto it in the next cell
all_themes_df = pd.DataFrame({'YearMonth' : alldates})

In [None]:
# Left-join each topic's monthly counts onto the full list of months (one column per topic)
for position, key in enumerate(key_list):
  topic_name = topics_title_names[position]
  topic_counts = month_dict[key][['YearMonth', topic_name]]
  all_themes_df = all_themes_df.merge(topic_counts, how='left', on='YearMonth')

# Months without any article for a topic come out of the join as NaN: store 0 instead
all_themes_df = all_themes_df.fillna(0)

In [None]:
!pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import kaleido

In [None]:
# Line chart with one trace per topic (the columns of all_themes_df named in topics_title_names)
fig_themes_month = px.line(all_themes_df, x='YearMonth', y=topics_title_names,
                title="Number of articles per theme, Le Monde", 
                    color_discrete_sequence=colors, markers=True)

fig_themes_month.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of articles",
    legend_title="Theme"
)

# Export only after update_layout, so that the saved file carries the axis and
# legend titles (same order as in the aggregated figure cell)
#fig_themes_month.write_image("fig_themes_month_non_aggregated.png") #static export
fig_themes_month.write_html("fig_themes_month_non_aggregated.html") #dynamic view

fig_themes_month

## Aggregate themes

In [None]:
def _topic_names(index_labels):
  """Return the entries of topics_title_names found at the given index labels."""
  return [topics_title_names[label] for label in index_labels]

# Hand-made grouping of the fitted topics into five broader themes.
# NOTE(review): index 26 is listed under both politics_economy_finance and cop,
# so that topic is counted in both aggregates — confirm this is intended.
scientific = _topic_names([1, 2, 3, 48, 47, 43, 42, 41, 52, 51, 5, 22])
politics_economy_finance = _topic_names(
    [57, 58, 53, 54, 46, 44, 49, 38, 23, 24, 26, 32, 33, 34, 28, 29, 30, 12, 13, 9]
)
energy = _topic_names([4, 15, 21, 31, 45, 39])
extreme_events = _topic_names([55, 35, 20, 40])
cop = _topic_names([8, 26])

agg_topics_names = [
    'scientific',
    'politics_economy_finance',
    'energy',
    'extreme_events',
    'cop',
]

In [None]:
# Copy rather than alias: the aggregate columns added in the next cell would
# otherwise also be written into all_themes_df
aggregated_themes_df = all_themes_df.copy()

In [None]:
# For every aggregated theme, add a column holding the row-wise sum of its member topics
theme_members = {
    'scientific': scientific,
    'politics_economy_finance': politics_economy_finance,
    'energy': energy,
    'extreme_events': extreme_events,
    'cop': cop,
}
for theme_name, member_columns in theme_members.items():
  aggregated_themes_df[theme_name] = aggregated_themes_df[member_columns].sum(axis=1)

In [None]:
# Keep only the month label and the five aggregated theme columns
kept_columns = [
    'YearMonth',
    'scientific',
    'politics_economy_finance',
    'energy',
    'extreme_events',
    'cop',
]
aggregated_themes_df = aggregated_themes_df[kept_columns]

In [None]:
# Wrap the aggregated theme names in a pandas Series (passed as `y` to px.line below)
# and display its type
agg_topics = pd.Series(agg_topics_names)
type(agg_topics)

pandas.core.series.Series

In [None]:
# Line chart of the monthly article counts for the aggregated themes
agg_axis_labels = dict(
    xaxis_title="Date",
    yaxis_title="Number of articles",
    legend_title="Theme (aggregated)",
)

fig_agg_themes_month = px.line(
    aggregated_themes_df,
    x='YearMonth',
    y=agg_topics,
    title="Number of articles per theme (aggregated), Le Monde",
    color_discrete_sequence=colors,
    markers=True,
)
fig_agg_themes_month.update_layout(**agg_axis_labels)

# Exports happen after the layout is final, so the saved file matches what is displayed
#fig_agg_themes_month.write_image("fig_themes_month_aggregated.png") #static export
fig_agg_themes_month.write_html("fig_themes_month_aggregated.html") #dynamic view

fig_agg_themes_month