In [1]:
import pandas as pd
import os
import numpy as np
from pathlib import Path
import sys

In [2]:
# 0) set up
from pathlib import Path
import sys
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Make the parent of the notebook's working directory importable so that the
# local `backend` package can be resolved by the import below.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    # Guard keeps this cell idempotent: re-running it does not pile up
    # duplicate entries on sys.path.
    sys.path.append(str(project_root))
from backend.classes import CORDIS_data

# Instantiate the data container from the local backend package.
h = CORDIS_data(parent_dir=project_root, enrich=False)

# rename for brevity
proj = h.project_df
link = h.project_sci_voc       # columns: project_id, sci_voc_code
sci  = h.sci_voc_df            # columns: code, path, title, description


In [3]:
# 1) Metrics (snake_case names) that are pulled from the project table and
#    aggregated per topic further down.
metrics_list = [
    # funding
    'total_cost', 'ec_max_contribution',
    'total_cost_per_year', 'ec_contribution_per_year',
    # duration
    'duration_days', 'duration_months', 'duration_years',
    # consortium size
    'n_institutions',
]


Build a dataframe with the metrics per field/topic. Each level of the topic hierarchy (the subcategories) is stored in its own column so the data can be used in treemaps or sunburst diagrams.

In [4]:
# 2) Assemble one row per (project, theme) link, carrying the topic
#    attributes and every metric listed in `metrics_list`.
sci_cols = {
    'code': 'sci_voc_code',
    'path': 'sci_voc_path',
    'title': 'sci_voc_title',
}
topics = sci.rename(columns=sci_cols)
project_metrics = proj[['id'] + metrics_list].rename(columns={'id': 'project_id'})

# Left joins keep every link row, even when no topic or project matches.
df = link.merge(topics, on='sci_voc_code', how='left')
df = df.merge(project_metrics, on='project_id', how='left')

# Split the slash-delimited path into one column per hierarchy level
# (path_lvl_0, path_lvl_1, ...); shorter paths are padded with missing values.
path_parts = df['sci_voc_path'].str.strip('/').str.split('/', expand=True)
levels = path_parts.add_prefix('path_lvl_')

df = pd.concat([df, levels], axis=1)


Build the lists of labels, parents and values that define the treemap/sunburst hierarchy. These lists are all that is needed for the visualization here.

In [5]:
# Flatten the topic hierarchy into the parallel lists used by the
# sunburst/treemap trace: one entry per node in `labels`, the label of its
# parent in `parents` ('' for root nodes) and, per metric, the summed value of
# every row that falls under that node in `values_dict[metric]`.
labels = []
parents = []
values_dict = {m: [] for m in metrics_list}
max_lvl = 4  # deepest hierarchy level to include (levels are 0-based)

# NOTE(review): nodes are keyed by their title only. A title that occurs under
# two different parents (or at two levels) is attached to the first parent
# seen and appears more than once in `labels` — confirm titles are unique.
for lvl in range(max_lvl+1):
    lvl_col     = f'path_lvl_{lvl}'
    if lvl_col not in df.columns:
        # The paths are shallower than max_lvl: there is nothing deeper to
        # add, so stop instead of failing with a KeyError.
        break
    parent_col  = f'path_lvl_{lvl-1}' if lvl>0 else None

    # unique nodes at this level, in order of first appearance
    uniques = df[lvl_col].dropna().unique().tolist()
    labels.extend(uniques)

    # parents: empty for root, else the parent string
    if lvl==0:
        parents.extend(['']*len(uniques))
    else:
        # Resolve all parents in a single pass: the first row of each node
        # supplies its parent, which is the same choice a per-node lookup of
        # the first matching row makes, without rescanning df for every node.
        first_parent = (
            df.loc[df[lvl_col].notna(), [lvl_col, parent_col]]
              .drop_duplicates(subset=lvl_col)
              .set_index(lvl_col)[parent_col]
        )
        parents.extend(first_parent.reindex(uniques).tolist())

    # now collect sums per metric
    grp = df.groupby(lvl_col)
    for m in metrics_list:
        # reindex so the sums line up with the order of `uniques`
        vals = grp[m].sum().reindex(uniques).fillna(0).tolist()
        values_dict[m].extend(vals)


## Plot

In [6]:
# Choose the metric to visualise and render the hierarchy as a sunburst.
metric = 'ec_max_contribution'

sunburst = go.Sunburst(
    labels=labels,
    parents=parents,
    values=values_dict[metric],
    branchvalues='total',  # each node's value already includes its descendants
    maxdepth=3,
)

fig = go.Figure(sunburst)
# No margins: let the chart fill the whole output area.
fig.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})
fig.show()


todo:
- make the beginning of this notebook part of the data preprocessing
- add better names and descriptions for each metric

Notes:
- Projects with no EuroSciVoc title specified are dropped.
- Projects that belong to several fields contribute their full ecMaxContribution (or other metric) to each of those fields, so the same amount is counted once per field. (Alternative: distribute the value equally among the categories?)