In [2]:
import pandas as pd
import os
import numpy as np
from pathlib import Path
import sys

In [3]:
# Make the project root importable so the local `backend` package can be found.
# Assumes the notebook is run from <project_root>/notebooks.
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from backend.classes import CORDIS_data, Project_data

# Load the CORDIS data; with enrich=True the loader prints its enrichment steps.
horizon_data = CORDIS_data(parent_dir=project_root, enrich=True)

# Work on a trimmed copy of the enriched project table.
project = horizon_data.project_df.drop(
    columns=[
        # unnecessary columns: either NaN or the same as 'id'
        'projectID_x', 'projectID_y',
        # field columns are not needed on the project table in this notebook
        'field_class', 'field', 'subfield', 'niche',
    ]
)

Enriching the projects dataset with temporal information.
Enriching the projects dataset with people and institutions information.
Enriching the projects dataset with financial information.
Enriching the projects dataset with thematic / scientific information.
Enriched project_df with scientific and thematic information: 15863 projects
Columns of the project dataframe after enrichment:
  - 35 columns
  - Columns: id, acronym, status, title, startDate, endDate, totalCost, ecMaxContribution, legalBasis, topics, ecSignatureDate, frameworkProgramme, masterCall, subCall, fundingScheme, nature, objective, contentUpdateDate, rcn, grantDoi, duration_days, duration_months, duration_years, projectID_x, n_institutions, projectID_y, institutions, projectID, coordinator_name, ecContribution_per_year, totalCost_per_year, field_class, field, subfield, niche


KeyError: 'path'

In [12]:
# Columns of the project dataframe that are summed per field and can be
# shown as the value of a node in the diagram.
metrics_list = [
    # cost / contribution columns
    'totalCost',
    'ecMaxContribution',
    'totalCost_per_year',
    'ecContribution_per_year',
    # duration columns
    'duration_days',
    'duration_months',
    'duration_years',
    # number of institutions
    'n_institutions',
]

Make a dataframe with information per field/topic. The subcategories are stored in separate columns for use in treemaps or sunburst diagrams.

In [13]:
# One row per (project, EuroSciVoc entry): a project tagged with several
# fields appears once for each of them.
euroscivoc = horizon_data.sci_voc_df
fields = pd.merge(project, euroscivoc, left_on='id', right_on='projectID', how='outer')
fields = fields.drop(columns=['projectID_x', 'projectID_y'])

# Sum every metric over all rows sharing a euroSciVocTitle.
# groupby drops missing keys by default, so rows without a title are left out.
metrics_per_euroSciVocTitle = fields.groupby('euroSciVocTitle')[metrics_list].sum()

# Keep one descriptive row per title and replace the per-project metric
# values with the per-title totals computed above.
fields = fields.drop_duplicates('euroSciVocTitle').set_index('euroSciVocTitle')
fields = fields.drop(columns=metrics_list)
fields = pd.merge(fields, metrics_per_euroSciVocTitle, on='euroSciVocTitle')


def _split_levels(slash_separated, prefix):
    """Split a Series of '/a/b/c' strings into one column per hierarchy level.

    The segment before the first '/' is discarded (it is empty when the
    string starts with '/'). The remaining segments become columns named
    f'{prefix}_lvl_0', f'{prefix}_lvl_1', ... Rows with fewer segments than
    the deepest row have missing values in the deeper columns.
    """
    parts = slash_separated.str.split('/', expand=True).iloc[:, 1:]
    # After the slice the column labels are 1..k; shift them to start at 0.
    return parts.rename(columns=lambda position: f'{prefix}_lvl_{position - 1}')


# Add the code and path levels as separate columns in one vectorised step
# instead of assigning cell by cell.
fields = pd.concat(
    [
        fields,
        _split_levels(fields['euroSciVocCode'], 'code'),
        _split_levels(fields['euroSciVocPath'], 'path'),
    ],
    axis=1,
)

Make lists for labels, parents and values for treemap/sunburst diagram structure. These are all that is needed for visualization here.

In [14]:
# Flat node lists describing the hierarchy: labels[k] is a node name,
# parents[k] is the name of its parent ('' for top-level nodes) and
# values_dict[metric][k] is the node's total for that metric.
labels = []
parents = []
values_dict = {metric: [] for metric in metrics_list}

# Take the depth of the hierarchy from the data instead of hard-coding it.
n_levels = sum(str(column).startswith('path_lvl_') for column in fields.columns)

for level in range(n_levels):
    level_col = f'path_lvl_{level}'

    # Rows whose path is shallower than this level have no entry here; skip
    # them so they do not become a node with a missing label.
    rows_at_level = fields.dropna(subset=[level_col])

    # drop_duplicates (keeps first) and groupby(sort=False) both order nodes
    # by first appearance, so labels, parents and values stay aligned.
    # NOTE(review): nodes are identified by name only, so a name that occurs
    # under two different parents at the same level is merged into one node.
    first_rows = rows_at_level.drop_duplicates(level_col)
    new_labels = first_rows[level_col].to_list()
    labels += new_labels

    if level == 0:
        parents += [''] * len(new_labels)
    else:
        parents += first_rows[f'path_lvl_{level - 1}'].to_list()

    # A node's value is the sum over every title at or below it.
    level_totals = rows_at_level.groupby(level_col, sort=False)[metrics_list].sum()
    for metric in metrics_list:
        values_dict[metric] += level_totals[metric].to_list()

## Plot

In [15]:
import plotly.graph_objects as go

# Metric shown in the diagram; any entry of metrics_list works, e.g. 'n_institutions'.
metric = 'ecMaxContribution'
values = values_dict[metric]

print(f'showing {metric}')

# branchvalues='total': each node's value already includes its children.
# maxdepth=3 limits how many levels are drawn at once.
sunburst_trace = go.Sunburst(
    labels=labels,
    parents=parents,
    values=values,
    insidetextorientation='radial',
    branchvalues='total',
    maxdepth=3,
)

fig = go.Figure(sunburst_trace)
# Remove the margins so the diagram fills the output area.
fig.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})

fig.show()

showing ecMaxContribution


todo:
- make the beginning of this notebook part of the data preprocessing
- add better names and description of each metric

Notes:
- projects with no EuroSciVoc title specified are dropped
- for projects that belong to several fields, the project's full ecMaxContribution (or other metric) is added to each of those fields, so such projects are counted more than once. (alternatively: distribute it equally among the categories??)