- Importing necessary libraries
- Utility functions for Excel formatting (e.g., adjusting column width)

In [2]:
# Collect Eurostat data
from pyjstat import pyjstat


# Data Manipulation
import pandas as pd
import numpy as np
from datetime import datetime

# Excel Formatting Libraries
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import PatternFill, Font, Border, Side, Alignment
from openpyxl.utils import get_column_letter
#from openpyxl.styles import NumberFormat

In [3]:
# Download GDP figures from the Eurostat dissemination API (dataset nama_10_gdp).
# The query string is spelled out piece by piece; adjacent string literals are
# concatenated by Python, so the final URL is one single string.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/nama_10_gdp"
    "?format=JSON"
    # reference years
    "&time=2020&time=2021&time=2022"
    # reporting entities (geo codes)
    "&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES"
    "&geo=FR&geo=HR&geo=IT&geo=CY&geo=LV&geo=LT&geo=LU&geo=HU&geo=MT"
    "&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE"
    "&geo=IS&geo=LI&geo=NO&geo=CH&geo=UK"
    "&geo=BA&geo=ME&geo=MK&geo=AL&geo=RS&geo=TR&geo=XK"
    # unit of measure, national-accounts item, label language
    "&unit=CP_MEUR&na_item=B1GQ&lang=en"
)

# Fetch and parse the JSON-stat response
dataset = pyjstat.Dataset.read(url)

# Convert the parsed dataset into a pandas DataFrame
gdp = dataset.write('dataframe')

In [4]:
# Preview the first rows to check which columns the API response produced
gdp.head()

Unnamed: 0,Time frequency,Unit of measure,National accounts indicator (ESA 2010),Geopolitical entity (reporting),Time,value
0,Annual,"Current prices, million euro",Gross domestic product at market prices,Belgium,2020,463750.9
1,Annual,"Current prices, million euro",Gross domestic product at market prices,Belgium,2021,506023.2
2,Annual,"Current prices, million euro",Gross domestic product at market prices,Belgium,2022,563543.6
3,Annual,"Current prices, million euro",Gross domestic product at market prices,Bulgaria,2020,61912.5
4,Annual,"Current prices, million euro",Gross domestic product at market prices,Bulgaria,2021,71378.4


In [5]:
def adjust_column_width(worksheet, padding=2):
    """Resize every column of ``worksheet`` to fit its longest cell value.

    For each column the width is set to the number of characters in the
    longest ``str(cell.value)`` found in that column, plus ``padding``.
    Numeric values are measured through their string form, and empty cells
    (``None``) count as zero characters.

    Parameters
    ----------
    worksheet : openpyxl worksheet
        Sheet to resize; it is modified in place through
        ``worksheet.column_dimensions``.
    padding : int, optional
        Extra characters added to the longest value so the text does not
        touch the cell border. Defaults to 2.

    Notes
    -----
    The length is taken from the raw cell value, not from the text Excel
    renders after applying a number format, so values with long formats
    (currency symbols, thousands separators) may still need more room.
    """
    for column in worksheet.columns:
        # All cells of the tuple share one column index; use the first cell's.
        column_letter = get_column_letter(column[0].column)

        # Measure the string form of every non-empty cell. The previous
        # version called len() on the raw value, which raised for numbers and
        # was silently swallowed, so numeric cells never widened the column.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )

        worksheet.column_dimensions[column_letter].width = max_length + padding


Set the formatting styles

In [7]:
# Fill used for the header row: solid colour 'FFCCCCCC'
light_greyFill = PatternFill(fill_type='solid',
                             start_color='FFCCCCCC',
                             end_color='FFCCCCCC')

# Borders: the same line style and colour is applied to all four edges
thin_border = Border(**{
    edge: Side(style='thin', color='00000000')
    for edge in ('left', 'right', 'top', 'bottom')
})

thick_border = Border(**{
    edge: Side(style='thick', color='00000000')
    for edge in ('left', 'right', 'top', 'bottom')
})

### Data Cleaning

- Dropping null values
- Filling missing values
- Replacing specific values in the dataset

In [9]:
# Discard observations with no reported value.
# Rebinding the name (instead of inplace=True) avoids mutating the frame that
# earlier cells displayed, and a list-valued `subset` is accepted by every
# pandas version (a bare string is only accepted by newer releases).
gdp = gdp.dropna(subset=['value'])

In [10]:
# Wide layout: one row per reporting entity, one column per (unit, year) pair
gdp_pivot = gdp.pivot_table(
    values='value',
    index='Geopolitical entity (reporting)',
    columns=['Unit of measure', 'Time'],
)

In [11]:
# Turn the entity index back into an ordinary column
gdp_pivot = gdp_pivot.reset_index()

In [12]:
# Largest 2022 value first; the sort key is the two-level (unit, year) column label
gdp_pivot = gdp_pivot.sort_values(
    by=[('Current prices, million euro', '2022')],
    ascending=False,
)

In [13]:
# Flatten the two-level column labels into single strings ("<unit> <year>").
# The entity column has an empty second level, so its flattened name keeps a
# trailing space: 'Geopolitical entity (reporting) '.
gdp_pivot.columns = [' '.join(levels) for levels in gdp_pivot.columns]

In [14]:
# Export the flattened, sorted table with pandas' default styling (index column included).
# NOTE(review): the following cell writes the same frame to 'data_without_format.xlsx';
# one of the two exports looks like a leftover — confirm which file is needed.
gdp_pivot.to_excel('without format.xlsx')

In [15]:
# Export the flattened, sorted table with pandas' default styling (index column included).
# NOTE(review): the preceding cell writes the same frame to 'without format.xlsx';
# one of the two exports looks like a leftover — confirm which file is needed.
gdp_pivot.to_excel('data_without_format.xlsx')

In [16]:
# Table as a list of dicts, one dict per row keyed by column name.
# NOTE(review): `gdp_pivot_` is not referenced by the cells visible here — confirm it is still needed.
gdp_pivot_ = gdp_pivot.to_dict(orient='records')

### Transformation in Excel

- Grouping and pivoting data
- Merging datasets for transformation
- Excel formatting

In [19]:
# Number of data rows that will be written to the sheet (header excluded).
# Taken from the frame length rather than by counting one named column, so it
# does not depend on the exact (trailing-space) name of the entity column.
num_of_rows = len(gdp_pivot)

In [21]:
# Create an empty workbook and copy the frame into its active sheet:
# the header first, then one worksheet row per DataFrame row.
wb = Workbook()
ws = wb.active
# index=False on purpose: with index=True an extra row appears between the
# header and the values.
for record in dataframe_to_rows(gdp_pivot, header=True, index=False):
    ws.append(record)

In [30]:
# Map every DataFrame column name to the Excel letter of the worksheet column
# that holds it, by pairing the names with the header cells of row 1.
# The letter is derived from the cell's column index instead of stripping the
# row digit out of the coordinate string, and row 1 is read only once.
list_of_columns_name = gdp_pivot.columns
dict_col = {
    name: get_column_letter(header_cell.column)
    for name, header_cell in zip(list_of_columns_name, ws[1])
}


In [31]:
# Split the flattened column names by prefix:
# names starting with 'Geopolitical ' vs. names starting with 'Current '
list_counties = gdp_pivot.columns[gdp_pivot.columns.str.startswith('Geopolitical ')]
list_values = gdp_pivot.columns[gdp_pivot.columns.str.startswith('Current ')]

In [32]:
# Thin border around every populated cell: row 1 (header) plus num_of_rows data rows
for letter in dict_col.values():
    for row_number in range(1, num_of_rows + 2):
        ws[f'{letter}{row_number}'].border = thin_border

In [33]:
# Entity-name column(s): bold and horizontally centred, header row included
for name in list_counties:
    letter = dict_col[name]
    for row_number in range(1, num_of_rows + 2):
        cell = ws[f'{letter}{row_number}']
        cell.font = Font(bold=True)
        cell.alignment = Alignment(horizontal='center')

In [34]:
# Value columns: right-aligned with a euro number format, header row included
for name in list_values:
    letter = dict_col[name]
    for row_number in range(1, num_of_rows + 2):
        cell = ws[f'{letter}{row_number}']
        cell.alignment = Alignment(horizontal='right')
        cell.number_format = '€#,##0.00'

In [35]:
# Header row (row 1): bold, centred and wrapped text, thick border, grey fill.
# Applied after the column-wide formats so these settings win for row 1.
for letter in dict_col.values():
    header_cell = ws[f'{letter}1']
    header_cell.font = Font(bold=True)
    header_cell.alignment = Alignment(horizontal='center', vertical='center', wrapText=True)
    header_cell.border = thick_border
    header_cell.fill = light_greyFill

In [36]:
# Size each column of the sheet based on its cell contents
adjust_column_width(ws)

In [37]:
# Write the formatted workbook to disk
wb.save('Manipulations.xlsx')