In [18]:
# loading libraries
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
# Make seaborn's 'whitegrid' style the default theme for later matplotlib/seaborn figures.
sns.set_theme(style='whitegrid')

In [2]:
# Loading datasets
# The data directory defaults to the original local folder. Set the
# DATANALYZE_DATA_DIR environment variable to read the same three workbooks
# from a different location without editing this cell.
folder = os.environ.get(
    'DATANALYZE_DATA_DIR',
    'c:/Users/saaye/OneDrive/Documents/Machine Learning Projects/DATAnalyze/data/',
)
# os.path.join copes with the folder being given with or without a trailing slash.
df1 = pd.read_excel(os.path.join(folder, 'DIGITAL_ECONOMY_SOCIETY-GLOBAL.xlsx'))
df2 = pd.read_excel(os.path.join(folder, 'HEALTH INDICATOR-GLOBAL.xlsx'))
df3 = pd.read_excel(os.path.join(folder, 'HEALTH STATUS- GLOBAL.xlsx'))

In [3]:
# Drop the columns that are not needed downstream (per the original note,
# most of them hold a single repeated observation).
df1 = df1.drop(columns=[
    'Super Region',
    'Source',
    'Information society indicator',
    'Sub-Sector',
    'Time Level',
    'Sub-Sector Level 2',
    'Sub-Sector Level 3',
])
df2 = df2.drop(columns=[
    'Sub-Sector',
    'Super Region',
    'SOURCE ORGANIZATION',
    'Indicator Name',
    'Source',
    'Special Notes',
    'Region',
])
df3 = df3.drop(columns=[
    'Sub-Sector',
    'Super Region',
    'Measure',
    'Frequency',
    'Source',
    'Indicator',
])

In [4]:
# Keep only the health rows whose Date also occurs in the digital dataset,
# so all three frames cover the same reporting dates.
digital_dates = list(df1['Date'].unique())
df2 = df2.loc[df2['Date'].isin(digital_dates)]
df3 = df3.loc[df3['Date'].isin(digital_dates)]

In [5]:
# Give the country column the same name ('Country Name') so it can serve as a merge key.
df1 = df1.rename(columns={'Geopolitical entity (reporting)': 'Country Name'})
df3 = df3.rename(columns={'Region': 'Country Name'})

In [6]:
# Single inner join of the digital and health-indicator frames on the pair
# (Country Name, Date): only country/date combinations present in both survive.
# Columns that share a name are disambiguated with _digital / _health suffixes.
# NOTE(review): df3 is prepared in the cells above but is not merged here — confirm intended.
merge_keys = ['Country Name', 'Date']
data = df1.merge(
    df2,
    how='inner',
    on=merge_keys,
    sort=True,
    suffixes=('_digital', '_health'),
)

In [7]:
# Parse Date as datetimes and keep only the year component
# (per the original note, every record is an end-of-year report).
data['Date'] = pd.to_datetime(data['Date']).dt.year

In [24]:
# Order rows by the digital indicator value and store the year as text
# (presumably so plots treat it as a category rather than a number — confirm).
data = data.sort_values(by=['Value_digital'])
data = data.assign(Date=data['Date'].astype(str))

In [9]:
# Many rows share the same values in several columns, so they could be grouped and averaged for visualisation.
# The aggregation below is currently commented out, so `data` is left ungrouped.
#data = data.groupby(['Country Name', 'Date', 'Individual type', 'Unit of measure', 'Income Group']).agg('mean')

In [10]:
# Write the cleaned frame to a CSV file in the current working directory
# (the row index is written too, as this call does not disable it).
clean_csv_path = 'clean_data.csv'
data.to_csv(clean_csv_path)

In [29]:
# Show the merged frame as the cell output (the recorded output below is a truncated view).
data

Unnamed: 0,Value_digital,Date,Country Name,Individual type,Unit of measure,Value_health,Income Group
182,15,2009,SERBIA,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS,19.5,UPPER MIDDLE INCOME
176,17,2009,ROMANIA,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS,12.2,UPPER MIDDLE INCOME
114,17,2009,IRELAND,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS,14.0,HIGH INCOME
92,18,2009,GREECE,ALL INDIVIDUALS,PERCENTAGE OF INDIVIDUALS,3.6,HIGH INCOME
178,19,2010,ROMANIA,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS,14.0,UPPER MIDDLE INCOME
...,...,...,...,...,...,...,...
137,67,2012,LITHUANIA,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS WHO USED INTERNET IN...,35.0,HIGH INCOME
156,73,2012,NETHERLANDS,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS,10.7,HIGH INCOME
151,74,2012,MONTENEGRO,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS WHO USED INTERNET IN...,21.0,UPPER MIDDLE INCOME
157,74,2012,NETHERLANDS,ACTIVE LABOUR FORCE (EMPLOYED AND UNEMPLOYED),PERCENTAGE OF INDIVIDUALS WHO USED INTERNET IN...,10.7,HIGH INCOME


In [140]:
# Bar chart of the digital indicator per country: one facet column per income
# group, one facet row per year, bars coloured by unit of measure.
fig = px.bar(
    data,
    x='Country Name',
    y='Value_digital',
    color='Unit of measure',
    facet_row='Date',
    facet_col='Income Group',
    hover_name='Individual type',
)

# Unlink the y-axes so each facet gets its own scale, show tick labels on
# every facet, and blank out the per-facet axis titles.
fig.update_yaxes(matches=None, title='', showticklabels=True)
fig.update_xaxes(title='')


def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' in an annotation's label."""
    annotation.update(text=annotation.text.split("=")[-1])


fig.for_each_annotation(_strip_facet_prefix)

# Put one descriptive title on the primary y-axis only.
fig.update_layout(yaxis={'title': "Percentage of Individuals Online"})
fig.show()