In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data provenance
#   Dataset:  https://www.kaggle.com/tunguz/pharmaceutical-drug-spending-by-countries
#   Notebook: https://www.kaggle.com/michielvanuytsel/pharmaceutical-drug-spending-by-countries-eda/edit
#   Downloaded from the working folder with:
#     kaggle datasets download -d tunguz/pharmaceutical-drug-spending-by-countries --path ../input --unzip
csv_path = '../input/pharmaceutical-drug-spending-by-countries/data_csv.csv'
df_file = pd.read_csv(csv_path)

# Preview the first rows of the raw data
df_file.head()

# Initial dataset profiling using pandas_profiling
Pandas profiling offers an easy way to analyze the dataset from several perspectives, such as per-variable analysis and interactions between variables.

In [None]:
import pandas_profiling

# Build the profiling report for the raw frame (progress bar disabled)
# and display it through the report's widget view.
pandas_profile = pandas_profiling.ProfileReport(
    df_file,
    progress_bar=False,
)
pandas_profile.to_widgets()

# Data cleaning
Based on the pandas profiling report, let's clean the data, as our results will only be as good as the data we use as input.

In [None]:
# Clean the already existing columns:
#   - drop the FLAG_CODES column
#   - store LOCATION as a categorical
#   - index the rows by (LOCATION, TIME)
#   - sort the index so every LOCATION's rows are in chronological order;
#     the per-LOCATION year-over-year deltas and the line plots further down
#     depend on that ordering
df_clean = (
    df_file
    .drop(columns=['FLAG_CODES'])  # list-like of labels, as documented for DataFrame.drop
    .astype({'LOCATION': 'category'})
    .set_index(['LOCATION', 'TIME'])
    .sort_index()
)
df_clean

In [None]:
# Additional column derivations

# Considering that:
#   PC_HEALTHXP = drug spending / total healthcare spending
#   PC_GDP      = drug spending / total GDP
#               = (drug spending / healthcare spending) * (healthcare spending / GDP)
#               = PC_HEALTHXP * (healthcare spending / GDP)
# we can derive total healthcare spending as a share of GDP:
#   PC_HCSpending_GPD = (healthcare spending / GDP) = PC_GDP / PC_HEALTHXP
# The factor 100 expresses that ratio as a percentage (presumably to match the
# other PC_* columns - confirm against the dataset description).
# NOTE: the column name keeps its original "GPD" spelling because later cells refer to it.
df_clean["PC_HCSpending_GPD"] = df_clean["PC_GDP"] * 100 / df_clean["PC_HEALTHXP"]

# Let's also add the delta with respect to the previous row for the relevant values.
# The diff is taken per LOCATION; otherwise we would also create a delta between the
# last TIME of one LOCATION and the first TIME of the next LOCATION.
# This relies on the rows being ordered by TIME within each LOCATION.
delta_columns = ["PC_HEALTHXP", "PC_GDP", "USD_CAP", "TOTAL_SPEND", "PC_HCSpending_GPD"]

# Compute the grouped diff once, and only on the columns we need, instead of
# re-diffing the whole frame on every loop iteration.
# observed=True only affects how unused categories are handled and does not change diff().
deltas = df_clean.groupby('LOCATION', observed=True)[delta_columns].diff()
for column in delta_columns:
    df_clean["delta_{}".format(column)] = deltas[column]

df_clean.head(10)

# Data Analysis

In [None]:
!pip install pandas_visual_analysis --quiet
from pandas_visual_analysis import VisualAnalysis
VisualAnalysis(df_clean.reset_index(), categorical_columns=["LOCATION"])
# The widgets don't always seem to load correctly in Kaggle, so my apologies if they don't show in Kaggle

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Turn the (LOCATION, TIME) index into regular columns so plotly can label the
# x-axis and the legend with the column names instead of the generic "x" / "color".
trend_df = df_clean.reset_index()

# Mean PC_HEALTHXP over all locations for each year (computed once, on the needed column only)
yearly_mean = trend_df.groupby('TIME')["PC_HEALTHXP"].mean()

fig = px.line(trend_df, x="TIME", y="PC_HEALTHXP", color="LOCATION")
fig.add_trace(go.Scatter(
    x=yearly_mean.index,
    y=yearly_mean,
    mode='lines',
    line={'width': 6, 'color': 'black'},
    name='Mean',
))
fig.show()
# We see fluctuations in PC_HEALTHXP across the years, but at first sight there isn't a direct pattern.
# We had already seen during the pandas_profiling that there was no strong correlation between TIME and
# PC_HEALTHXP, but it is always best to also look at the graph in case there are periodic patterns, etc.

In [None]:
# px and go are imported in the line-plot cell above.

# Regular LOCATION column so the legend is titled "LOCATION" instead of "color"
scatter_df = df_clean.reset_index()

# Bin USD_CAP into 15 equal-width bins (computed once).
# pd.cut returns 16 bin edges for 15 bins, so the mean of each bin is drawn at
# the bin's midpoint to keep x and y the same length and correctly aligned.
usd_cap_bin, bin_edges = pd.cut(scatter_df["USD_CAP"], 15, retbins=True)
bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
# observed=False keeps empty bins (as NaN) so the means stay aligned with the midpoints
binned_mean = scatter_df.groupby(usd_cap_bin, observed=False)["PC_GDP"].mean()

fig = px.scatter(scatter_df, x="USD_CAP", y="PC_GDP", color="LOCATION")
fig.add_trace(go.Scatter(
    x=bin_midpoints,
    y=binned_mean.values,
    mode='lines',
    line={'width': 6, 'color': 'black'},
    name='Mean',
))
fig.show()
# We did see a stronger correlation during the pandas_profiling between PC_GDP and USD_CAP.
# We see this returning in the scatter plot below.

In [None]:
# px and go are imported in the line-plot cell above.

# Regular LOCATION column so the legend is titled "LOCATION" instead of "color"
scatter_df = df_clean.reset_index()

# Bin USD_CAP into 15 equal-width bins (computed once).
# pd.cut returns 16 bin edges for 15 bins, so the mean of each bin is drawn at
# the bin's midpoint to keep x and y the same length and correctly aligned.
usd_cap_bin, bin_edges = pd.cut(scatter_df["USD_CAP"], 15, retbins=True)
bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
# observed=False keeps empty bins (as NaN) so the means stay aligned with the midpoints
binned_mean = scatter_df.groupby(usd_cap_bin, observed=False)["PC_HEALTHXP"].mean()

fig = px.scatter(scatter_df, x="USD_CAP", y="PC_HEALTHXP", color="LOCATION")
fig.add_trace(go.Scatter(
    x=bin_midpoints,
    y=binned_mean.values,
    mode='lines',
    line={'width': 6, 'color': 'black'},
    name='Mean',
))
fig.show()
# However, we do not see the same correlation in PC_HEALTHXP, which seems to suggest that if nations
# spent more per capita on drugs, it is because they spent more on healthcare in general.
# Thus there is no/little correlation between USD_CAP and PC_HEALTHXP.

In [None]:
# px and go are imported in the line-plot cell above.

# Regular LOCATION column so the legend is titled "LOCATION" instead of "color"
scatter_df = df_clean.reset_index()

# Bin USD_CAP into 15 equal-width bins (computed once).
# pd.cut returns 16 bin edges for 15 bins, so the mean of each bin is drawn at
# the bin's midpoint to keep x and y the same length and correctly aligned.
usd_cap_bin, bin_edges = pd.cut(scatter_df["USD_CAP"], 15, retbins=True)
bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
# observed=False keeps empty bins (as NaN) so the means stay aligned with the midpoints
binned_mean = scatter_df.groupby(usd_cap_bin, observed=False)["PC_HCSpending_GPD"].mean()

fig = px.scatter(scatter_df, x="USD_CAP", y="PC_HCSpending_GPD", color="LOCATION")
fig.add_trace(go.Scatter(
    x=bin_midpoints,
    y=binned_mean.values,
    mode='lines',
    line={'width': 6, 'color': 'black'},
    name='Mean',
))
fig.show()
# In the graph below you see that healthcare spending as a share of GDP does increase with USD_CAP.
# So an increase in spending per capita does not come with a proportionally equal increase in GDP,
# as otherwise we would not see a correlation (see the analysis between PC_HEALTHXP and USD_CAP).