# Visual Analysis of the data
* Explorations of categorical and numerical features
* Chi-Square-Test for independence of EU membership and Level of development
* Visualize Correlations
* Comparison of features: "Developed" vs "Developing" Countries
* Comparison of features: EU vs non-EU Countries (the latter split by Level of development)

In [None]:
# packages

# standard
import pandas as pd
import numpy as np

# plot
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# statistics
from scipy.stats import chi2_contingency

In [None]:
# Read the semicolon-separated data set and preview the first rows.
df = pd.read_csv(
    '../input/women-entrepreneurship-and-labor-force/Dataset3.csv',
    sep=';',
)
df.head()

# Basic Explorations

## Categorical Features

In [None]:
# print the content of the 'Country' column as a plain Python list
print(df['Country'].tolist())

In [None]:
# number of entries in the 'Country' column (one per row of the data set)
print('Number of countries: ', len(df['Country']))

In [None]:
# names of the categorical columns explored below
features_cat = [
    'Level of development',
    'European Union Membership',
    'Currency',
]

In [None]:
# one bar chart of the category frequencies per categorical feature
for cat_col in features_cat:
    level_counts = df[cat_col].value_counts()
    level_counts.plot.bar()
    plt.title(cat_col)
    plt.grid()
    plt.show()

### Cross Tables and Chi-Square-Test

In [None]:
# contingency table: rows = level of development, columns = EU membership
ctab = pd.crosstab(
    index=df['Level of development'],
    columns=df['European Union Membership'],
)
ctab

All EU members are "Developed", but there are 7 more "Developed" countries outside of the EU.

In [None]:
# Chi Square Independence Test - Null Hypothesis H0: EU Membership and Level of development are independent
# Use SciPy's default statistic (Pearson's chi-squared). Passing
# lambda_="log-likelihood" would compute the G-test statistic instead, which
# does not match the "Chi^2" labels used when the result is reported.
# NOTE: with SciPy's default correction=True, Yates' continuity correction is
# applied whenever the table has one degree of freedom (e.g. a 2x2 table).
chi2, p, dof, expected = chi2_contingency(ctab.to_numpy())

In [None]:
# report the test statistic and the p-value of the independence test
print('Chi^2 Test Statistic: ', chi2)
print('Chi^2 Independence Test - p-value: ', p)  # label typo fixed ("Indepence")

In [None]:
# frequencies the cross table would show if H0 (independence) were true
print(
    'Expected frequencies (assuming H0) based on the marginal sums:',
    expected,
    sep='\n',
)

In [None]:
# compare expected frequencies vs. actual cross table

# The expected frequencies are a bare array; re-attach the row/column labels
# of the observed table so that both heatmaps carry the same tick labels.
expected_df = pd.DataFrame(expected, index=ctab.index, columns=ctab.columns)

# Shared colour range derived from the data (instead of a hard-coded maximum)
# so that the colours of both panels stay in sync for any input table.
color_max = max(ctab.to_numpy().max(), expected_df.to_numpy().max())
heatmap_kws = dict(cbar=True, square=True, vmin=0, vmax=color_max,
                   cbar_kws={'shrink': .3}, annot=True)

# plot heatmaps side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
sns.heatmap(data=ctab, ax=ax1, **heatmap_kws)
ax1.set_title('Observed')
sns.heatmap(data=expected_df, ax=ax2, **heatmap_kws)
ax2.set_title('Expected (assuming H0)')
plt.show()

### The matrix of expected frequencies looks quite different from our actual cross table. This is confirmed by the extremely small p-value: the large observed Chi^2 statistic (i.e. a strong deviation from what independence would produce) is statistically significant, so we have to reject the null hypothesis of independence and can safely assume that EU Membership and Level of development are clearly dependent in our context.

### For the sake of completeness let's look also at the other two pairs:

In [None]:
# contingency table: rows = EU membership, columns = currency
ctab2 = pd.crosstab(
    index=df['European Union Membership'],
    columns=df['Currency'],
)
ctab2

In [None]:
# contingency table: rows = level of development, columns = currency
ctab3 = pd.crosstab(
    index=df['Level of development'],
    columns=df['Currency'],
)
ctab3

## Numerical Features

In [None]:
# names of the numerical columns explored below
features_num = [
    'Women Entrepreneurship Index',
    'Entrepreneurship Index',
    'Inflation rate',
    'Female Labor Force Participation Rate',
]

In [None]:
# one histogram per numerical feature
for num_col in features_num:
    df[num_col].plot.hist()
    plt.title(num_col)
    plt.grid()
    plt.show()

In [None]:
# descriptive statistics of the numerical columns
df.loc[:, features_num].describe()

In [None]:
# pairwise scatter plot including regression lines
sns.pairplot(
    df[features_num],
    kind='reg',
    plot_kws={'line_kws': {'color': 'magenta'}, 'scatter_kws': {'alpha': 0.5}},
)
# pairplot draws a grid of subplots: plt.title() would only label the last
# (bottom-right) panel, so set a figure-level title instead. y > 1 lifts it
# above the top row of panels.
plt.suptitle('Numerical Features - Pairplot', y=1.02)
plt.show()

### Correlations

In [None]:
# Pearson (linear) correlation matrix of the numerical features
cor_pearson = (
    df.loc[:, features_num]
      .corr(method='pearson')
)

In [None]:
# Pearson correlations as an annotated heatmap.
# Pin the diverging colour map to the full correlation range [-1, 1] so that
# zero maps to the neutral midpoint and a colour means the same value
# regardless of the range present in the data.
sns.heatmap(cor_pearson, cmap=plt.cm.RdYlGn, annot=True, vmin=-1, vmax=1)
plt.show()

In [None]:
# Spearman (rank) correlation matrix of the numerical features
cor_spearman = (
    df.loc[:, features_num]
      .corr(method='spearman')
)

In [None]:
# Spearman correlations as an annotated heatmap.
# Pin the diverging colour map to the full correlation range [-1, 1] so that
# zero maps to the neutral midpoint and a colour means the same value
# regardless of the range present in the data.
sns.heatmap(cor_spearman, cmap=plt.cm.RdYlGn, annot=True, vmin=-1, vmax=1)
plt.show()

In [None]:
# have a closer look at the relation
# 'Women Entrepreneurship Index' vs 'Entrepreneurship Index':
# colour encodes the level of development, bubble size the
# female labor force participation rate
plt.rcParams['figure.figsize'] = (8, 8)
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (only `data` may be positional), keywords work in all versions.
sns.scatterplot(data=df,
                x='Entrepreneurship Index',
                y='Women Entrepreneurship Index',
                hue='Level of development',
                size='Female Labor Force Participation Rate',
                sizes=(1, 150),
                alpha=0.6)
plt.title('Women Entrepreneurship Index vs Entrepreneurship Index')
plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)  # move legend out of the box
plt.grid()
plt.show()

# Analytics by country

## Let's plot all the figures by country

In [None]:
# new column: Women Entrepreneurship Index minus (general) Entrepreneurship Index
df['Delta Index'] = df['Women Entrepreneurship Index'].sub(df['Entrepreneurship Index'])

In [None]:
# horizontal bar chart per variable: one bar per country,
# coloured by level of development
var_list = [
    'Women Entrepreneurship Index',
    'Entrepreneurship Index',
    'Delta Index',
    'Female Labor Force Participation Rate',
    'Inflation rate',
]
plt.rcParams['figure.figsize'] = (8, 10)  # same tall format for every chart
for metric in var_list:
    sns.barplot(data=df, x=metric, y='Country', hue='Level of development')
    plt.title(f'{metric} by Country')
    plt.grid()
    plt.show()

### Let's have a closer look at the "Delta Index". Remember, we measure the difference between the Women Index and the General Index here.

In [None]:
# Delta Index per country in ascending order, coloured by level of development
v = 'Delta Index'
# keep only the columns needed for the plot and order the rows by the index
df_aux = (
    df.loc[:, ['Country', 'Level of development', v]]
      .sort_values(by=v)
      .reset_index(drop=True)
)
plt.rcParams['figure.figsize'] = (8, 10)
sns.barplot(data=df_aux, x=v, y='Country', hue='Level of development')
plt.title(f'{v} by Country [sorted]')
plt.grid()
plt.show()

# "Developed" vs "Developing"

### One can clearly see significant differences between 'Developed' and 'Developing' countries in some of the barplots above. Exceptions are the Delta Index and maybe also the Female Labor Force Participation Rate. Let's have a closer look into the numbers.

In [None]:
# mean / min / median / max of the Women Entrepreneurship Index per level of development
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_WEntInd = df.groupby('Level of development', as_index=False).agg(
    mean_Women_Entr_Ind=('Women Entrepreneurship Index', 'mean'),
    min_Women_Entr_Ind=('Women Entrepreneurship Index', 'min'),
    median_Women_Entr_Ind=('Women Entrepreneurship Index', 'median'),
    max_Women_Entr_Ind=('Women Entrepreneurship Index', 'max'),
)
df_stats_WEntInd

In [None]:
# mean / min / median / max of the Entrepreneurship Index per level of development
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_EntInd = df.groupby('Level of development', as_index=False).agg(
    mean_Entr_Ind=('Entrepreneurship Index', 'mean'),
    min_Entr_Ind=('Entrepreneurship Index', 'min'),
    median_Entr_Ind=('Entrepreneurship Index', 'median'),
    max_Entr_Ind=('Entrepreneurship Index', 'max'),
)
df_stats_EntInd

In [None]:
# mean / min / median / max of the Delta Index per level of development
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_DeltaInd = df.groupby('Level of development', as_index=False).agg(
    mean_DeltaInd=('Delta Index', 'mean'),
    min_DeltaInd=('Delta Index', 'min'),
    median_DeltaInd=('Delta Index', 'median'),
    max_DeltaInd=('Delta Index', 'max'),
)
df_stats_DeltaInd

Interestingly, there does not seem to be much difference in the Delta Index between "Developed" and "Developing"...

In [None]:
# mean / min / median / max of the Female Labor Force Participation Rate per level of development
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_FemLabF = df.groupby('Level of development', as_index=False).agg(
    mean_FemLabForcePart=('Female Labor Force Participation Rate', 'mean'),
    min_FemLabForcePart=('Female Labor Force Participation Rate', 'min'),
    median_FemLabForcePart=('Female Labor Force Participation Rate', 'median'),
    max_FemLabForcePart=('Female Labor Force Participation Rate', 'max'),
)
df_stats_FemLabF

In [None]:
# mean / min / median / max of the Inflation rate per level of development
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_Infl = df.groupby('Level of development', as_index=False).agg(
    mean_Inflation=('Inflation rate', 'mean'),
    min_Inflation=('Inflation rate', 'min'),
    median_Inflation=('Inflation rate', 'median'),
    max_Inflation=('Inflation rate', 'max'),
)
df_stats_Infl

### Let's visualize

In [None]:
# violin plots: distribution of each variable for "Developed" vs "Developing"
var_list = [
    'Women Entrepreneurship Index',
    'Entrepreneurship Index',
    'Female Labor Force Participation Rate',
    'Delta Index',
    'Inflation rate',
]
plt.rcParams['figure.figsize'] = (6, 4)  # same format for every plot
for metric in var_list:
    sns.violinplot(data=df, x='Level of development', y=metric)
    plt.title(metric)
    plt.ylabel(metric)
    plt.grid()
    plt.show()

# EU members vs non-members

### Let's once more look at the plot of the Delta Index by country. This time split by EU/non-EU instead of "Developed"/"Developing".

In [None]:
# Delta Index per country in ascending order, coloured by EU membership
v = 'Delta Index'
# keep only the columns needed for the plot and order the rows by the index
df_aux = (
    df.loc[:, ['Country', 'European Union Membership', v]]
      .sort_values(by=v)
      .reset_index(drop=True)
)
plt.rcParams['figure.figsize'] = (8, 10)
sns.barplot(data=df_aux, x=v, y='Country', hue='European Union Membership')
plt.title(f'{v} by Country [sorted]')
plt.grid()
plt.show()

### Simply comparing EU and non-EU countries is somewhat problematic: All EU countries are "developed" whereas the non-EU countries are a mix of "developed" and "developing" countries. So we first introduce a more granular split:

In [None]:
# combined category "<level of development>_<EU membership>"
df['CountryCat'] = df['Level of development'].str.cat(
    df['European Union Membership'], sep='_')

# bar chart of how often each combined category occurs
plt.rcParams['figure.figsize'] = (6, 4)
category_counts = df['CountryCat'].value_counts()
category_counts.plot.bar()
plt.title('Country Category')
plt.grid()
plt.show()

In [None]:
# violin plots: distribution of each variable per combined country category
var_list = [
    'Women Entrepreneurship Index',
    'Entrepreneurship Index',
    'Female Labor Force Participation Rate',
    'Delta Index',
    'Inflation rate',
]
plt.rcParams['figure.figsize'] = (8, 4)  # same format for every plot
for metric in var_list:
    sns.violinplot(data=df, x='CountryCat', y=metric)
    plt.title(metric)
    plt.ylabel(metric)
    plt.grid()
    plt.show()

### The difference in the Delta Index between both "developed" categories is somewhat surprising...

### Let's also check the figures again:

In [None]:
# mean / min / median / max of the Women Entrepreneurship Index per country category
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_WEntInd = df.groupby('CountryCat', as_index=False).agg(
    mean_Women_Entr_Ind=('Women Entrepreneurship Index', 'mean'),
    min_Women_Entr_Ind=('Women Entrepreneurship Index', 'min'),
    median_Women_Entr_Ind=('Women Entrepreneurship Index', 'median'),
    max_Women_Entr_Ind=('Women Entrepreneurship Index', 'max'),
)
df_stats_WEntInd

In [None]:
# mean / min / median / max of the Entrepreneurship Index per country category
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_EntInd = df.groupby('CountryCat', as_index=False).agg(
    mean_Entr_Ind=('Entrepreneurship Index', 'mean'),
    min_Entr_Ind=('Entrepreneurship Index', 'min'),
    median_Entr_Ind=('Entrepreneurship Index', 'median'),
    max_Entr_Ind=('Entrepreneurship Index', 'max'),
)
df_stats_EntInd

In [None]:
# mean / min / median / max of the Delta Index per country category
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_DeltaInd = df.groupby('CountryCat', as_index=False).agg(
    mean_DeltaInd=('Delta Index', 'mean'),
    min_DeltaInd=('Delta Index', 'min'),
    median_DeltaInd=('Delta Index', 'median'),
    max_DeltaInd=('Delta Index', 'max'),
)
df_stats_DeltaInd

In [None]:
# mean / min / median / max of the Female Labor Force Participation Rate per country category
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_FemLabF = df.groupby('CountryCat', as_index=False).agg(
    mean_FemLabForcePart=('Female Labor Force Participation Rate', 'mean'),
    min_FemLabForcePart=('Female Labor Force Participation Rate', 'min'),
    median_FemLabForcePart=('Female Labor Force Participation Rate', 'median'),
    max_FemLabForcePart=('Female Labor Force Participation Rate', 'max'),
)
df_stats_FemLabF

In [None]:
# mean / min / median / max of the Inflation rate per country category
# Aggregators are given by name: passing np.mean & co. to groupby.agg raises a
# FutureWarning in recent pandas, the strings select the same reductions.
df_stats_Infl = df.groupby('CountryCat', as_index=False).agg(
    mean_Inflation=('Inflation rate', 'mean'),
    min_Inflation=('Inflation rate', 'min'),
    median_Inflation=('Inflation rate', 'median'),
    max_Inflation=('Inflation rate', 'max'),
)
df_stats_Infl

### Once more our Delta Index plot by country and the bubble plot comparing the indices; this time split into the three country categories.

In [None]:
# Delta Index per country in ascending order, coloured by combined country category
v = 'Delta Index'
# keep only the columns needed for the plot and order the rows by the index
df_aux = (
    df.loc[:, ['Country', 'CountryCat', v]]
      .sort_values(by=v)
      .reset_index(drop=True)
)
plt.rcParams['figure.figsize'] = (8, 10)
sns.barplot(data=df_aux, x=v, y='Country', hue='CountryCat')
plt.title(f'{v} by Country [sorted]')
plt.grid()
plt.show()

In [None]:
# have a closer look at the relation
# 'Women Entrepreneurship Index' vs 'Entrepreneurship Index':
# colour encodes the combined country category, bubble size the
# female labor force participation rate
plt.rcParams['figure.figsize'] = (8, 8)
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (only `data` may be positional), keywords work in all versions.
sns.scatterplot(data=df,
                x='Entrepreneurship Index',
                y='Women Entrepreneurship Index',
                hue='CountryCat',
                size='Female Labor Force Participation Rate',
                sizes=(1, 150),
                alpha=0.6)
plt.title('Women Entrepreneurship Index vs Entrepreneurship Index')
plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)  # move legend out of the box
plt.grid()
plt.show()

### Finally let's add an interactive version of this plot

In [None]:
# interactive bubble chart of the two indices:
# colour = CountryCat, bubble size = Female Labor Force Participation Rate;
# hovering over a point additionally shows Country and Inflation rate
fig = px.scatter(
    data_frame=df,
    x='Entrepreneurship Index',
    y='Women Entrepreneurship Index',
    color='CountryCat',
    size='Female Labor Force Participation Rate',
    hover_data=['Country', 'Inflation rate'],
    opacity=0.5,
    title='Women Entrepreneurship Index vs Entrepreneurship Index',
    width=850,
    height=650,
)
fig.show()