[![women-entrepreneurs.jpg](https://i.postimg.cc/FRpCb4Dz/women-entrepreneurs.jpg)](https://postimg.cc/8JFbDq8g)

Hi everyone, and thank you for viewing this notebook. The Women Entrepreneurship Index measures the development of high-potential
female entrepreneurship worldwide. This notebook's objective is to provide an exploratory data analysis and non-parametric tests,
using plotly and seaborn for visualization and the scipy.stats library for the non-parametric tests.

>Also check my other notebooks
Supermarket Business Analysis:<br>
https://www.kaggle.com/jaepin/supermarket-business-analysis-and-visualization


<b>Please do upvote</b> 🏹

<h3 style="color: white; background-color: #A33834;">
Libraries
<br>
</h3>

In [None]:
# Importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from scipy import stats
import plotly.figure_factory as ff
import os

# Load the semicolon-separated dataset.
# The 'No' column becomes the index and rows are ordered by it; headers are
# normalised to snake_case (spaces -> underscores, lower-case) and the
# development level is stored as a categorical column.

filepath = '../input/women-entrepreneurship-and-labor-force/Dataset3.csv'
women = (
    pd.read_csv(filepath, sep = ';', index_col = 'No')
    .sort_index()
    .rename(columns = lambda header: header.replace(' ', '_').lower())
    .astype({'level_of_development': 'category'})
)


# Shared bar colours (hex strings); the charts below slice this list

palette = ['#DB4627', '#EB706C', '#F58B7F', '#F58867', '#EB9F7F']

In [None]:
# Functions 

def style(table):
    """
    Return `table.style` with a 'coolwarm' background gradient applied.
    """
    return table.style.background_gradient(cmap = 'coolwarm')

def percentage(s):
    """
    Convert a numeric series to percentage-of-total strings.

    Each value is divided by the sum of the whole series, multiplied by 100,
    rounded to 2 decimals and suffixed with '%'. The index of `s` is kept.

    Parameters
    ----------
    s : pandas.Series
        Numeric values (for example the result of `value_counts()`).

    Returns
    -------
    pandas.Series
        Strings such as '25.0%', one per element of `s`.
    """
    # Sum once up front: the total is the same for every element, so there is
    # no reason to recompute it inside the per-element function.
    total = s.sum()
    return s.apply(lambda value: str(round(value / total * 100, 2)) + '%')

def query_this(col, look):
    """
    Return the rows of the global `women` DataFrame where `col` equals `look`.

    Parameters
    ----------
    col : str
        Column name (it is placed verbatim on the left side of the query).
    look : object
        Value to match.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `women`.
    """
    # `@look` makes query() read the value from this function's local
    # variable instead of splicing its text between double quotes, so values
    # containing quote characters cannot break the expression.
    return women.query('{} == @look'.format(col))


<h3 style="color: white; background-color: #A33834;">
The DataFrame
<br>
</h3>

In [None]:
# Show the whole dataset with the 'coolwarm' background gradient from style()
style(women)

In [None]:
# Show the describe() summary table with the same gradient styling
style(women.describe())

In [None]:
# Number of distinct values per column
print(women.nunique())
print('')
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
women.info()

<h3 style="color: white; background-color: #A33834;">
Visualization
<br>
</h3>

In this part I present several figures; their interpretation is left to you.

In [None]:
# Create a subplot: three stacked rows, one bar chart per categorical column
fig = make_subplots(rows = 3, cols = 1, specs = [[{'type':'xy'}],[{'type':'xy'}], [{'type':'xy'}]])

# Add Bar traces
# Each entry is (column, trace name, bar colours); the row number follows the list order.

bar_specs = [
    ('european_union_membership', 'EU Membership', palette),
    ('level_of_development', 'Level of Development', palette[1:]),
    ('currency', 'Currency', palette[2:]),
]

for row, (column, trace_name, bar_colors) in enumerate(bar_specs, start = 1):
    # Labels and heights both come from the SAME value_counts() result.
    # unique() lists values in order of first appearance while value_counts()
    # sorts by frequency, so mixing the two can attach a count to the wrong label.
    counts = women[column].value_counts()
    fig.add_trace(go.Bar(
        x = counts.index.tolist(),
        y = counts.values,
        name = trace_name,
        text = percentage(counts),
        marker_color = bar_colors
    ), row = row, col = 1)

#Update Traces and  Layout

fig.update_traces(textposition = 'inside')
fig.update_layout(autosize=False, height = 800, template = 'plotly_white', title = '<b>Membership</b> - <b> Development </b>  - <b> Currency </b>')
fig.show()

In [None]:
# Treemap with one tile per country; tile size and colour both follow
# the women entrepreneurship index
fig = px.treemap(
    data_frame = women,
    path = ['country'],
    values = 'women_entrepreneurship_index',
    color = 'women_entrepreneurship_index',
    color_continuous_scale = 'peach',
)

# set the width and title, then display
fig.update_layout(title = '<b>Women Entrepreneurship Index</b>', width = 900)
fig.show()

In [None]:
# Treemap with one tile per country; tile size and colour both follow
# the entrepreneurship index
fig = px.treemap(
    data_frame = women,
    path = ['country'],
    values = 'entrepreneurship_index',
    color = 'entrepreneurship_index',
    color_continuous_scale = 'peach',
)

# set the width and title, then display
fig.update_layout(title = '<b>Entrepreneurship Index</b>', width = 900)
fig.show()

In [None]:
# prepare DataFrame
# One row per country holding both indices, ordered by the overall entrepreneurship index.

ent_index = women[['country','women_entrepreneurship_index', 'entrepreneurship_index']]
# Use the groupby .sum() method rather than agg(<builtin sum>): recent pandas
# warns that passing the builtin will change meaning in a future version.
ent_index = ent_index.groupby(by = 'country').sum()
# After the groupby only the two index columns remain, so no re-selection is needed.
ent_group_index = ent_index.sort_values(by = 'entrepreneurship_index')

# create a subplot

fig = make_subplots(specs= [[{}]], shared_yaxes = True)

# add first trace
fig.add_trace(go.Bar(
    x = ent_group_index.entrepreneurship_index.values,
    y = ent_group_index.index,
    marker = dict(color = 'gray',
                 line=dict(color = 'black', width=1)),
    name = 'Entrep Index',
))

# add secondary trace
fig.add_trace(go.Bar(
    x = ent_group_index.women_entrepreneurship_index.values,
    y = ent_group_index.index,
    marker = dict(color = 'tomato',
                 line=dict(color = 'black', width=1)),
    name = 'Women Entrep Index'
))

# update traces and layout
# horizontal bars: countries on the y axis, index values on the x axis
fig.update_traces(orientation = 'h')
fig.update_layout(height = 1600, title = '<b>Women Entrepreneurship  Index </b> and <b> Entrepreneurship Index</b> per country', yaxis = dict(showgrid = False), xaxis = dict(showgrid = False))

fig.show()

In [None]:
# Prepare DataFrame
# One row per country, ordered by female labor force participation rate.

female_participation = women[['country','female_labor_force_participation_rate', 'inflation_rate']]
# Use the groupby .sum() method rather than agg(<builtin sum>): recent pandas
# warns that passing the builtin will change meaning in a future version.
fp_group = female_participation.groupby(by = 'country').sum()
# After the groupby only the two rate columns remain, so no re-selection is needed.
fp_group_force = fp_group.sort_values(by = 'female_labor_force_participation_rate')
# NOTE(review): percentage() turns each country's rate into its share of the
# sum over all countries, so the bar labels are not the rates themselves -
# confirm this is the intended label.
text_participation = percentage(fp_group_force['female_labor_force_participation_rate'])

# Figure

fig = go.Figure()

fig.add_trace(go.Bar(
    x = fp_group_force.female_labor_force_participation_rate.values,
    y = fp_group_force.index,
    marker = dict(color = 'tomato',
                 line=dict(color = 'black', width=1)),
    text = text_participation,
    # spelling corrected (was 'Particpation')
    name = 'Female Labor Participation'
))


# horizontal bars with the text labels placed outside the bars
fig.update_traces(orientation = 'h', textposition = 'outside')
fig.update_layout(height = 1000, title = '<b>Female labor force participation rate</b>', yaxis = dict(showgrid = False), xaxis = dict(showgrid = False))

fig.show()

In [None]:
# Horizontal bars per country: female labor force participation next to inflation.
# make_subplots is given a single default cell (specs = [[{}]]) with shared_yaxes on.

fig = make_subplots(shared_yaxes = True, specs = [[{}]])

# first trace: participation rate, labelled with the precomputed percentage strings

fig.add_trace(
    go.Bar(
        name = 'Female Labor Participation',
        x = fp_group_force['female_labor_force_participation_rate'].values,
        y = fp_group_force.index,
        text = text_participation,
        marker = dict(
            color = 'lightpink',
            line = dict(color = 'black', width = 1),
        ),
    )
)

# second trace: inflation rate, labelled with its own raw values

fig.add_trace(
    go.Bar(
        name = 'Inflation',
        x = fp_group_force['inflation_rate'].values,
        y = fp_group_force.index,
        text = fp_group_force['inflation_rate'].values,
        marker = dict(
            color = 'red',
            line = dict(color = 'black', width = 1),
        ),
    )
)

# make both traces horizontal with outside labels, then size and title the figure
fig.update_traces(textposition = 'outside', orientation = 'h')
fig.update_layout(
    height = 1600,
    title = '<b>Female labor force participation rate</b>: Inflation Rate',
    xaxis = dict(showgrid = False),
    yaxis = dict(showgrid = False),
)

# display
fig.show()

<h3 style="color: white; background-color: #A33834;">
Visualization: Distribution Plots
<br>
</h3>

In [None]:
# Checking numerical columns: list the names of the columns with a numeric dtype

women.select_dtypes('number').columns

In [None]:
# Split the rows by development level and keep only the numeric columns

developed_q = query_this('level_of_development', 'Developed').select_dtypes('number')
developing_q = query_this('level_of_development', 'Developing').select_dtypes('number')

# Two-colour scheme (hex strings)

colors = ['#F5504C', '#619CF5']

In [None]:
# Prepare dataframe
# Women entrepreneurship index values of each development group

val1 = developed_q.women_entrepreneurship_index.values
val2 = developing_q.women_entrepreneurship_index.values
hist_data = [val2, val1]
# Labels are spelled out in the same order as hist_data (developing first,
# then developed). Taking them from unique() would follow the order of first
# appearance in the data, which can swap the legend against the curves.
group_labels = ['Developing', 'Developed']


fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=2, show_rug=True)

fig.update_layout(autosize = False, title = '<b>Women Entrepreneurship Index Values</b>')
fig.show()

In [None]:
# Entrepreneurship index values of each development group
val1 = developed_q.entrepreneurship_index.values
val2 = developing_q.entrepreneurship_index.values
hist_data = [val2, val1]
# Explicit labels in the same order as hist_data (developing first, then
# developed), so the legend cannot be swapped against the curves.
group_labels = ['Developing', 'Developed']

fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size = 5, show_rug=True)

fig.update_layout(autosize = False, title = '<b>Entrepreneurship Index Values</b>')
fig.show()

In [None]:
# Inflation rate values of each development group
val1 = developed_q.inflation_rate.values
val2 = developing_q.inflation_rate.values
hist_data = [val2, val1]
# Explicit labels in the same order as hist_data (developing first, then
# developed), so the legend cannot be swapped against the curves.
group_labels = ['Developing', 'Developed']

fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size = 2, show_rug=True)

fig.update_layout(autosize = False, title = '<b>Inflation Rate</b>')
fig.show()

In [None]:
# Female labor force participation rate values of each development group
val1 = developed_q.female_labor_force_participation_rate.values
val2 = developing_q.female_labor_force_participation_rate.values
hist_data = [val2, val1]
# Explicit labels in the same order as hist_data (developing first, then
# developed), so the legend cannot be swapped against the curves.
group_labels = ['Developing', 'Developed']

fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size = 10, show_rug=True)

fig.update_layout(autosize = False, title = '<b>Female Labor Force</b> <br>Participation Rate')
fig.show()

In [None]:
# Scatter Plot Matrix

# prepare dataframe: the numeric columns plus the categorical development level.
# The marker colour is given as the integer category codes of that column.

women_n = pd.concat(
    objs = [women.select_dtypes('number'), women['level_of_development']],
    axis = 'columns',
)

# per-point text: anything that is not 'Developing' is labelled 'Developed'
textd = [
    'Developed' if level != 'Developing' else 'Developing'
    for level in women_n['level_of_development']
]

# (axis label, source column) for each dimension of the matrix
splom_axes = [
    ('Women Ent Index', 'women_entrepreneurship_index'),
    ('Ent Index', 'entrepreneurship_index'),
    ('Inflation Rate', 'inflation_rate'),
    ('Female Partic.', 'female_labor_force_participation_rate'),
]

# add figure
fig = go.Figure(
    data = go.Splom(
        dimensions = [
            dict(label = axis_label, values = women_n[column])
            for axis_label, column in splom_axes
        ],
        marker = dict(
            color = women_n['level_of_development'].cat.codes,
            size = 7,
            colorscale = 'pinkyl',
            line = dict(width = 0.5, color = 'black'),
        ),
        text = textd,
        diagonal = dict(visible = False),
    )
)

# update and show figure

title = "<b> Scatter Matrix </b> Developing and Developed Countries "
fig.update_layout(
    title = title,
    dragmode = 'select',
    width = 600,
    height = 600,
    hovermode = 'closest',
)

fig.show()

<h3 style="color: white; background-color: #A33834;">
Non-Parametric Tests ( Chi2, Cramer's V Test, Spearman Correlation )
<br>
</h3>

<b>H0:</b> There is no significant relationship between a country's level of development and its EU membership. <br>
<b>Ha:</b> There is a significant relationship between a country's level of development and its EU membership. 

[![chi2.jpg](https://i.postimg.cc/BvYMtg6P/chi2.jpg)](https://postimg.cc/rd4SP55q)

We will use the chi-square (chi2) formula shown above.

In [None]:
# Contingency Table: development level as rows, EU membership as columns
development_members = pd.crosstab(
    index = women['level_of_development'],
    columns = women['european_union_membership'],
)
style(development_members)

In [None]:
# Chi2
# The result unpacks to (statistic, p-value, degrees of freedom, expected
# frequencies); the expected table is not used here.
# The whole contingency table is passed, so every row is included in the test.
# NOTE(review): per the scipy documentation a continuity correction is applied
# by default when dof == 1 - confirm that is wanted for this 2x2 table.
(chi2, p, dof,_) = stats.chi2_contingency(development_members.to_numpy())
values = [chi2, p, dof]

# The column label is given as a list: a set is unordered and is not accepted
# as column labels by recent pandas versions.
pd.DataFrame(values, index = ['chi2', 'p', 'dof'], columns = ['values'])

[![cramers-v.png](https://i.postimg.cc/C1RJ8qBP/cramers-v.png)](https://postimg.cc/PpkmsNMY)

Cramér's V indicates the strength of the association between two categorical variables.
It ranges from 0 to 1: the closer to 1, the stronger the association.

In [None]:
# Cramér's V = sqrt(chi2 / (n * min(rows - 1, columns - 1)))
# n is the number of observations counted in the contingency table.
# For a 2x2 table the min(...) factor is 1, so this reduces to sqrt(chi2 / n).
n_observations = development_members.to_numpy().sum()
smaller_dim = min(development_members.shape) - 1
cramer_v_test = np.sqrt(chi2 / (n_observations * smaller_dim))
cramer_v_test

<b> Spearman Correlation </b>
---

Spearman correlation measures the degree of association between two variables. It evaluates the monotonic relationship
between two continuous or ordinal variables.

In [None]:
# Spearman correlation matrix of the numeric columns, drawn as an annotated heatmap
plt.figure(figsize = (10, 8))
women_n_heatmap = (
    women_n
    .drop(columns = 'level_of_development')
    .reset_index(drop = True)  # discard the 'No' index instead of turning it into a column
    .corr(method = 'spearman')
)
sns.heatmap(data = women_n_heatmap, annot = True, cmap = 'pink')

In [None]:
# Spearman correlation between the two entrepreneurship indices, shown as a styled table
style(women[['entrepreneurship_index', 'women_entrepreneurship_index']].corr('spearman'))

<h3 style="color: white; background-color: #A33834;">
Results
<br>
</h3>

In [None]:
# Report the test statistics computed in the cells above, one per line
print(
    f'chi2 : {chi2}',
    f'cramer_v_test : {cramer_v_test}',
    f'p-value  : {p}',
    f'degrees of freedom : {dof}',
    sep = '\n',
)

The results above tell us that there is a strong association between the two variables (chi2 = 26.2223), 
with a very strong Cramér's V of <b>0.7170</b>.

Thus we reject our null hypothesis: <br>
<b>H0:</b> There is no significant relationship between a country's level of development and its EU membership. <br>

And accept our alternative hypothesis: <br>
<b>Ha:</b> There is a significant relationship between a country's level of development and its EU membership. 

The <b>Entrepreneurship Index and Women Entrepreneurship Index</b> show a high Spearman correlation of <b> 0.9077 </b>

<h3 style="color: white; background-color: #A33834;">
END OF NOTEBOOK: If you've made it this far, thank you and please leave an upvote. 
<br>
</h3>