In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats

In [None]:
# Load the dataset; the file is semicolon-delimited, hence sep=';'.
df = pd.read_csv("/kaggle/input/women-entrepreneurship-and-labor-force/Dataset3.csv", sep=';')
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Dimensions of the raw table as a (rows, columns) tuple.
df.shape

```
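No                                    : Country ID
Country                               : Country name (Country ID <--> Country name, one-to-one)
Level of development                  : "Developed" or "Developing"
European Union Membership             : "Member" or "Not Member"
Currency                              : "Euro" or "National Currency"
Women Entrepreneurship Index          : numeric
Entrepreneurship Index                : numeric
Inflation rate                        : numeric
Female Labor Force Participation Rate : numeric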
```

In [None]:
# Column names, non-null counts and dtypes of the raw table.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Distinct country names present in the data.
df['Country'].unique()

In [None]:
# Distinct labels of the 'Level of development' column.
df['Level of development'].unique()

In [None]:
# Distinct labels of the 'European Union Membership' column.
df['European Union Membership'].unique()

In [None]:
# Distinct labels of the 'Currency' column.
df['Currency'].unique()

In [None]:
# Ten most frequent country names with their row counts; any count above 1
# would reveal a duplicated country.
# NOTE(review): value_counts() already sorts in descending order by default, so
# the sort_values() step looks redundant. It is kept because removing it could
# change the order in which tied counts are listed.
df['Country'].value_counts().sort_values(ascending=False).nlargest(10)

In [None]:
def plot_bar(feature, df, color='ocean_r'):
    """Show an annotated bar chart of how often each value of a column occurs.

    Parameters
    ----------
    feature : column label in ``df`` whose values are counted.
    df : DataFrame holding the column.
    color : name passed to ``sns.color_palette`` to colour the bars.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    palette = sns.color_palette(color)
    counts = df[feature].value_counts()
    counts.plot.bar(ax=ax, width=0.8, color=palette)

    # Write each bar's count just above it (offsets are in data coordinates).
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(format(height), (bar.get_x() + 0.15, height + 1))

    ax.set_xlabel(str(feature))
    ax.set_ylabel("count")
    plt.show()

In [None]:
# Draw one count chart per categorical column, each with its own palette.
for column, palette in (
    ('European Union Membership', 'ocean_r'),
    ('Level of development', 'bright'),
    ('Currency', 'YlOrBr'),
):
    plot_bar(feature=column, df=df, color=palette)



In [None]:
# Integer codes for the 'Level of development' labels.
# NOTE(review): this dict is not referenced anywhere else in the visible notebook;
# the encoding cell further down defines its own equivalent `dev_level` mapping.
devp_level_dict = {
    "Developed":0,
    "Developing":1
}

In [None]:
# Preview the first rows again before plotting.
df.head()

In [None]:
# Scatter-plot matrix of every pair of numeric columns, with each column's own
# distribution on the diagonal.
sns.pairplot(df)

In [None]:
# adapted from https://plot.ly/python/plotly-express/
# Interactive scatter plot of the women's index against the overall index.
import plotly.express as px
fig = px.scatter(df, x="Women Entrepreneurship Index", y="Entrepreneurship Index")
fig.show()

In [None]:
# Interactive box plot of 'Women Entrepreneurship Index' (quartiles and outliers).
fig = px.box(df, x="Women Entrepreneurship Index")
fig.show()

In [None]:

# Histogram of 'Women Entrepreneurship Index' with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — confirm the
# installed version still provides it.
sns.distplot(df['Women Entrepreneurship Index'], fit=norm)
# Open a second figure so the Q-Q plot is not drawn on top of the histogram.
fig = plt.figure()
# Normal probability (Q-Q) plot: sample quantiles against normal quantiles.
res = stats.probplot(df['Women Entrepreneurship Index'], plot=plt)

In [None]:
# df['Women Entrepreneurship Index'] = np.log(df['Women Entrepreneurship Index'])

In [None]:
# sns.distplot(df['Women Entrepreneurship Index'], fit=norm)
# fig = plt.figure()
# res = stats.probplot(df['Women Entrepreneurship Index'], plot=plt)

In [None]:
# Standardize 'Entrepreneurship Index' (z-score: subtract the mean, then divide by the standard deviation)

In [None]:
# df['Entrepreneurship Index'] = (df['Entrepreneurship Index'] - df['Entrepreneurship Index'].mean() ) / df['Entrepreneurship Index']
# df['Entrepreneurship Index'].mean()

In [None]:
# Summary statistics before 'Women Entrepreneurship Index' is standardized below.
df.describe()

In [None]:
# Z-score the women's index: subtract the column mean, then divide by the
# column's sample standard deviation.
df['Women Entrepreneurship Index'] = (
    df['Women Entrepreneurship Index']
    .sub(df['Women Entrepreneurship Index'].mean())
    .div(df['Women Entrepreneurship Index'].std())
)

In [None]:
# Summary statistics after standardizing 'Women Entrepreneurship Index'; that
# column should now show mean ~0 and std ~1.
df.describe()

In [None]:
# Histogram (with fitted normal curve) of the standardized women's index.
sns.distplot(df['Women Entrepreneurship Index'], fit=norm)
# Open a second figure so the Q-Q plot is not drawn on top of the histogram.
fig = plt.figure()
# Normal probability (Q-Q) plot of the standardized values.
res = stats.probplot(df['Women Entrepreneurship Index'], plot=plt)

> ##### 'Women Entrepreneurship Index' is already approximately normally distributed (see the Q-Q plot above); after z-scoring it has mean 0 and standard deviation 1

In [None]:
# Histogram of 'Entrepreneurship Index' with a fitted normal curve overlaid.
sns.distplot(df['Entrepreneurship Index'], fit=norm)

In [None]:
# Histogram of 'Inflation rate' with a fitted normal curve overlaid.
sns.distplot(df['Inflation rate'], fit=norm)


In [None]:
# Box plot of the raw inflation rate.
# The Series is passed explicitly as x=: seaborn < 0.12 treated the first
# positional argument as `x`, while seaborn >= 0.12 treats it as `data`, so the
# positional form changes orientation depending on the installed version.
sns.boxplot(x=df['Inflation rate'])

In [None]:
# Replace 'Inflation rate' with log(rate + 1) — presumably to compress the
# extreme values seen in the box plot above.
# NOTE(review): a rate of exactly -1 becomes -inf and a rate below -1 becomes NaN;
# a later cell replaces +/-inf with NaN. Re-running this cell applies the
# transform a second time because the column is overwritten.
df['Inflation rate'] = np.log(df['Inflation rate']+1)

In [None]:
# Box plot of the inflation rate after the log(x + 1) transform.
# The Series is passed explicitly as x=: seaborn < 0.12 treated the first
# positional argument as `x`, while seaborn >= 0.12 treats it as `data`, so the
# positional form changes orientation depending on the installed version.
sns.boxplot(x=df['Inflation rate'])

In [None]:
# Histogram of 'Female Labor Force Participation Rate' with a fitted normal curve.
sns.distplot(df['Female Labor Force Participation Rate'], fit=norm)


In [None]:
# Box plot of the female labor force participation rate.
# The Series is passed explicitly as x=: seaborn < 0.12 treated the first
# positional argument as `x`, while seaborn >= 0.12 treats it as `data`, so the
# positional form changes orientation depending on the installed version.
sns.boxplot(x=df['Female Labor Force Participation Rate'])

In [None]:
def remove_outliers(df, feature, data_status='low_skewed'):
    """Return the values of ``df[feature]`` that lie inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``k`` is 1.5
    when ``data_status == 'low_skewed'`` and 3.0 for any other value.

    Parameters
    ----------
    df : DataFrame holding the column.
    feature : label of the numeric column to filter.
    data_status : ``'low_skewed'`` for the 1.5 multiplier, anything else for 3.0.

    Returns
    -------
    list
        The column's values that are within the fences (fences included), in
        their original order. NaN entries are never included. ``df`` itself is
        not modified.

    Side effects
    ------------
    Prints the list of values that fall strictly outside the fences.
    """
    multiplier = 1.5 if data_status == 'low_skewed' else 3.0

    values = df[feature]
    # nanpercentile ignores NaN entries; plain np.percentile would return NaN
    # for both quartiles as soon as one value is missing, which made every
    # comparison below False and the result empty.
    q25, q75 = np.nanpercentile(values, [25, 75])
    cut_off = (q75 - q25) * multiplier
    lower_bound, upper_bound = q25 - cut_off, q75 + cut_off

    outliers = [x for x in values if x < lower_bound or x > upper_bound]
    print(outliers)

    # Inclusive comparison: a value sitting exactly on a fence is not reported
    # as an outlier above, so it must be kept here. With strict inequalities
    # such values vanished silently (and everything vanished when IQR == 0).
    return [x for x in values if lower_bound <= x <= upper_bound]

In [None]:
# Print the outliers of the participation rate and display the retained values.
# The returned list is not assigned, so df itself is left unchanged.
remove_outliers(df=df, feature='Female Labor Force Participation Rate')

In [None]:
# Print the outliers of the (log-transformed) inflation rate and display the
# retained values. The returned list is not assigned, so df is left unchanged.
remove_outliers(df=df, feature='Inflation rate')

In [None]:
# Preview the table after the numeric transforms and before encoding.
df.head()

### All countries are unique
1. Drop the 'Country' column
2. Label encode 'Level of development', 'European Union Membership', 'Currency'
3. Remove outliers from 'Female Labor Force Participation Rate'
4. Make new features

In [None]:
# Drop the 'Country' column (per the note above, every country is unique).
# df is rebound rather than mutated in place, and errors='ignore' turns a
# re-run of this cell into a no-op instead of a KeyError. `axis=1` was
# redundant alongside `columns=` and has been removed.
df = df.drop(columns=['Country'], errors='ignore')

In [None]:
# Integer codes for the three two-valued categorical columns.
dev_level = {"Developed": 0, "Developing": 1}
membership = {"Member": 0, "Not Member": 1}
currency = {"Euro": 0, "National Currency": 1}

# Series.map turns any label missing from its mapping into NaN, so re-running
# this cell on already-encoded columns would blank them out.
for column, codes in (
    ('Level of development', dev_level),
    ('European Union Membership', membership),
    ('Currency', currency),
):
    df[column] = df[column].map(codes)

In [None]:
# Re-inspect dtypes and non-null counts after encoding; an unmapped label would
# show up here as a reduced non-null count.
df.info()

In [None]:
import missingno as msno

In [None]:
# Visualize where missing values occur across rows and columns.
msno.matrix(df)

In [None]:
# The earlier log(x + 1) transform can yield +/-inf; treat those as missing.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection, which may act on a temporary copy. np.nan is used because
# the np.NaN alias no longer exists in NumPy 2.0.
df['Inflation rate'] = df['Inflation rate'].replace([np.inf, -np.inf], np.nan)

# Impute missing inflation rates with the column mean. The previous
# df.fillna(...) call discarded its result, so nothing was filled; had the
# result been kept, it would have filled NaNs in every column with the
# inflation mean.
df['Inflation rate'] = df['Inflation rate'].fillna(df['Inflation rate'].mean())
df.head()

In [None]:
# Pairwise correlation matrix of the columns (all numeric after the encoding above).
corr = df.corr()

# Heatmap of the matrix with each coefficient printed inside its cell.
sns.heatmap(corr, annot=True)