In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import pandas as pd
import re

# Survey question codes to load, mapped to the readable column names used below.
SURVEY_COLUMNS = {
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'role',
    'Q25': 'salary',
    'Q6': 'coding_experience',
}

df = (
    pd.read_csv(
        '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
        usecols=list(SURVEY_COLUMNS),
    )
    # Skip the first data row (presumably the question text rather than a
    # response -- TODO confirm against the CSV).
    .iloc[1:, :]
    .rename(columns=SURVEY_COLUMNS)
)

## Percentages of men vs. other genders by country
Looking at the share of men among respondents in each country, we can see a wide range, from 61% to 95%.  
No single country has more women than men.  
To me, at least, it is striking that some countries known for great gender inequality, such as Saudi Arabia and Iran, have a comparatively balanced ratio of men to other genders.

In [None]:
# Constant helper column: counting it per group yields the number of respondents.
df['n'] = 1

# Respondent counts per (gender, country) pair, in long form.
gender_country_counts = (
    df[['gender', 'country', 'n']]
    .groupby(['gender', 'country'], as_index=False)
    .count()
)

# One row per country, one column per gender; absent combinations count as 0.
# 'totals' sums the gender columns; 'men_per' is the share of men as a
# fraction of that total (not multiplied by 100).
gender_by_country = (
    gender_country_counts
    .pivot(index='country', columns='gender', values='n')
    .fillna(0)
    .assign(
        totals=lambda counts: counts.sum(axis=1),
        men_per=lambda counts: counts['Man'] / counts['totals'],
    )
)

In [None]:
# Ten countries with the lowest share of men (ascending sort, first rows).
gender_by_country.sort_values(by='men_per').head(n=10)

In [None]:
# Ten countries with the highest share of men (ascending sort, so the largest value is last).
gender_by_country.sort_values(by='men_per').tail(n=10)

In [None]:
# All countries, ordered from most to fewest respondents.
gender_by_country.sort_values(by='totals', ascending=False)

In [None]:
# Look at a single country; a boolean mask yields an empty frame instead of
# an error when the label is absent.
is_argentina = gender_by_country.index == 'Argentina'
gender_by_country.loc[is_argentina]

## Salaries by gender
Salaries were reported as ranges; to normalize them we took the upper end of each range.  
We normalized coding experience in the same way.  
Comparing salaries at each level of coding experience, the median salary of men is higher than that of women (except at the entry level, with 0 years of coding experience).
The inequality is greatest for the junior and semi-senior positions, where the median salary of men is 50% and 60% higher than that of women.
In semi-senior and senior positions there are still inequalities of around 33%.

In [None]:
#Normalize salary values
def _upper_salary_bound(raw):
    """Return the digits of the last '-'-separated part of a salary label.

    A value whose string form is 'nan' (a missing answer) is returned
    unchanged so that it stays missing after the float conversion.
    """
    label = str(raw)
    if label == 'nan':
        return raw
    upper_part = label.split('-')[-1]
    # Drop currency symbols, thousands separators and any other non-digit.
    return re.sub("[^0-9]", "", upper_part)


df['salary_normalized_max'] = df['salary'].apply(_upper_salary_bound).astype(float)

In [None]:
#Normalize coding experience
def _upper_experience_bound(raw):
    """Return the digits of the last '-'-separated part of an experience label.

    Labels that contain no digit at all map to 0. Missing answers are
    returned unchanged: stringifying them would produce 'nan', which has no
    digits and would be counted as 0 years of experience instead of staying
    missing.
    """
    if pd.isna(raw):
        return raw
    digits = re.sub("[^0-9]", "", str(raw).split('-')[-1])
    return digits if digits else 0


# NOTE(review): an open-ended label (e.g. "20+") ends up with the same value as
# a range that ends at that number (e.g. "10-20") -- confirm whether the survey
# has such labels and whether merging them is intended.
df['coding_experience_normalized_max'] = (
    df['coding_experience'].apply(_upper_experience_bound).astype(float)
)

In [None]:
# Median of the normalized (upper-bound) salary for each gender.
df.groupby('gender')[['salary_normalized_max']].median()

In [None]:
# Median of the normalized (upper-bound) coding experience for each gender.
df.groupby('gender')[['coding_experience_normalized_max']].median()

In [None]:
# Median normalized salary for every (gender, coding experience) pair, long form.
median_salary_long = (
    df[['gender', 'salary_normalized_max', 'coding_experience_normalized_max']]
    .groupby(['gender', 'coding_experience_normalized_max'], as_index=False)
    .median()
)

# Keep only the 'Man' and 'Woman' rows, then reshape to one row per gender
# and one column per coding-experience level.
men_and_women = median_salary_long[median_salary_long['gender'].isin(['Man', 'Woman'])]
salary_by_experience = men_and_women.pivot(
    index='gender',
    columns='coding_experience_normalized_max',
    values='salary_normalized_max',
)
salary_by_experience

In [None]:
# Relative gap between the median salaries of men and women at each
# coding-experience level, expressed as a fraction of the men's median:
# (men - women) / men.
# Rows are selected by their 'Man' / 'Woman' labels: integer keys such as x[0]
# on a string-labelled Series rely on a deprecated positional fallback.
men_median = salary_by_experience.loc['Man']
women_median = salary_by_experience.loc['Woman']
((men_median - women_median) / men_median).to_frame('relative_gap')

## Distribution of roles by gender

In [None]:
# Number of respondents per role for men and women. A role/gender combination
# with no respondents is filled with 0 (rather than left missing) so that the
# proportions below are defined for every role.
ct = (
    df.groupby(['role', 'gender'])
    .size()
    .unstack('gender', fill_value=0)
    .loc[:, ['Man', 'Woman']]
    .copy()
)

# Share of each role within its own gender, so the two columns are comparable
# even though the group sizes differ.
ct['men_prop'] = ct['Man'] / ct['Man'].sum()
ct['women_prop'] = ct['Woman'] / ct['Woman'].sum()

# Positive: the role takes a larger share among women than among men.
ct['more_w_than_m'] = ct['women_prop'] - ct['men_prop']

In [None]:
# Ascending order: roles at the top are the ones with the most negative
# 'more_w_than_m', roles at the bottom the most positive.
ct.sort_values(by='more_w_than_m')
# This looks promising: more women studying means more professionals in the future!

In [None]:
from scipy.stats import chisquare
# Test some hypotheses and proportions.
# Transpose the counts so that each gender is a row and each role a column
# (chisquare itself is not called within this cell).
ctt = ct.loc[:, ['Man', 'Woman']].transpose()
ctt

## Salaries by role and gender