In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <center>The Gender Wage Gap</center>

According to Bureau of Labor Statistics data, in 2020 women made 82 cents for every dollar a man earned. Men earned more than women in nearly all occupations except healthcare and social services. The COVID-19 pandemic stalled the gains that had been made toward closing this gap. The lack of child care has forced many women out of the workforce entirely. In February 2021, women’s labor force participation rate was 55.8% – the same rate as in April 1987 (more than three decades earlier).

Let us explore the gender wage gap using the results of Kaggle's annual Machine Learning and Data Science Survey.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings 

warnings.filterwarnings('ignore')

In [None]:
# Load the survey export (~26,000 responses). low_memory=False makes pandas
# read the file in one pass instead of inferring dtypes chunk by chunk.
SURVEY_CSV = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
df = pd.read_csv(SURVEY_CSV, low_memory=False)

# Analyzing Data:
The dataset consists of 42+ questions and 25,973 responses. First, let's understand the dataset.

In [None]:
# Peek at the first five rows of the raw export.
df.head(n=5)

In [None]:
# Row 0 holds the question text rather than a response: remove it and
# renumber the remaining rows from 0.
# NOTE: this rebinds `df`, so running the cell a second time removes one more row.
df = df.drop(index=[0]).reset_index(drop=True)
df.head()

In [None]:
# set the plot defaults
# NOTE: the order of these calls matters. Per the seaborn docs, sns.set()
# applies the library's theme defaults (which include a style and a palette),
# so the colorblind palette is selected last to make sure it takes effect.
sns.set_style("darkgrid")
sns.set(font_scale = 1.2)
sns.set_palette("colorblind")

First let's check how many of the 25,973 participants were women.

In [None]:
# Count of respondents per gender (Q2), with the count written above each bar.
g = sns.catplot(x='Q2', data=df, kind='count')
g.set_xticklabels(rotation=60)
g.set(xlabel='Gender')
g.fig.suptitle("Gender of survey participants", x=0.5, y=1.02)
g.fig.set_figwidth(16)
g.fig.set_figheight(6)

for ax in g.axes.ravel():
    for bar in ax.patches:
        count = bar.get_height()
        # Bars without data have no usable height; there is nothing to label.
        if not np.isfinite(count):
            continue
        # Bar heights come back as floats; show them as whole numbers,
        # the same way the compensation plot labels its bars.
        ax.annotate(str(int(count)),
                    (bar.get_x() + bar.get_width() / 2., count + 0.95),
                    ha='center')
plt.show()

We see that more than 20,000 of the participants were men, fewer than 5,000 were women, fewer than 100 were nonbinary, and fewer than 50 preferred to self-describe. In other words, only about one woman responded for every four men. Since we are interested in the wage gap between men and women, let's focus on those participants and set aside the 'nonbinary', 'prefer not to say' and 'prefer to self-describe' participants.

In [None]:
# Only analyze respondents whose gender (Q2) is 'Man' or 'Woman', since the
# counts of the other groups are very small.
# .copy() makes df_one an independent frame: later cells add columns to it,
# which should not be done on a slice of df.
df_one = df[df['Q2'].isin(['Man', 'Woman'])].copy()

Let us check the age distribution of the participants (men and women). Are they young, middle-aged or old?

In [None]:
# Age distribution of the participants.
# Q1 is the age column, Q2 is the gender column.

# Age brackets from youngest to oldest so the x axis reads in order.
age_order = ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49',
             '50-54', '55-59', '60-69', '70+']

g = sns.catplot(x='Q1', kind='count', data=df_one, hue='Q2',
                order=age_order, legend=False)
g.fig.suptitle("Age distribution of the participants", x=0.5, y=1.02)
g.set(xlabel='Age')
g.set_xticklabels(rotation=60)
g.fig.set_figwidth(18)
g.fig.set_figheight(6)
plt.legend(title="Gender")

# Print the count on the bars.
for ax in g.axes.ravel():
    for bar in ax.patches:
        count = bar.get_height()
        # Bars without data have no usable height; there is nothing to label.
        if not np.isfinite(count):
            continue
        # Bar heights come back as floats; show them as whole numbers,
        # the same way the compensation plot labels its bars.
        ax.annotate(str(int(count)),
                    (bar.get_x() + bar.get_width() / 2., count + 0.95),
                    ha='center')
plt.show()

Most of the participants in the survey are between the ages of 18 years and 30 years. Almost 14,000 males and 3600 females are between 18 years and 30 years. We see very few (~2000) participants above age 50 years. 

Let's find out the yearly compensation of these participants.

In [None]:
# Q25 is the yearly compensation column and Q2 is the gender column.
# Bracket labels are listed from lowest to highest so the x axis reads in order.
comp_order = [
    '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', '4,000-4,999',
    '5,000-7,499', '7,500-9,999', '10,000-14,999', '15,000-19,999',
    '20,000-24,999', '25,000-29,999', '30,000-39,999', '40,000-49,999',
    '50,000-59,999', '60,000-69,999', '70,000-79,999', '80,000-89,999',
    '90,000-99,999', '100,000-124,999', '125,000-149,999', '150,000-199,999',
    '200,000-249,999', '250,000-299,999', '300,000-499,999',
    '$500,000-999,999', '>$1,000,000',
]

g = sns.catplot(data=df_one, x='Q25', hue='Q2', kind='count',
                order=comp_order, legend=False)
g.set(xlabel='Compensation($)')
g.set_xticklabels(rotation=90)
g.fig.suptitle("Yearly compensation ($)", x=0.5, y=1.02)
g.fig.set_figwidth(20)
g.fig.set_figheight(6)
plt.legend(title="Gender")

# Write the respondent count just above every bar.
for ax in g.axes.ravel():
    for bar in ax.patches:
        bar_height = bar.get_height()
        label_xy = (bar.get_x() + bar.get_width() / 2., bar_height + 0.99)
        ax.annotate(str(int(bar_height)), label_xy, ha='center', fontsize=10)

plt.show()


The above plot shows around 2600 participants make less than $1000. This could be because they are students and probably work part-time. Also, 48 men and 4 women make more than a million dollars!

In [None]:
# convert compensation col to numeric

# Lower and upper dollar bound of every closed Q25 compensation bracket.
_COMPENSATION_BOUNDS = {
    '$0-999': (0, 999),
    '1,000-1,999': (1000, 1999),
    '2,000-2,999': (2000, 2999),
    '3,000-3,999': (3000, 3999),
    '4,000-4,999': (4000, 4999),
    '5,000-7,499': (5000, 7499),
    '7,500-9,999': (7500, 9999),
    '10,000-14,999': (10000, 14999),
    '15,000-19,999': (15000, 19999),
    '20,000-24,999': (20000, 24999),
    '25,000-29,999': (25000, 29999),
    '30,000-39,999': (30000, 39999),
    '40,000-49,999': (40000, 49999),
    '50,000-59,999': (50000, 59999),
    '60,000-69,999': (60000, 69999),
    '70,000-79,999': (70000, 79999),
    '80,000-89,999': (80000, 89999),
    '90,000-99,999': (90000, 99999),
    '100,000-124,999': (100000, 124999),
    '125,000-149,999': (125000, 149999),
    '150,000-199,999': (150000, 199999),
    '200,000-249,999': (200000, 249999),
    '250,000-299,999': (250000, 299999),
    '300,000-499,999': (300000, 499999),
    '$500,000-999,999': (500000, 999999),
}


def str_to_numeric(data):
    """Convert a Q25 compensation bracket label to a single dollar amount.

    Parameters
    ----------
    data : object
        A bracket label such as '10,000-14,999'. Anything else (missing
        values, unknown labels) is accepted as well.

    Returns
    -------
    float
        The midpoint of the bracket, 1000000.0 for the open-ended
        '>$1,000,000' bracket, and 0.0 for any value that is not a known label.
    """
    # Missing answers arrive as non-string values (e.g. NaN).
    # NOTE(review): mapping them to 0.0 means unanswered rows are averaged in
    # as zero income by every downstream plot - confirm this is intended.
    if not isinstance(data, str):
        return 0.0
    if data == '>$1,000,000':
        # Open-ended bracket: use its lower bound.
        return 1000000.0
    bounds = _COMPENSATION_BOUNDS.get(data)
    if bounds is None:
        return 0.0
    lower, upper = bounds
    return (lower + upper) / 2
# Numeric version of the Q25 compensation bracket, used as the y value of the plots below.
df_one['Q25_num'] = df_one['Q25'].apply(str_to_numeric)

### Criteria:
Given various criteria like number of years of experience, education level, etc. let's find out the gender wage gap.


#### 1. Years of experience


#### 2. Level of education


#### 3. Size of company

In [None]:
#compensation wrt years of experience and level of education for men/women
def experience_num(x):
    """Convert a Q6 coding-experience label to a representative number of years.

    Parameters
    ----------
    x : object
        A label such as '3-5 years'. Anything else (missing values, unknown
        labels) is accepted as well.

    Returns
    -------
    int
        The number of years assigned to the label, or 0 for
        'I have never written code' and for any value that is not a known label.
    """
    years_by_label = {
        'I have never written code': 0,
        '<1 years': 1,
        '1-3 years': 2,
        '3-5 years': 4,
        '5-10 years': 7,
        '10-20 years': 15,
        '20+ years': 25,
    }
    # Missing answers arrive as non-string values (e.g. NaN).
    if not isinstance(x, str):
        return 0
    # NOTE(review): the survey export appears to spell the first bracket
    # '< 1 years' (with a space) - confirm. Both spellings are accepted here so
    # that bracket is not silently treated as 0 years.
    label = x.strip().replace('< ', '<')
    return years_by_label.get(label, 0)

# Q6 is the coding-experience column and Q4 the education-level column.
# pd.Categorical turns every value that is not listed in `categories` into NaN,
# so a label spelled slightly differently in the raw data would silently drop
# out of the plots. Normalise the spellings before matching.
# NOTE(review): the export appears to use '< 1 years' (with a space) and a
# typographic apostrophe in the degree names - confirm against the CSV. Both
# replacements are no-ops when the data already matches the labels below.
q6_labels = df_one['Q6'].replace({'< 1 years': '<1 years'})
q4_labels = df_one['Q4'].str.replace('\u2019', "'", regex=False)

df_one['Q6_num'] = q6_labels.apply(experience_num)

# Least to most experience.
experience_order = ['I have never written code', '<1 years', '1-3 years',
                    '3-5 years', '5-10 years', '10-20 years', '20+ years']

# Education levels in the order they should appear on the x axis.
degree_order = ['I prefer not to answer',
                'No formal education past high school',
                "Some college/university study without earning a bachelor's degree",
                "Bachelor's degree", "Master's degree",
                'Professional doctorate', 'Doctoral degree']

df_one['Q4_ordered'] = pd.Categorical(q4_labels,
                                      categories=degree_order,
                                      ordered=True)
df_one['Q6_ordered'] = pd.Categorical(q6_labels,
                                      categories=experience_order,
                                      ordered=True)

In [None]:
# Q21 is the size-of-company column, Q2 is the gender column.

# Smallest to largest employer.
company_size_order = ['0-49 employees', '50-249 employees', '250-999 employees',
                      '1000-9,999 employees', '10,000 or more employees']

df_one['Q21_ordered'] = pd.Categorical(df_one['Q21'],
                                       categories=company_size_order,
                                       ordered=True)

# Three side-by-side panels sharing the compensation axis:
# education level, coding experience and company size.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20, 6))
panels = [
    (ax1, 'Q4_ordered', 'Compensation with the level of education', 'Level of Education'),
    (ax2, 'Q6_ordered', 'Compensation with number of years of experience', 'Years of Experience'),
    (ax3, 'Q21_ordered', 'Compensation with size of company', 'Size of company'),
]
for ax, column, title, xlabel in panels:
    sns.lineplot(data=df_one, ax=ax, x=column, y='Q25_num', hue='Q2', sort=True, ci=None)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # Give every panel the same legend title instead of the raw column name.
    ax.legend(title='Gender')
    plt.setp(ax.get_xticklabels(), rotation=90)
ax1.set_ylabel('Compensation')

plt.show()


Yes, our intuition is right - with the same level of education and years of experience, men make about 25K more than women! The same seems to hold whether the company is small or large! Alright, let's check if this depends on the role of the participant in a company.

#### 4. Current role 

In [None]:
# Q5 is the current-role column: one pair of bars (by gender) per role.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(x='Q5', y='Q25_num', hue='Q2', data=df_one, ci=None, ax=ax)
ax.set_title('Compensation for various roles')
ax.set_xlabel('Current role')
ax.set_ylabel('Compensation')
ax.legend(title='Gender')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

Yay, there is at least one role - data analyst - where women seem to be making slightly more than men. In all of the other roles, men certainly seem to have the edge over women! 

How about knowing programming? Does it have an impact on the wage gap?

#### 5. Programming Languages 

In [None]:
# Q7 (programming languages) is a multi-select question stored as one
# Q7_Part_<n> column per option; collect those columns in file order.
language_cols = [col for col in df_one.columns if re.search(r"Q7_Part_\d+", col)]

fig, axes = plt.subplots(3, 4, figsize=(10, 15), sharey=True)

# One panel per column. zip() stops once either the columns or the 12 panels run out.
for ax, col in zip(axes.flatten(), language_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Compensation for the various Programming Languages", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Programming Languages", x=-1.0)
plt.show()

Men get paid more than women in all of the programming languages surveyed. How about in the number of years of using machine learning methods?

#### 6. Number of years of ML methods

In [None]:
# Q15 is the years-of-ML-experience column; brackets listed from least to most.
number_of_years_ml_order = [
    'I do not use machine learning methods', 'Under 1 year', '1-2 years',
    '2-3 years', '3-4 years', '4-5 years', '5-10 years', '10-20 years',
    '20 or more years',
]

df_one['Q15_ordered'] = pd.Categorical(df_one['Q15'],
                                       categories=number_of_years_ml_order,
                                       ordered=True)

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x='Q15_ordered', y='Q25_num', hue='Q2', data=df_one, ci=None, ax=ax)
ax.set_title('Compensation for experience in ML')
ax.set_xlabel('Number of years of ML experience')
ax.set_ylabel('Compensation')
ax.legend(title='Gender')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

Compensation seems to increase with years of experience, but men always seem to make more than women, though the gap seems to narrow a little at 20 or more years of experience.

#### 7. ML Frameworks

In [None]:
# Q16 (ML frameworks) is a multi-select question stored as one
# Q16_Part_<n> column per option; collect those columns in file order.
framework_cols = [col for col in df_one.columns if re.search(r"Q16_Part_\d+", col)]

fig, axes = plt.subplots(4, 4, figsize=(10, 15), sharey=True)

# One panel per column. zip() stops once either the columns or the 16 panels run out.
# No ci=None is passed here, so seaborn's default error bars are drawn.
for ax, col in zip(axes.flatten(), framework_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2')
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("ML frameworks used by the participants", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("ML framework", x=-1.0)
plt.show()

An interesting observation here is that women seem to be paid more than men when they used 'JAX' and 'MXNet'. Let's look at the result closely.

In [None]:
# Closer look at two framework columns: Q16_Part_6 (MXNet) and Q16_Part_14 (JAX).
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20, 6))

framework_panels = [
    (ax1, 'Q16_Part_6', 'Compensation with MXNET', 'ML Framework MXNet'),
    (ax2, 'Q16_Part_14', 'Compensation with JAX', 'ML Framework JAX'),
]
for panel, column, title, xlabel in framework_panels:
    sns.barplot(data=df_one, ax=panel, x=column, y='Q25_num', hue='Q2', ci=None)
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Compensation')
    panel.legend(title='Gender')

    # Write the rounded bar value just above each bar.
    for bar in panel.patches:
        bar_height = bar.get_height()
        label_xy = (bar.get_x() + bar.get_width() / 2., bar_height + 0.95)
        panel.annotate(round(bar_height), label_xy, ha='center')

plt.show()


Yes, indeed! Women working in MXNet ML framework made ~28K more than men and women in JAX ML framework made almost 35K more than men. 

#### 8. ML algorithms

In [None]:
# Q17 (ML algorithms) is a multi-select question stored as one
# Q17_Part_<n> column per option; collect those columns in file order.
algorithm_cols = [col for col in df_one.columns if re.search(r"Q17_Part_\d+", col)]

fig, axes = plt.subplots(5, 2, figsize=(10, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 10 panels run out.
for ax, col in zip(axes.flatten(), algorithm_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("ML algorithms used by the participants", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("ML algorithm", x=-0.050)
plt.show()

Men make a lot more than women across ML algorithms, specifically in CNN and RNN, where they make more than twice what a woman makes!

#### 9. Computer Vision

In [None]:
# Q18 (computer vision methods) is a multi-select question stored as one
# Q18_Part_<n> column per option; collect those columns in file order.
vision_cols = [col for col in df_one.columns if re.search(r"Q18_Part_\d+", col)]

fig, axes = plt.subplots(3, 2, figsize=(20, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 6 panels run out.
for ax, col in zip(axes.flatten(), vision_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Computer Vision used by the participants", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Computer Vision", x=-0.07)
plt.show()

In the area of Computer Vision, men seem to be paid more than women, sometimes even twice as much (as in GAN), just like in ML algorithms (CNN and RNN) above.

#### 10. Natural Language Processing

In [None]:
# Q19 (NLP methods) is a multi-select question stored as one
# Q19_Part_<n> column per option; collect those columns in file order.
nlp_cols = [col for col in df_one.columns if re.search(r"Q19_Part_\d+", col)]

fig, axes = plt.subplots(2, 2, figsize=(20, 10), sharey=True)

# One panel per column. zip() stops once either the columns or the 4 panels run out.
for ax, col in zip(axes.flatten(), nlp_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("NLP used by the participants", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Natural Language Processing", x=-0.07)
plt.show()

In the NLP area too, men are paid way more than women.

#### 11. Big Data Products

In [None]:
# Q32_A (big data products) is a multi-select question stored as one
# Q32_A_Part_<n> column per option; collect those columns in file order.
big_data_cols = [col for col in df_one.columns if re.search(r"Q32_A_Part_\d+", col)]

fig, axes = plt.subplots(5, 4, figsize=(16, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 20 panels run out.
for ax, col in zip(axes.flatten(), big_data_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Big data products used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Big data products", x=-0.98)
plt.show()

Interesting! Women are getting higher pay in the big data products area of Microsoft Azure Cosmos DB, Amazon Aurora, and Google Cloud Firestore. They are getting nearly equal pay in IBM Db2, Google Cloud SQL, and Google Cloud Spanner and also they are getting almost as much as men in most of the other products. This is a good area to be in if you are a woman!

#### 12. Cloud Computing Platforms

In [None]:
# Q27 is the cloud computing platforms question, Q25_num is the compensation
# column and Q2 is the gender column.
# Q27_A is multi-select, stored as one Q27_A_Part_<n> column per option;
# collect those columns in file order.
platform_cols = [col for col in df_one.columns if re.search(r"Q27_A_Part_\d+", col)]

fig, axes = plt.subplots(5, 2, figsize=(12, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 10 panels run out.
for ax, col in zip(axes.flatten(), platform_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Cloud computing platforms used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Cloud computing platforms", x=-0.098)
plt.show()

Wow, in the area of cloud computing platforms, women are paid way more than men in Tencent Cloud (almost 2.5 times), Alibaba cloud (more than 2 times), a little more than men in SAP cloud and SalesForce cloud and almost same as men in IBM cloud. Very good area for women!

#### 13. Cloud Computing Products

In [None]:
# Q29 is the cloud computing products question.
# Q29_A is multi-select, stored as one Q29_A_Part_<n> column per option;
# collect those columns in file order.
cloud_product_cols = [col for col in df_one.columns if re.search(r"Q29_A_Part_\d+", col)]

fig, axes = plt.subplots(1, 3, figsize=(12, 5), sharey=True)

# One panel per column. zip() stops once either the columns or the 3 panels run out.
for ax, col in zip(axes.flatten(), cloud_product_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Cloud computing products used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Cloud computing products", x=-0.98)
plt.show()

In the cloud computing products area, men are paid more.

#### 14. Storage Products

In [None]:
# Storage products.
# Q30_A is multi-select, stored as one Q30_A_Part_<n> column per option;
# collect those columns in file order.
storage_cols = [col for col in df_one.columns if re.search(r"Q30_A_Part_\d+", col)]

fig, axes = plt.subplots(2, 3, figsize=(12, 9), sharey=True)

# One panel per column. zip() stops once either the columns or the 6 panels run out.
for ax, col in zip(axes.flatten(), storage_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Storage products used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Storage products", x=-0.98)
plt.show()

Yay! In the storage products area, women get paid more in the Microsoft Azure Disk Storage and almost same as men in the Amazon Elastic File System. Also a good area for women to be in.

#### 15. Managed ML Products

In [None]:
# Q31 is the managed ML products question.
# Q31_A is multi-select, stored as one Q31_A_Part_<n> column per option;
# collect those columns in file order.
managed_ml_cols = [col for col in df_one.columns if re.search(r"Q31_A_Part_\d+", col)]

fig, axes = plt.subplots(2, 4, figsize=(16, 9), sharey=True)

# One panel per column. zip() stops once either the columns or the 8 panels run out.
for ax, col in zip(axes.flatten(), managed_ml_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Managed ML products used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Managed ML products", x=-0.98)
plt.show()

In the managed ML products area, women made almost same as men in Google Cloud Vertex AI, Databricks and Alteryx and almost `$12K` more than men in DataRobot and a whopping `$35K` more than men in Dataiku.

#### 16. BI Tools

In [None]:
# Q34 is the BI tools question.
# Q34_A is multi-select, stored as one Q34_A_Part_<n> column per option;
# collect those columns in file order.
bi_tool_cols = [col for col in df_one.columns if re.search(r"Q34_A_Part_\d+", col)]

fig, axes = plt.subplots(5, 3, figsize=(12, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 15 panels run out.
for ax, col in zip(axes.flatten(), bi_tool_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("BI tools used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("BI products", x=-0.98)
plt.show()

In the usage of BI tools, women seem to make almost `$100K` more than men in Thoughtspot, almost `$50K` more in Sisense and almost same as men in the other products. This is also a very good area for women!

Let's check if there is any relation to compensation for being on social media.


#### 17. Media Sources

In [None]:
# Q42 is the favorite-media-sources question (sources that report on data science).
# It is multi-select, stored as one Q42_Part_<n> column per option;
# collect those columns in file order.
media_cols = [col for col in df_one.columns if re.search(r"Q42_Part_\d+", col)]

fig, axes = plt.subplots(5, 2, figsize=(16, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 10 panels run out.
for ax, col in zip(axes.flatten(), media_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Favorite media sources used by the participants", fontsize=14)
# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Favorite media sources", x=-0.98)
plt.show()

Looks like men who report more on media sources about data science get higher compensation than women. Women seem to lose out here!

#### 18. Platforms where Data science courses started/completed

In [None]:
# Platforms where data science courses were started/completed.
# Q40 is multi-select, stored as one Q40_Part_<n> column per option;
# collect those columns in file order.
course_platform_cols = [col for col in df_one.columns if re.search(r"Q40_Part_\d+", col)]

fig, axes = plt.subplots(5, 2, figsize=(16, 20), sharey=True)

# One panel per column. zip() stops once either the columns or the 10 panels run out.
for ax, col in zip(axes.flatten(), course_platform_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2', ci=None)
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Platforms for data science courses used by the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Platforms for data science courses", x=-0.98)
plt.show()

Regardless of the platform where men and women started or completed data science courses, men get paid higher.

#### 19. Money spent on ML

In [None]:
# Q26 is the money-spent-on-ML column; brackets listed from lowest to highest.
money_spent_ml_order = [
    '$0 ($USD)', '$1-$99', '$100-$999', '$1000-$9,999', '$10,000-$99,999',
    '$100,000 or more ($USD)',
]

df_one['Q26_ordered'] = pd.Categorical(df_one['Q26'],
                                       categories=money_spent_ml_order,
                                       ordered=True)

fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df_one, x='Q26_ordered', y='Q25_num', hue='Q2', ci=None, ax=ax)
ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Money spent on ML by the participants/team", fontsize=14)

plt.setp(ax.get_xticklabels(), rotation=90)
# The figure has a single axes, so this labels the same x axis as plt.xlabel would.
ax.set_xlabel("Money spent on ML")
plt.show()

The more money men and women spent on learning ML, the more they tend to make in compensation. But men are always paid more.

#### 20. Type of Industry

In [None]:
# Q20 is the type-of-industry column: one pair of bars (by gender) per industry.
fig, ax = plt.subplots(figsize=(16, 5))
sns.barplot(data=df_one, x='Q20', y='Q25_num', hue='Q2', ci=None, ax=ax)
ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Type of industry of the employer of the participants", fontsize=14)

plt.setp(ax.get_xticklabels(), rotation=90)
# The figure has a single axes, so this labels the same x axis as plt.xlabel would.
ax.set_xlabel("Type of industry")
plt.show()

Here we see that the industries where women are paid more than men are Hospitality/Entertainment/Sports, Online Business/Internet-based Sales, and Military/Security/Defense. They are paid almost equal to men in Government/Public Service and Marketing/CRM. But they are definitely paid less in Computers/Technology, just like in the other types of industry.

#### 21. Role at work

In [None]:
# Q24 is the role-at-work question.
# It is multi-select, stored as one Q24_Part_<n> column per option;
# collect those columns in file order.
work_role_cols = [col for col in df_one.columns if re.search(r"Q24_Part_\d+", col)]

fig, axes = plt.subplots(3, 2, figsize=(24, 14), sharey=True)

# One panel per column. zip() stops once either the columns or the 6 panels run out.
# No ci=None is passed here, so seaborn's default error bars are drawn.
for ax, col in zip(axes.flatten(), work_role_cols):
    sns.barplot(x=col, y='Q25_num', ax=ax, data=df_one, hue='Q2')
    ax.set(xlabel=None, ylabel="Compensation")

fig.suptitle("Role at work of the participants", fontsize=14)

# plt.xlabel labels the current (last created) panel; x shifts the text sideways.
plt.xlabel("Role at work", x=-0.98)
plt.show()

In every kind of role at work, men are compensated more than women.

# Results
* With the same level of education, and years of experience men make about $25K more than women! It does not matter if the company size is small or large!
* There is only one role - that of data analyst - where women seem to be making slightly more than men.
* Women working in MXNet ML framework made ~28K more than men and women in JAX ML framework made almost 35K more than men.
* Women are getting higher pay in the big data products area of Microsoft Azure Cosmos DB, Amazon Aurora, and Google Cloud Firestore. They are getting nearly equal pay in IBM Db2, Google Cloud SQL, and Google Cloud Spanner and also they are getting almost as much as men in most of the other big data products.
* Women are paid way more than men in cloud computing platforms like Tencent Cloud (almost 2.5 times), Alibaba cloud (more than 2 times), a little more than men in SAP cloud and SalesForce cloud and almost same as men in IBM cloud platform.
* Women are paid more than men in storage products like Microsoft Azure Disk Storage and almost same as men in the Amazon Elastic File System.
* Women are paid almost same as men in Google Cloud Vertex AI, Databricks and Alteryx and almost `$12K` more than men in DataRobot and a whopping `$35K` more than men in Dataiku. This is the managed ML products area.
* Women seem to make almost `$100K` more than men in Thoughtspot, almost `$50K` more in Sisense and almost same as men in the other BI products.
* The type of industry where women are paid more than men are in Hospitality/Entertainment/Sports, Online Business/Internet-based Sales, and in Military/Security/Defense. They are paid almost equal to men in Government/Public Service and Marketing/CRM.
* **Across all of the other platforms, tools, framework, and products men get paid more than women. The gender wage-gap is huge!**


# Further work
* Combine the dataset with the survey results from previous years and find if the trend is the same or if the gap is narrowing even slightly.
* Develop an ML model that will predict future results. Is there any hope for the wage gap to narrow or even close?

# Conclusion
So what can we do to achieve pay equity? 

Promoting women to leadership roles, training and providing feedback, offering mentoring programs, providing support to women when they get back to work after taking time-off to care for children or sick family members, are some of the ways in which we can achieve pay equity.

# References:
https://blog.dol.gov/2021/03/19/5-facts-about-the-state-of-the-gender-pay-gap

https://www.cio.com/article/3613187/the-gender-pay-gap-an-it-issue-that-must-get-fixed.html

https://www.pewresearch.org/fact-tank/2021/05/25/gender-pay-gap-facts/