<center><h1>Analysing THE Wage Gap</h1></center>
<br>
<center><img src = 'https://media.nature.com/lw800/magazine-assets/d41586-020-00023-6/d41586-020-00023-6_17516216.jpg'></center>
<br>
<center><p>The gender pay gap or gender wage gap is the average difference between the remuneration for men and women who are working. Women are generally considered to be paid less than men. There are two distinct numbers regarding the pay gap: non-adjusted versus adjusted pay gap.</p></center>
<br>
<center><h2>About the Dataset</h2></center>
<br>
<center><p>The data set has been taken from Glassdoor and focuses on income for various job titles based on gender. Because many studies have shown that women are paid less than men for the same job titles, this data set will be helpful in identifying the depth of the gender-based pay gap. The features of the data set are:<br>
Job Title<br>
Gender<br>
Age<br>
PerfEval<br>
Education<br>
Dept<br>
Seniority<br>
Base Pay, and<br>
Bonus</p></center>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

from plotly.offline import init_notebook_mode, iplot 
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import pycountry
# Initialise plotly's offline mode for use inside the notebook.
# NOTE(review): `connected=True` presumably loads plotly.js from the web rather
# than embedding it in the notebook - confirm against the plotly.offline docs.
py.init_notebook_mode(connected=True)

import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

In [None]:
# Load the Glassdoor CSV and append a TotalPay column (BasePay + Bonus).
df = (
    pd.read_csv('/kaggle/input/glassdoor-analyze-gender-pay-gap/Glassdoor Gender Pay Gap.csv')
    .assign(TotalPay=lambda frame: frame['BasePay'] + frame['Bonus'])
)
# The last expression of the cell is displayed as its output.
df.head()

In [None]:
# Per-job-title count table, ordered by the count in the 'Age' column
# (ascending); that column also supplies the bar heights.
gender_encoded = pd.get_dummies(df, columns=['Gender'])
title = gender_encoded.groupby('JobTitle').count().sort_values(by='Age')

bar_colors = [
    "steelblue", "dodgerblue", "lightskyblue", "powderblue", "cyan",
    "deepskyblue", "cyan", "darkturquoise", "paleturquoise", "turquoise",
]

entries_bar = go.Bar(
    x=title.index,
    y=title['Age'],
    width=0.4,
    textposition='auto',
    # One colour per bar plus a thin black outline.
    marker=dict(color=bar_colors, line=dict(width=1, color="black")),
)

fig = go.Figure(data=[entries_bar])
fig.update_layout(
    title='Job Titles with Number of Entries',
    xaxis=dict(title='Roles'),
    yaxis=dict(title=''),
    width=700,
    height=500,
)
fig.show()

In [None]:
# Count female and male entries per job title.  Only JobTitle and Gender are
# needed for that, so the frame is narrowed before one-hot encoding; the
# group-wise sum then touches nothing but the two indicator columns.
# `title['Gender_Female']` / `title['Gender_Male']` hold the per-title counts.
title = (
    pd.get_dummies(df[['JobTitle', 'Gender']], columns=['Gender'])
    .groupby('JobTitle')
    .sum()
)

# Two donut charts sharing one figure: females in the left part of the
# x-domain, males in the right part.
female = go.Pie(labels=title.index, values=title['Gender_Female'], name="Female",
                hole=0.5, domain={'x': [0, 0.46]})
male = go.Pie(labels=title.index, values=title['Gender_Male'], name="Male",
              hole=0.5, domain={'x': [0.52, 1]})

# Text annotations label each donut with the gender it represents.
layout = dict(
    title='Job Title Distribution',
    font=dict(size=14),
    legend=dict(orientation="h"),
    annotations=[
        dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
        dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20)),
    ],
)

fig = dict(data=[female, male], layout=layout)
py.iplot(fig)

In [None]:
# Count female and male entries per education level.  Only Education and
# Gender are needed for that, so the frame is narrowed before one-hot
# encoding; the group-wise sum then touches nothing but the two indicator
# columns ('Gender_Female' and 'Gender_Male').
edu = (
    pd.get_dummies(df[['Education', 'Gender']], columns=['Gender'])
    .groupby('Education')
    .sum()
)

# Two donut charts sharing one figure: females in the left part of the
# x-domain, males in the right part.
female = go.Pie(labels=edu.index, values=edu['Gender_Female'], name="Female",
                hole=0.5, domain={'x': [0, 0.46]})
male = go.Pie(labels=edu.index, values=edu['Gender_Male'], name="Male",
              hole=0.5, domain={'x': [0.52, 1]})

# Text annotations label each donut with the gender it represents.
layout = dict(
    title='Education Level Distribution',
    font=dict(size=14),
    legend=dict(orientation="h"),
    annotations=[
        dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
        dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20)),
    ],
)

fig = dict(data=[female, male], layout=layout)
py.iplot(fig)

In [None]:
# Count female and male entries per seniority level.  Only Seniority and
# Gender are needed for that, so the frame is narrowed before one-hot
# encoding; the group-wise sum then touches nothing but the two indicator
# columns ('Gender_Female' and 'Gender_Male').
seniority = (
    pd.get_dummies(df[['Seniority', 'Gender']], columns=['Gender'])
    .groupby('Seniority')
    .sum()
)

# Two donut charts sharing one figure: females in the left part of the
# x-domain, males in the right part.
female = go.Pie(labels=seniority.index, values=seniority['Gender_Female'], name="Female",
                hole=0.5, domain={'x': [0, 0.46]})
male = go.Pie(labels=seniority.index, values=seniority['Gender_Male'], name="Male",
              hole=0.5, domain={'x': [0.52, 1]})

# Text annotations label each donut with the gender it represents.
layout = dict(
    title='Seniority Level Distribution',
    font=dict(size=14),
    legend=dict(orientation="h"),
    annotations=[
        dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
        dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20)),
    ],
)

fig = dict(data=[female, male], layout=layout)
py.iplot(fig)

## We can conclude that the data is well distributed and captures a roughly equal number of male and female entries across seniority, education and job titles

In [None]:
# Split the ages by gender with a boolean mask instead of looking every row
# up individually.  As in the original loop, any row whose Gender is not
# exactly 'Male' is counted on the female side.
is_male = df['Gender'] == 'Male'
age_male = df.loc[is_male, 'Age'].tolist()
age_female = df.loc[~is_male, 'Age'].tolist()

# Order of the data lists must match the order of the labels and colours.
hist_data = [age_female, age_male]
group_labels = ['Female', 'Male']
colors = ['#835AF1', '#333F44']

# Draw only the fitted curves; the histogram bars are switched off.
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         show_curve=True, show_hist=False)

# Add title
fig.update(layout_title_text='Distribution of Age')
fig.show()

### Male employees outnumber female employees in the 18 - 30 and 55 - 64 age groups, while female employees are more common from their 30s to mid-50s

In [None]:
# Per-gender count table; the bar heights are read from its 'JobTitle' column.
gender = df.groupby('Gender').count()

gender_bar = go.Bar(
    x=gender.index,
    y=gender['JobTitle'],
    width=0.4,
    textposition='auto',
    # Single fill colour with a thin black outline around each bar.
    marker=dict(color='dodgerblue', line=dict(width=1, color="black")),
)

fig = go.Figure(data=[gender_bar])
fig.update_layout(
    title='No of Male and Female Job Entries on the Dataset',
    xaxis=dict(title='Gender'),
    yaxis=dict(title=''),
    width=700,
    height=500,
)
fig.show()

# Preprocessing
- Grouping data by job title for males and females
- Calculating the average bonus, base and total pay for them

In [None]:
# Average base pay, total pay and bonus per job title, separately for women
# and men.  The means are computed directly from each gender's own rows, so
# the result does not depend on a count table built in another cell, nor on
# both genders covering the same job titles in the same order.
pay_columns = ['BasePay', 'TotalPay', 'Bonus']

female = df[df['Gender'] == 'Female'].groupby('JobTitle')[pay_columns].mean()
male = df[df['Gender'] == 'Male'].groupby('JobTitle')[pay_columns].mean()

In [None]:
# One bar trace per gender showing BasePay for every job title.
base_pay_traces = [
    go.Bar(name=label, x=frame.index, y=frame['BasePay'])
    for label, frame in (('Female', female), ('Male', male))
]

fig = go.Figure(data=base_pay_traces)
# Place the two genders' bars next to each other for each job title.
fig.update_layout(barmode='group', title='BasePay Gap by JobTitle')
fig.show()

In [None]:
# Build the figure trace by trace: Bonus per job title, women first, then men.
fig = go.Figure()
fig.add_trace(go.Bar(name='Female', x=female.index, y=female['Bonus']))
fig.add_trace(go.Bar(name='Male', x=male.index, y=male['Bonus']))

# Place the two genders' bars next to each other for each job title.
fig.update_layout(barmode='group', title='Bonus Pay Gap by JobTitle')
fig.show()

In [None]:
# TotalPay per job title for each gender, drawn as side-by-side bars.
female_total_bar = go.Bar(name='Female', x=female.index, y=female['TotalPay'])
male_total_bar = go.Bar(name='Male', x=male.index, y=male['TotalPay'])

fig = go.Figure(data=[female_total_bar, male_total_bar])
total_pay_layout = dict(barmode='group', title='TotalPay Gap by JobTitle')
fig.update_layout(**total_pay_layout)
fig.show()

## Plotting the difference in pay for each job title

In [None]:
# Male-minus-female TotalPay per job title.  The subtraction aligns the two
# series on their job-title index, so the bars are labelled with the index of
# the result itself rather than with the index of one of the inputs.
pay_difference = male['TotalPay'] - female['TotalPay']

fig = go.Figure(data=[
    go.Bar(name='Difference', x=pay_difference.index, y=pay_difference)
])
# Change the bar mode
fig.update_layout(barmode='group', title='Total Pay [Male - Female]')
fig.show()

In [None]:
# Male-minus-female TotalPay per job title.  Labels and values are both taken
# from the same aligned series so a label can never be paired with another
# title's value.
pay_gap = male['TotalPay'] - female['TotalPay']
diff = pay_gap.tolist()
titles = pay_gap.index.tolist()

# One summary line per job title; amounts are truncated to whole numbers.
for job_title, gap in zip(titles, diff):
    if gap > 0:
        print(f'Men make {int(gap)} more than Women as a {job_title}')
    else:
        print(f'Men make {int(-gap)} less than Women as a {job_title}')

## Conclusion
- There has been significant improvement in reducing the wage gap
- Not only do females make almost as much as men, they even exceed men's pay in certain fields
- Out of the 10 job titles in the dataset, on average men make more in 5 while females make more in 5
- A large gap can be seen for Marketing Associates and Software Engineers, where females make about 5k less than men
- Conversely, men make around 6k less than females as Data Scientists and Warehouse Associates