In [None]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so use whichever name this installation
# provides instead of hard-coding one that may raise OSError.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

In [None]:
# Load the IBM HR attrition data set into `df`, the frame the cells below work on.
df = pd.read_csv(
    filepath_or_buffer='../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)

# EDA

In [None]:
# Print the frame's dimensions and column index, then show the first rows
# as the cell's rich output.
for overview in (df.shape, df.columns):
    print(overview)
df.head()


In [None]:
# Plot the value counts of every object-dtype (text) column, one panel per column.
categorical = df.select_dtypes(include= 'object').columns

# Derive the grid from the number of columns instead of assuming exactly nine:
# a fixed 3x3 grid raises IndexError with more columns and leaves blank panels
# with fewer. Three panels per row; figure height scales with the row count so
# nine columns still give the original 16x16 figure.
n_cols = 3
n_rows = max(1, -(-len(categorical) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(n_rows, n_cols, figsize = (16, 16 * n_rows / 3), squeeze = False)
fig.tight_layout(w_pad = 10.0)
axs = ax.ravel()
for i, column in enumerate(categorical):
    df[column].value_counts().plot.barh(ax = axs[i])
    axs[i].set_title(column)

# Hide any trailing panels that have no column to display.
for unused_ax in axs[len(categorical):]:
    unused_ax.set_visible(False)

In [None]:
# Annotated correlation heat map of the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer drops text columns silently
# (numeric_only defaults to False) and raises on them, so restrict the frame
# to numeric/bool columns explicitly. This matches the pre-2.0 behaviour.
fig,ax = plt.subplots(figsize=(15,15))
numeric_corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr,annot=True,ax=ax, cmap='plasma', fmt='.2f')

Interesting to notice: there is little correlation between income and satisfaction, but a high correlation between income and both years worked at the company and age.

In [None]:
fig, ax = plt.subplots(2,2, figsize = (17,14))
fig.tight_layout(w_pad=11)
axs = ax.ravel()
def income_barplot(ax, predictor):
    """Draw the mean MonthlyIncome per category of `predictor` as sorted horizontal bars.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    predictor : str
        Name of the column of the notebook-global `df` to group by.
    """
    # Select MonthlyIncome *before* aggregating: averaging every column first
    # is wasteful and raises TypeError on text columns in pandas >= 2.0.
    mean_income = df.groupby(by = predictor)['MonthlyIncome'].mean().sort_values()
    mean_income.plot.barh(ax = ax, color = 'indianred')

# One panel per grouping column.
for i,predictor in enumerate(['EducationField','MaritalStatus','Gender','JobRole']):
    income_barplot(axs[i],predictor)

# Men vs. Women

Question: Do women at this company in general get paid more than men? Is this statistically significant?

In [None]:
# Overlay the monthly-income histograms (with KDE curves) of both genders on one axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='MonthlyIncome', hue='Gender', kde=True, ax=ax)

The salary distributions are not normally distributed, so a simple t-test of the difference won't suffice. Instead we will use a two-sample Kolmogorov-Smirnov test, whose null hypothesis states that the two groups are drawn from the same distribution. The test reports the KS statistic, which is the maximum difference between the cumulative distributions of the two groups, and a p-value, which is the probability of observing a difference at least that large if the null hypothesis is true (i.e. if the two groups are drawn from the same distribution). Hence, if the p-value is sufficiently low, we can reject the null hypothesis.

In [None]:
from scipy.stats import ks_2samp

# Compare the monthly-income samples of women and men with a two-sample KS test.
is_female = df['Gender'] == 'Female'
is_male = df['Gender'] == 'Male'
results = ks_2samp(df.loc[is_female, 'MonthlyIncome'], df.loc[is_male, 'MonthlyIncome'])

print(f'KS Statistic = {results[0]} with a probability of {results[1]}')
# Report the decision at the 5% significance level.
print(
    'Reject Null-Hypothesis. Women and men are paid differently'
    if not results[1] > 0.05
    else 'Insufficient Evidence to Reject Null-Hypothesis that women and men are paid the same'
)

What could be the cause of the fatter base in men's pay? Is it to do with the roles more frequently held by men?

In [None]:
# Horizontal bars of monthly income per job role, with one bar per gender.
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df, x='MonthlyIncome', y='JobRole', hue='Gender', ax=ax)

In [None]:
# One gender-split income histogram per job role, laid out on a 3x3 grid of panels.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(13, 13))
fig.tight_layout(h_pad=4)
roles = df['JobRole'].unique()
axs = ax.ravel()
for i, role in enumerate(roles):
    # Rows belonging to the current role only.
    tempdf = df[df['JobRole'] == role]
    sns.histplot(data=tempdf, x='MonthlyIncome', hue='Gender', kde=True, ax=axs[i])
    axs[i].set_title(role)

Across the job roles, the only real discrepancy appears within the Research Director position, where there is very little overlap between the pay of men and women, suggesting bias.

In [None]:
# Zoom in on the Research Director role: income histogram split by gender.
rd = df[df['JobRole'] == 'Research Director']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=rd, x='MonthlyIncome', hue='Gender', kde=True, ax=ax)
ax.set_title('Distribution of Pay Between Men and Women for Research Director Role')

In [None]:
def KolmSmirTest(JobRole = 'Research Director', data = None, alpha = 0.05):
    """Run a two-sample KS test on female vs. male MonthlyIncome within one job role.

    Prints the KS statistic and p-value, followed by the decision at the given
    significance level. The null hypothesis is that both income samples are
    drawn from the same distribution, i.e. women and men are paid the same.

    Parameters
    ----------
    JobRole : str
        Value of the 'JobRole' column to restrict the comparison to.
    data : DataFrame, optional
        Frame with 'JobRole', 'Gender' and 'MonthlyIncome' columns. Defaults to
        the notebook-global `df`.
    alpha : float
        Significance level; the null hypothesis is rejected when the p-value
        is not greater than this.
    """
    source = df if data is None else data
    role_rows = source.loc[source['JobRole'] == JobRole]
    female_income = role_rows.loc[role_rows['Gender'] == 'Female', 'MonthlyIncome']
    male_income = role_rows.loc[role_rows['Gender'] == 'Male', 'MonthlyIncome']
    results = ks_2samp(female_income, male_income)

    print('KS Statistic = {} with a probability of {}'.format(results[0],results[1]))
    if results[1] > alpha:
        # The null hypothesis is "paid the same"; the earlier wording stated it backwards.
        print('For Job = {}, there is insufficient evidence to reject null-hypothesis that women and men are paid the same'.format(JobRole))
    else:
        print('For Job = {}, Reject Null-Hypothesis. Women and men are paid differently for this role'.format(JobRole))
        
# Run the test for the Research Director role (the function's default), spelled out explicitly.
KolmSmirTest(JobRole = 'Research Director')

For the role of Research Director there is evidence to suggest there is a bias towards men's pay. The KS test provides no insight into what causes this difference, but we can see visually that it is due to highly paid male Research Directors at the top end. Could there be other factors affecting this?


# Meritocratic Based Pay?

In [None]:
# Scatter of monthly income against total working years, coloured by gender.
sns.jointplot(data=df, x="MonthlyIncome", y="TotalWorkingYears", hue="Gender", kind="scatter")