In [None]:
!pip install seaborn==0.11.0 # upgrading seaborn library to use newer plots and features!!

In [None]:
!pip install --upgrade pip

In [None]:
#Packages Import

#Data processing#
import numpy as np 
import pandas as pd
#import operator 
from datetime import date, datetime
import os

#Visualization#
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Seaborn colour palettes shared by the plots in this notebook.

# Qualitative palettes requested with six colours each.
palette_1, palette_2, palette_5 = (
    sns.color_palette(name, 6) for name in ('Accent', 'Set1', 'Paired')
)

# Palettes requested without an explicit colour count (seaborn picks the length).
palette_3, palette_4, palette_6 = (
    sns.color_palette(name) for name in ('BrBG', 'CMRmap', 'RdYlBu')
)

# Two-colour palettes, used below for the Gender hue.
palette_binary_1, palette_binary_2, palette_binary_3 = (
    sns.color_palette(name, 2) for name in ('Accent_r', 'Set1', 'Set2')
)

# Apply the 'fivethirtyeight' matplotlib style sheet to subsequent figures.
plt.style.use('fivethirtyeight')

In [None]:
# Collect the path of every file found under the Kaggle input directory.
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so the list is sorted: positional access such as file_path[0] then
# picks the same file on every run.
file_path = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

# Show what was found so the reader can see which file is loaded next.
for path in file_path:
    print(path)

In [None]:
# Load the first discovered input file into a DataFrame and preview it.
csv_path = file_path[0]
ggpg = pd.read_csv(csv_path)
ggpg.head()

In [None]:
# Print column names, dtypes and non-null counts to check for missing values.
ggpg.info()

The data set is clean: it has no missing values.
Thanks, Neelima!

In [None]:
# Convert the text columns below to pandas' `category` dtype in one call,
# then print the frame summary again to confirm the new dtypes.
columns = ['JobTitle', 'Gender', 'Education', 'Dept']
ggpg = ggpg.astype({name: 'category' for name in columns})

ggpg.info()

In [None]:
# Report how many distinct job titles exist, then how often each one occurs.
job_title_count = len(ggpg['JobTitle'].unique())
print(f'Num of diffirent job titles: {job_title_count}')
print('Job Titles: ')
ggpg['JobTitle'].value_counts()

In [None]:
# List the distinct education levels present in the data.
ggpg.Education.unique()

In [None]:
# Turn Education into an ordered categorical using the explicit order below,
# so comparisons and plot axes follow that order rather than the default one.
education_order = ['High School', 'College', 'Masters', 'PhD']
ggpg['Education'] = ggpg['Education'].cat.reorder_categories(education_order, ordered=True)
ggpg['Education'].head()

In [None]:
# Base pay by department and gender, with one facet column per education level.
# NOTE(review): the rows are passed un-aggregated, so each Dept/Gender bar is
# drawn from many rows rather than a single mean/median value — confirm this is
# the intended view before reading pay gaps off the chart.
education_levels = ['High School', 'College', 'Masters', 'PhD']
fig = px.bar(
    ggpg,
    x="Dept",
    y="BasePay",
    color="Gender",
    barmode="group",
    facet_col="Education",
    category_orders={"Education": education_levels},
    labels={'BasePay': 'Base Pay'},
)
fig.show()

In general, the base pay gap increases with education level.

In [None]:
# Report how many distinct departments exist, then how often each one occurs.
# (The label previously said "job titles" although the value counts departments.)
print(f'Num of different departments: {ggpg.Dept.nunique()}')
print('Departments: ')
ggpg.Dept.value_counts()

In [None]:
# Base pay by department and gender, with one facet per job title
# (facets wrap after two columns).
fig = px.bar(
    ggpg,
    x="Dept",
    y="BasePay",
    color="Gender",
    facet_col="JobTitle",
    facet_col_wrap=2,
    facet_row_spacing=0.1,
)

fig.update_layout(
    title='Base Pay per JobTitle',  # title previously misspelled the column name
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
    height=1200,  # extra height for the wrapped facet rows
)

fig.show()

Women earn more as Marketing Associates in all departments.
Men earn significantly more as Software Engineers.

For the remaining job titles, men usually earn a higher base pay.


In [None]:
# Number of rows (employees) for each Dept/Gender combination.
pair_sizes = ggpg.groupby(['Dept', 'Gender']).size()
gender_dept = pair_sizes.reset_index(name='counts')

gender_dept.head()

In [None]:
# Grouped bar chart of the per-department counts, split by gender.
fig = px.bar(
    gender_dept,
    x='Dept',
    y='counts',
    color='Gender',
    barmode='group',
    title='Count Gender per Department',
)
fig.show()

Visualization inspired by:
https://www.kaggle.com/awwalmalhi/titanic-eda-and-feature-engineering

In [None]:
# Age distribution by gender: density curves on the left, boxen plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
density_ax, boxen_ax = ax

sns.kdeplot(
    data=ggpg, x='Age', hue='Gender',
    fill=True, palette=palette_binary_3, ax=density_ax,
)
sns.boxenplot(
    data=ggpg, x='Gender', y='Age',
    ax=boxen_ax, palette=palette_binary_3,
)

# Remove the top and right spines, then render the figure.
sns.despine()
plt.show()

In [None]:
# Performance-evaluation distribution by gender, shown three ways:
# density curves, boxen plot and violin plot.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
density_ax, boxen_ax, violin_ax = ax

sns.kdeplot(
    data=ggpg, x='PerfEval', hue='Gender',
    fill=True, palette=palette_binary_2, ax=density_ax,
)

# The two categorical plots take identical arguments apart from the target axes.
for plotter, target_ax in ((sns.boxenplot, boxen_ax), (sns.violinplot, violin_ax)):
    plotter(data=ggpg, x='Gender', y='PerfEval', ax=target_ax, palette=palette_binary_2)

# Remove the top and right spines, then render the figure.
sns.despine()
plt.show()

In [None]:
# Age by education level and gender: outline-only box plots with the
# individual observations drawn on top as a dodged strip plot.
plt.figure(figsize=(15, 8))
plt.style.use('fivethirtyeight')

# Thin black line style shared by the whiskers and caps.
thin_black = {'color': 'black', 'linewidth': 0.3}
PROPS = {
    'boxprops': {'facecolor': 'none', 'edgecolor': 'black', 'linewidth': 0.3},
    'medianprops': {'color': 'black', 'linewidth': 1.5},
    'whiskerprops': dict(thin_black),
    'capprops': dict(thin_black),
}

# Outliers are hidden on the boxes because the strip plot already shows every point.
sns.boxplot(
    data=ggpg, x='Education', y='Age', hue='Gender',
    showfliers=False, **PROPS,
)
sns.stripplot(
    data=ggpg, x='Education', y='Age', hue='Gender',
    palette=palette_binary_3, dodge=True,
)

sns.despine()
plt.legend(loc='upper right')
plt.show()

In [None]:
# Keep only the rows whose job title is 'Software Engineer'.
is_software_engineer = ggpg['JobTitle'] == 'Software Engineer'
SE = ggpg[is_software_engineer]

In [None]:
# Distinct seniority levels among software engineers, in ascending order
# (np.unique returns the sorted unique values).
sen = np.unique(SE['Seniority'])
sen

In [None]:
# Base pay of software engineers by education level: one scatter trace per
# seniority level, plus a slider that shows either all levels or a single one.
fig = go.Figure()

# Sort the levels so the trace order matches the slider step order
# (unique() returns values in order of appearance, not ascending).
seniority_levels = sorted(SE['Seniority'].unique())

for level in seniority_levels:
    # Plot only the rows of this seniority level. The previous version passed
    # the whole SE frame to every trace, so all traces were identical and the
    # slider had no visible effect.
    level_rows = SE[SE['Seniority'] == level]
    fig.add_trace(go.Scatter(
        x=level_rows['Education'],
        y=level_rows['BasePay'],
        mode='markers',
        name=str(level),
    ))

# The first step shows every trace; each following step shows exactly one.
# The visibility masks are derived from the data instead of being hard-coded
# to five levels.
steps = [{
    'method': 'update',
    'label': 'all',
    'args': [{'visible': [True] * len(seniority_levels)}],
}]
for position, level in enumerate(seniority_levels):
    steps.append({
        'method': 'update',
        'label': f'{level} year' if level == 1 else f'{level} years',
        'args': [{'visible': [i == position for i in range(len(seniority_levels))]}],
    })
sliders = [{'steps': steps}]

fig.update_layout(
    sliders=sliders,
    xaxis_title='Education',
    yaxis_title='Base Pay',
    # Keep the education levels in their category order regardless of which
    # level happens to appear first in a given trace.
    xaxis={
        'categoryorder': 'array',
        'categoryarray': list(SE['Education'].cat.categories),
    },
)
fig.show()