**PLOTLY TUTORIAL - 1**

*Kaggle ML and Data Science Survey was live from August 7th to August 25th. The median time in the survey was 16.4 minutes. Respondents were allowed to complete the survey at any time.*

*Since Kaggle is one of the best data science communities, I would like to share the main findings of the survey using the interactive Plotly library. I hope the respondents' recommendations help data enthusiasts.*

*Let's deep dive into the world of data scientists!*

**PLOTLY TUTORIAL - 0 (S&P 500 Stock Data):**
https://www.kaggle.com/hakkisimsek/plotly-tutorial-0

**PLOTLY TUTORIAL - 2 (2015 Flight Delays and Cancellations):** https://www.kaggle.com/hakkisimsek/plotly-tutorial-2

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the multiple-choice survey responses; the CSV is decoded as ISO-8859-1.
mcr = pd.read_csv(
    '../input/multipleChoiceResponses.csv',
    encoding='ISO-8859-1',
)
# Display the first rows as a quick look at the loaded columns.
mcr.head()

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [None]:
# Keep only respondents strictly older than 18 and strictly younger than 75.
mcr = mcr.loc[(mcr['Age'] > 18) & (mcr['Age'] < 75)]

# Number of respondents per distinct age value.
age = mcr['Age'].value_counts().round()

# One bar per age; each bar is coloured by its own height on the 'Reds' scale.
trace = go.Bar(
    x=age.index,
    y=age.values,
    marker={
        'color': age.values,
        'colorscale': 'Reds',
        'showscale': True,
    },
)
data = [trace]
layout = go.Layout(
    title='Age distribution',
    yaxis={'title': '# of Respondents'},
)
fig = go.Figure(data=data, layout=layout)
# The x-axis title and tick font are applied after the figure is built.
fig['layout']['xaxis'].update({'title': 'Age', 'tickfont': {'size': 12}})
py.iplot(fig)
20
30
40
50
60


Let's look at respondents' job satisfaction level, gender, country, age and salary for US and non-US respondents, converting salaries to USD with the help of the conversion-rates data set.

In [None]:
# Build a salary table converted with the exchange-rate file, then split it
# into US and non-US respondents.

# Strip hyphens and thousands separators so the amounts parse as plain numbers.
# NOTE(review): assumes the compensation column is free text that may contain
# ',' separators -- confirm against the raw CSV.
mcr['CompensationAmount'] = (
    mcr['CompensationAmount']
    .str.replace('-', '')
    .str.replace(',', '')
)

# Keep only the columns used by the salary analyses and drop incomplete rows.
salary = mcr[['CompensationAmount', 'CompensationCurrency', 'Country', 'JobSatisfaction',
              'CurrentJobTitleSelect', 'Age', 'GenderSelect']].dropna()

# Attach the exchange rate for each respondent's compensation currency.
crates = pd.read_csv('../input/conversionRates.csv')
crates.drop('Unnamed: 0', axis=1, inplace=True)
salary = salary.merge(crates, left_on='CompensationCurrency', right_on='originCountry', how='left')

# errors='coerce' turns amounts that still do not parse into NaN instead of
# aborting the whole cell; NaN salaries fail both range comparisons below and
# therefore never reach the US / non-US tables.
salary['Salary'] = (
    pd.to_numeric(salary['CompensationAmount'], errors='coerce') * salary['exchangeRate']
)

# Keep salaries strictly between 100 and 500000 and split by country.
in_range = (salary['Salary'] > 100) & (salary['Salary'] < 500000)
is_us = salary['Country'] == 'United States'
us_salary = salary[in_range & is_us]
non_us_salary = salary[in_range & ~is_us]


In [None]:
# Preview the first five rows of the US salary table.
us_salary.head(n=5)

In [None]:
# Preview the first five rows of the non-US salary table.
non_us_salary.head(n=5)

We find that the median salary in the US is higher than in non-US countries for the same age group, by a fairly steady margin until the age of 55; above 55 the salary difference varies.

In [None]:
# Median salary per age, as a one-column ('Salary') frame for each region.
us_group = us_salary.groupby('Age')[['Salary']].median()
non_us_group = non_us_salary.groupby('Age')[['Salary']].median()

# One marker series per region; salaries are rounded to the nearest hundred.
trace0 = go.Scatter(
    x=us_group.index,
    y=us_group['Salary'].round(-2),
    name='US',
    mode='markers',
    marker={'size': 9, 'color': 'aqua'},
)
trace1 = go.Scatter(
    x=non_us_group.index,
    y=non_us_group['Salary'].round(-2),
    name='non-US',
    mode='markers',
    marker={'size': 9, 'color': 'navy'},
)

data = [trace0, trace1]
layout = {
    'title': 'The Median Salary by Age in US and Non-US Countries',
    'xaxis': {'title': 'Age', 'tickfont': {'size': 12}},
    'yaxis': {'title': 'Salary ($)'},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig)


Convert job satisfaction to a numeric 1-to-10 scale, then group US and non-US salaries by job satisfaction.

In [None]:
import numpy as np

# Recode the JobSatisfaction answers to integers and compute the median salary
# per satisfaction level for USD-paid and non-USD-paid respondents.

# The two labelled end points become their leading number ('1' and '10');
# "I prefer not to share" becomes missing and is dropped below.
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection does not update the frame under pandas Copy-on-Write.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
salary['JobSatisfaction'] = salary['JobSatisfaction'].replace({
    '10 - Highly Satisfied': '10',
    '1 - Highly Dissatisfied': '1',
    'I prefer not to share': np.nan,
})
salary.dropna(subset=['JobSatisfaction'], inplace=True)
salary['JobSatisfaction'] = salary['JobSatisfaction'].astype(int)

# median() matches the "Median Salary" chart built from these frames and is
# robust to extreme salaries.
# NOTE(review): "US" is identified here by the matched currency row
# (originCountry == 'USD'), not by the Country column -- confirm this is intended.
salary_us = salary[salary.originCountry == 'USD'].groupby('JobSatisfaction').Salary.median().to_frame()
salary_non_us = salary[salary.originCountry != 'USD'].groupby('JobSatisfaction').Salary.median().to_frame()

Now plot US and non-US salaries against job satisfaction in a scatter plot. The job satisfaction trend shows a similar pattern for US and non-US workers regardless of their salaries. It looks like US respondents report higher job satisfaction when their salaries are above 100K, and similarly non-US respondents report higher job satisfaction when they make more than 40K.

In [None]:
# Scatter of salary (rounded to the nearest hundred) against job-satisfaction
# level, one marker series for US and one for non-US respondents.
trace0 = go.Scatter(
    x=salary_us.index,
    y=salary_us['Salary'].round(-2),
    name='US',
    mode='markers',
    marker={'size': 11, 'color': 'navy'},
)
trace1 = go.Scatter(
    x=salary_non_us.index,
    y=salary_non_us['Salary'].round(-2),
    name='non_US',
    mode='markers',
    marker={'size': 11, 'color': 'aqua'},
)

data = [trace0, trace1]
layout = {
    'title': 'The Median Salary & Satisfaction in US & non-US Countries',
    # tickmode='linear' is set on the satisfaction axis together with a
    # smaller tick font.
    'xaxis': {
        'title': 'Job Satisfaction',
        'tickmode': 'linear',
        'tickfont': {'size': 10},
    },
    'yaxis': {'title': 'Salary ($)'},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig)

Survey respondents' majors and job titles: let's see whether job satisfaction is driven by major and job title or by salary.

In [None]:

from plotly import tools
import plotly.figure_factory as ff

# Side-by-side bar charts of the eight most common majors and job titles,
# each shown as a share of respondents.

# Drop the catch-all 'Other' major; copy so the column assignments below act
# on an independent frame.
mcr = mcr[~(mcr['MajorSelect'] == 'Other')].copy()

# Shorten long labels so they fit under the bars. The result is assigned back:
# replace(inplace=True) on a column selection does not update the frame under
# pandas Copy-on-Write.
mcr['MajorSelect'] = mcr['MajorSelect'].replace({
    'Information technology, networking, or system administration': 'IT, Network, System Admin',
    'Mathematics or statistics': 'Math or stats',
    'Engineering (non-computer focused)': 'Engineering (non-CS)',
    'IT, Network, System Admin': 'IT-Network-System',
})

# Top-8 shares rounded to 4 decimals. The column is named explicitly because
# the default name produced by value_counts().to_frame() differs between
# pandas versions.
ms = (
    mcr['MajorSelect']
    .value_counts(normalize=True)
    .head(8)
    .round(4)
    .to_frame(name='MajorSelect')
)
trace1 = go.Bar(
    x=ms.index,
    y=ms['MajorSelect'],
    marker=dict(color='orange')
)

mcr['CurrentJobTitleSelect'] = mcr['CurrentJobTitleSelect'].replace({
    'Software Developer/Software Engineer': 'Software Developer',
    'Machine Learning Engineer': 'ML Engineer',
})
cs = (
    mcr['CurrentJobTitleSelect']
    .value_counts(normalize=True)
    .head(8)
    .round(4)
    .to_frame(name='CurrentJobTitleSelect')
)
trace2 = go.Bar(
    x=cs.index,
    y=cs['CurrentJobTitleSelect'],
    marker=dict(color='navy')
)

# One row, two columns: majors on the left, job titles on the right.
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Majors', 'Titles'))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=500, width=820, title='Majors & Titles in Data Science World',
                     showlegend=False)
py.iplot(fig)

It looks like respondents' job satisfaction depends more on their job title than on their salary. For example, programmers, data analysts and business analysts have comparatively lower salaries but higher job satisfaction.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two horizontal bar charts by job title: median salary (left) and mean job
# satisfaction (right).

# Exclude the catch-all 'Other' job title from both tables.
salary = salary[~(salary['CurrentJobTitleSelect'] == 'Other')]
mcr = mcr[~(mcr['CurrentJobTitleSelect'] == 'Other')]

f, ax = plt.subplots(1, 2, figsize=(13, 8))

# Left panel: median salary per job title, highest first.
sal_job = (
    salary.groupby('CurrentJobTitleSelect')['Salary']
    .median()
    .to_frame()
    .sort_values(by='Salary', ascending=False)
)
# x and y are passed by keyword, like the second barplot call below; recent
# seaborn releases no longer accept them positionally.
sns.barplot(x=sal_job.Salary, y=sal_job.index, facecolor=(0, 0, 0, 0), linewidth=3,
            edgecolor=sns.color_palette("inferno", 25), ax=ax[0])

ax[0].set_title("Salaries & Job Titles", fontsize=12)
ax[0].set_xlabel('Salary ($)', fontsize=12)
ax[0].set_ylabel('')
# Dashed green line marks the overall median salary.
ax[0].axvline(salary['Salary'].median(), linestyle='dashed', color="g")
ax[0].grid(color='silver', linestyle='--')

# Right panel: mean job satisfaction per job title, on a copy of mcr.
satisfy = mcr.copy()
# The labelled end points become their leading number ('1' and '10');
# "I prefer not to share" becomes missing and is dropped. The result is
# assigned back because replace(inplace=True) on a column selection does not
# update the frame under pandas Copy-on-Write; np.nan replaces the np.NaN
# alias, which no longer exists in NumPy 2.
satisfy['JobSatisfaction'] = satisfy['JobSatisfaction'].replace({
    '10 - Highly Satisfied': '10',
    '1 - Highly Dissatisfied': '1',
    'I prefer not to share': np.nan,
})
satisfy.dropna(subset=['JobSatisfaction'], inplace=True)
satisfy['JobSatisfaction'] = satisfy['JobSatisfaction'].astype(int)
satisfy_job = (
    satisfy.groupby(['CurrentJobTitleSelect'])['JobSatisfaction']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
sns.barplot(y=satisfy_job.index, x=satisfy_job.JobSatisfaction, facecolor=(0, 0, 0, 0), linewidth=3,
            edgecolor=sns.color_palette("winter", 20), ax=ax[1])

ax[1].set_title("Satisfaction & Job Titles", fontsize=12)
ax[1].set_xlabel('Job Satisfaction (1-10)', fontsize=12)
ax[1].set_ylabel('')
ax[1].grid(color='silver', linestyle='--')
# Wide horizontal gap so the right panel's title labels do not overlap the left panel.
plt.subplots_adjust(wspace=0.95)
plt.show()

Let's look at the median salaries of men and women to find out whether there is gender inequality in pay.

In [None]:
# Line chart of the median salary of men and women at each age.
male_salary = salary[salary['GenderSelect'] == 'Male']
female_salary = salary[salary['GenderSelect'] == 'Female']

# median() matches the chart title below and is robust to extreme salaries.
male = male_salary.groupby('Age').Salary.median().to_frame()
female = female_salary.groupby('Age').Salary.median().to_frame()

# Dashed lines, one per gender; salaries are rounded to the nearest hundred.
trace0 = go.Scatter(
    x=male.index,
    y=male['Salary'].round(-2),
    name='male',
    line=dict(
        color='aqua',
        width=2,
        dash='dash')
)

trace1 = go.Scatter(
    x=female.index,
    y=female['Salary'].round(-2),
    name='female',
    line=dict(
        color='navy',
        width=2,
        dash='dash')
)

data = [trace0, trace1]
layout = dict(title='The Median Salary of Men & Women by Age',
              xaxis=dict(title='Age'),
              yaxis=dict(title='Salary ($)')
              )

fig = dict(data=data, layout=layout)
fig['layout']['xaxis'].update(dict(title='Age', tickfont=dict(size=12)))
py.iplot(fig)

It looks like there is no gender inequality between men and women, but there is a big spike for men aged 48 to 50, and the trend for women aged 50 and higher is inconsistent.

In [None]:
# Median salary of men and women at each age, charted separately for
# respondents whose matched currency row is USD and for everyone else.
male_us_salary = salary[(salary['GenderSelect'] == 'Male') &
                        (salary.originCountry == 'USD')]
male_non_us_salary = salary[(salary['GenderSelect'] == 'Male') &
                            (salary.originCountry != 'USD')]
female_us_salary = salary[(salary['GenderSelect'] == 'Female') &
                          (salary.originCountry == 'USD')]
female_non_us_salary = salary[(salary['GenderSelect'] == 'Female') &
                              (salary.originCountry != 'USD')]

# median() matches both chart titles below and is robust to extreme salaries.
male_us = male_us_salary.groupby('Age').Salary.median().to_frame()
male_nus = male_non_us_salary.groupby('Age').Salary.median().to_frame()
female_us = female_us_salary.groupby('Age').Salary.median().to_frame()
female_nus = female_non_us_salary.groupby('Age').Salary.median().to_frame()

# --- US chart: grey markers for men, red markers for women ---
trace0 = go.Scatter(
    x=male_us.index,
    y=male_us['Salary'].round(-2),
    name='male',
    mode='markers',
    marker=dict(
        size=8,
        color='grey'
    )
)

trace1 = go.Scatter(
    x=female_us.index,
    y=female_us['Salary'].round(-2),
    name='female',
    mode='markers',
    marker=dict(
        size=8,
        color='red'
    )
)

data = [trace0, trace1]
layout = dict(title='The Median Salary of Men & Women by Age in US',
              xaxis=dict(title='Age'),
              yaxis=dict(title='Salary ($)')
              )

fig = dict(data=data, layout=layout)
fig['layout']['xaxis'].update(dict(title='Age', tickfont=dict(size=12)))
py.iplot(fig)

# --- non-US chart: same encoding as the US chart ---
trace0 = go.Scatter(
    x=male_nus.index,
    y=male_nus['Salary'].round(-2),
    name='male',
    mode='markers',
    marker=dict(
        size=8,
        color='grey'
    )
)

trace1 = go.Scatter(
    x=female_nus.index,
    y=female_nus['Salary'].round(-2),
    name='female',
    mode='markers',
    marker=dict(
        size=8,
        color='red'
    )
)

data = [trace0, trace1]
layout = dict(title='The Median Salary of Men & Women by Age in non-US countries',
              xaxis=dict(title='Age'),
              yaxis=dict(title='Salary ($)')
              )

fig = dict(data=data, layout=layout)
fig['layout']['xaxis'].update(dict(title='Age', tickfont=dict(size=12)))
py.iplot(fig)

It looks like women are paid less than men after the age of 35~40 and are out of the labor force after the age of 50. Non-US respondents show more variation in pay than US respondents. Is it because of experience and job title? Let's find out.

In [None]:
# Median salary per job title for men and women, split into respondents whose
# matched currency row is USD and everyone else.

# Shorten the two longest job-title labels. The result is assigned back:
# replace(inplace=True) on a column selection does not update the frame under
# pandas Copy-on-Write.
salary['CurrentJobTitleSelect'] = salary['CurrentJobTitleSelect'].replace({
    'Software Developer/Software Engineer': 'Software Developer',
    'Machine Learning Engineer': 'ML Engineer',
})

male_us_salary = salary[(salary['GenderSelect'] == 'Male') &
                        (salary.originCountry == 'USD')]
male_non_us_salary = salary[(salary['GenderSelect'] == 'Male') &
                            (salary.originCountry != 'USD')]
female_us_salary = salary[(salary['GenderSelect'] == 'Female') &
                          (salary.originCountry == 'USD')]
female_non_us_salary = salary[(salary['GenderSelect'] == 'Female') &
                              (salary.originCountry != 'USD')]

# median() matches the "Median Salary" charts built from these frames and is
# robust to extreme salaries.
male_us = male_us_salary.groupby('CurrentJobTitleSelect').Salary.median().to_frame()
male_nus = male_non_us_salary.groupby('CurrentJobTitleSelect').Salary.median().to_frame()
female_us = female_us_salary.groupby('CurrentJobTitleSelect').Salary.median().to_frame()
female_nus = female_non_us_salary.groupby('CurrentJobTitleSelect').Salary.median().to_frame()


In [None]:
# US chart: salary per job title (rounded to the nearest hundred), navy
# markers for men and aqua markers for women.
trace0 = go.Scatter(
    x=male_us.index,
    y=male_us['Salary'].round(-2),
    name='MALE',
    mode='markers',
    marker={'size': 11, 'color': 'navy'},
)
trace1 = go.Scatter(
    x=female_us.index,
    y=female_us['Salary'].round(-2),
    name='FEMALE',
    mode='markers',
    marker={'size': 11, 'color': 'aqua'},
)

data = [trace0, trace1]
layout = {
    'title': 'The Median Salary & job Title for Male and Female in US',
    'xaxis': {
        'title': 'Job Title',
        'tickmode': 'linear',
        'tickfont': {'size': 10},
    },
    'yaxis': {'title': 'Salary ($)'},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig)

In [None]:
# Non-US chart: salary per job title (rounded to the nearest hundred), navy
# markers for men and aqua markers for women.
trace0 = go.Scatter(
    x=male_nus.index,
    y=male_nus['Salary'].round(-2),
    name='MALE',
    mode='markers',
    marker={'size': 11, 'color': 'navy'},
)
trace1 = go.Scatter(
    x=female_nus.index,
    y=female_nus['Salary'].round(-2),
    name='FEMALE',
    mode='markers',
    marker={'size': 11, 'color': 'aqua'},
)

data = [trace0, trace1]
layout = {
    'title': 'The Median Salary & job Title for Male and Female in NON US',
    'xaxis': {
        'title': 'Job Title',
        'tickmode': 'linear',
        'tickfont': {'size': 10},
    },
    'yaxis': {'title': 'Salary ($)'},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig)

We conclude that the median salary for men is relatively higher than for women in both US and non-US countries, regardless of age and job title.