In [None]:
import pandas as pd
import numpy as np
import plotly as ply
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
ply.offline.init_notebook_mode (connected = True)
import re
from functools import reduce
import math

from IPython.display import HTML
import base64


<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/3136/media/kaggle-transparent.svg" width="1000">

# Kaggle survey 2021
**1. Library.**

* These are all the libraries I used
* I use the plotly library for visualization because it is easy to understand, even for a beginner

In [None]:
# Plotting stack. The trailing comments record the versions seen when the notebook was written.
import plotly as ply
print(f'plotly-{ply.__version__}')  # v4.8.1

from plotly.subplots import make_subplots

# plotly.express ships inside plotly, so the plotly version is reported for it
import plotly.express as px
print(f'plotly.express-{ply.__version__}')

import plotly.graph_objects as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Data handling
import pandas as pd
print(f'pandas-{pd.__version__}')  # v1.1.3

import numpy as np
print(f'numpy-{np.__version__}')  # v1.20.1

# Standard library helpers
import re
print(f're-{re.__version__}')  # v2.2.1

from functools import reduce

import math

**2. Data**

* I removed the first row and used it as the data header, as shown below
* The preview below shows 5 rows and 369 columns

In [None]:
# Location of the survey responses CSV
data_path = '../input/d/dinhvantrong1991/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
df = pd.read_csv(data_path)

# The first data row holds the column labels used in the rest of the notebook:
# promote it to the header, then remove it from the data and renumber the rows.
header_labels = df.iloc[0].tolist()
df = df.drop(index=0).reset_index(drop=True)
df.columns = header_labels
df.head()

**3. List of variables**

* We actually have 42 variables after removing duplicate variable names from the data source

In [None]:
# Unique column labels (the same label can appear on several columns)
unique_labels = set(df.columns)
col_name = list(unique_labels)

# Natural-order sort key, after @codinghorror
def natural_key(string_):
    """Split `string_` into text and number chunks for natural ("human") sorting.

    Runs of digits become ints and everything else stays a string, so that
    for example '2.Gender' sorts before '10.Something'.
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        key.append(int(chunk) if chunk.isdigit() else chunk)
    return key

# Display the unique column labels in natural (human) order
sorted(col_name, key=lambda label: natural_key(label))


# DEMOGRAPHICS AND GEOGRAPHY OF KAGGLERs

**1. Age distribution of Kaggler**

* In the Kaggle survey, most respondents are younger than 40
* Together, these age groups make up approximately 80% of respondents

In [None]:
# Age distribution: one row per age bucket with the number of respondents.
#   .value_counts()             -> respondents per age bucket
#   .rename_axis('Age')         -> name the bucket labels explicitly
#   .reset_index(name='Count')  -> back to a two-column DataFrame
# Naming the columns explicitly (instead of renaming 'index' / '1.Age') keeps the
# result identical across pandas versions, which label value_counts() output differently.
df_age = (
    df['1.Age']
    .value_counts()
    .rename_axis('Age')
    .reset_index(name='Count')
    .sort_values(by='Age', ascending=True)
)

# Share of all answered responses, formatted as a label such as '18.7%'
df_age['Percent'] = (df_age['Count'] / df_age['Count'].sum() * 100).round(1).astype(str) + '%'



# One colour per age bucket: six highlighted shades, then the base purple for the rest
colors = ['#872CA2', '#C0369D', '#EA4F88', '#FA7876', '#F6A97A', '#C0369D'] + ['#4B2991'] * 5

fig_age = go.Figure(go.Bar(
    x=df_age['Age'],
    y=df_age['Count'],
    text=df_age['Percent'],
    marker_color=colors,
    cliponaxis=False,
))

fig_age.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    textfont_size=14,
    textfont_color='#6b6a6a',
    # The bar height (%{value}) is the respondent count; the percentage is carried
    # by `text`, so the hover labels each number for what it is.
    hovertemplate='<b>%{label} </b> <br> People: %{value} <br> Percent: %{text}',
)

fig_age.update_layout(
    title_text="<b>Age | </b> Total - Man - Woman",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    showlegend=False,
    yaxis_title=None,
    xaxis_title=None,
    yaxis={'showticklabels': False},
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
    height=550,
)

fig_age.update_xaxes(showgrid=False)
fig_age.update_yaxes(showgrid=False)

# Footer credit, positioned in paper coordinates below the plot area
fig_age.add_annotation(
    dict(
        font=dict(size=14),
        x=0,
        y=-0.15,
        showarrow=False,
        text='<b>@TrongDV | </b> Kaggle Survey 2021',
        xanchor='left',
        xref="paper",
        yref="paper",
    )
)
iplot(fig_age)


#list_age = ['18-21','22-24','25-29','30-34','35-39']
#most_age_range = (df_age[(df_age['Age'].isin(list_age))][['Count']].sum()/df_age['Count'].sum())*100
#print(most_age_range)


**2. Gender & Age distribution**

* Men make up ~79% of respondents, compared to ~19% for women.

* **Men**: the top 3 age ranges are 18-21, 22-24 and 25-29
* **Women**: the top 3 age ranges are the same as for men


In [None]:
# Gender distribution: one row per answer with its respondent count, smallest first.
# Columns are named explicitly (rename_axis / reset_index(name=...)) so the result
# does not depend on how the installed pandas version labels value_counts() output.
df_gender = (
    df['2.Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=True)
)

# Share of all answered responses, formatted as a label such as '79.31%'
df_gender['Percent'] = (df_gender['Count'] / df_gender['Count'].sum() * 100).round(2).apply(str) + '%'

# Keep only respondents who answered 'Man' or 'Woman', with their age bucket
is_man_or_woman = df['2.Gender'].isin(['Man', 'Woman'])
df_12 = df.loc[is_man_or_woman, ['1.Age', '2.Gender']]

# 1-based id taken from the original row position, then a fresh 0..n-1 index
df_12 = df_12.assign(Id=df_12.index + 1).reset_index(drop=True)

# Number of respondents for every (gender, age) combination, as a flat DataFrame
df_ga = (
    df_12
    .groupby(['2.Gender', '1.Age'])['Id']
    .size()
    .reset_index(name='People')
    .rename(columns={'2.Gender': 'Gender', '1.Age': 'Age'})
)

# Numeric share per gender. Bars must be sized from numbers: passing the formatted
# 'Percent' strings as y would make plotly treat the axis as categorical, so the
# bar heights would not be proportional to the shares.
gender_share = (df_gender['Count'] / df_gender['Count'].sum() * 100).round(2)

fig_gender = make_subplots(rows=1, cols=2)

# Right-hand panel: share of each gender
fig_gender.add_trace(
    go.Bar(
        x=df_gender['Gender'],
        y=gender_share,
        text=df_gender['Percent'],
        marker=dict(color=px.colors.sequential.Agsunset),
        hovertemplate='<b>%{label} </b> <br> Percent: %{value}%',
    ),
    row=1, col=2,
)

# Applied before the sunburst is added, so it only styles the bar trace
fig_gender.update_traces(texttemplate='%{text}', textposition='outside', textfont_size=14, textfont_color='#6b6a6a')

fig_gender.update_layout(
    title_text="<b>Gender & Age | </b> Man vs Woman",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    showlegend=False,
    yaxis_title=None,
    xaxis_title=None,
    yaxis={'showticklabels': False},
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
)

fig_gender.update_xaxes(showgrid=False)
fig_gender.update_yaxes(showgrid=False, visible=False)

# Let plotly express compute the sunburst hierarchy (ids / parents / values), then
# re-use those arrays in a graph_objects trace that can be placed next to the bars.
fig_ga = px.sunburst(df_ga, path=['Gender', 'Age'], values='People', color_discrete_sequence=px.colors.sequential.Agsunset)
sunburst_src = fig_ga['data'][0]
sunburst_parents = np.asarray(sunburst_src['parents']).tolist()

# Colour only the top-level sectors (those without a parent); '' leaves the others
# on plotly's default colours. Deriving the list from the hierarchy keeps it the
# right length even if an age bucket is missing for one gender.
root_colors = iter(['#694EA1', '#C51162'])
colorsunburst = [next(root_colors, '') if parent == '' else '' for parent in sunburst_parents]

# Left-hand panel: no row/col given, the trace is positioned through the layout grid below
fig_gender.add_trace(go.Sunburst(
    labels=np.asarray(sunburst_src['ids']).tolist(),
    parents=sunburst_parents,
    values=np.asarray(sunburst_src['values']).tolist(),
    branchvalues=sunburst_src['branchvalues'],
    textinfo='label+percent entry',
    hovertemplate='<b>%{label} </b> <br> People: %{value}<br>',
    hoverlabel_bgcolor='#512DA8',
    marker=dict(colors=colorsunburst),
))

fig_gender.update_layout(
    grid=dict(columns=2, rows=1),
    margin=dict(t=100, l=100, r=100, b=50),
    height=600,
)

# Footer credit, positioned in paper coordinates below the plot area
fig_gender.add_annotation(
    dict(
        font=dict(size=14),
        x=0,
        y=-0.2,
        showarrow=False,
        text='<b>@TrongDV | </b> Kaggle Survey 2021',
        xanchor='left',
        xref="paper",
        yref="paper",
    )
)

iplot(fig_gender)

**3. Top 7 countries have the largest sample in data survey**

* These rankings are the same for the whole sample and for the men sample
* For the women sample, the top 2 countries are the same as in the whole sample, but the ranking changes from the 3rd country onwards.

In [None]:
# Short display names for the long official country names
COUNTRY_ABBREVIATIONS = {
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'United Arab Emirates': 'UAE',
}


def _top_countries(responses, top_n=7):
    """Return the `top_n` most frequent countries in `responses`.

    `responses` is a DataFrame with a '3.Country' column. The result has the
    columns 'Country' (abbreviated where an abbreviation is defined) and
    'Count', ordered from the most to the least frequent country.
    """
    top = (
        responses['3.Country']
        .value_counts()
        # explicit column names: independent of the pandas version's default labels
        .rename_axis('Country')
        .reset_index(name='Count')
        .head(top_n)
    )
    top['Country'] = top['Country'].replace(COUNTRY_ABBREVIATIONS)
    return top


# The catch-all answer 'Other' is excluded from every ranking
not_other = df['3.Country'] != 'Other'

# total
df_country = df.loc[not_other, ['3.Country']]
df_country_total = _top_countries(df_country)

# for man
df_country_m = _top_countries(df.loc[(df['2.Gender'] == 'Man') & not_other, ['3.Country']])

# for woman
df_country_w = _top_countries(df.loc[(df['2.Gender'] == 'Woman') & not_other, ['3.Country']])
    

# Colours shared by the three funnel-area charts
funnel_colors = ['#4B2991', '#872CA2', '#C0369D', '#EA4F88', '#FA7876','#F6A97A','#C0369D']

# (data, heading, label font size, explicit domain or None when no domain is set)
funnel_panels = (
    (df_country_m, "<b>Man", 20, {"x": [0, 0.4], "y": [0, 0.8]}),
    (df_country_w, "<b>Woman", 20, {"x": [0.6, 1], "y": [0, 0.8]}),
    (df_country_total, "<b>Total", 25, None),
)

fig_country = make_subplots(rows=1, cols=3)

for panel_data, heading, label_size, panel_domain in funnel_panels:
    panel_options = dict(
        values=panel_data['Count'],
        text=panel_data['Country'],
        marker={'colors': list(funnel_colors)},
        title={"text": heading},
        textfont={"family": "Lato", "size": label_size, "color": "#ffffff"},
        opacity=0.8,
    )
    if panel_domain is not None:
        panel_options['domain'] = panel_domain
    fig_country.add_trace(go.Funnelarea(**panel_options))

country_layout = dict(
    title_text="<b>Top 7 Country</b> (exclude other) |  Men - Women - Total",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
    height=550,
)
fig_country.update_layout(**country_layout)

# Footer credit, positioned in paper coordinates below the plot area
fig_country.add_annotation(
    font=dict(size=14),
    x=0,
    y=-0.15,
    showarrow=False,
    text='<b>@TrongDV | </b> Kaggle Survey 2021',
    xanchor='left',
    xref="paper",
    yref="paper",
)

iplot(fig_country)

**4. Density of kaggler in data sample**

As the figure shows, India still has the largest sample in Asia. The USA, in North America, comes second.

In [None]:
# Country names rewritten to the spelling used for the choropleth ('country names' location mode)
MAP_COUNTRY_NAMES = {
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Hong Kong (S.A.R.)': 'Hong Kong',
    'Iran, Islamic Republic of...': 'Iran',
}

# All respondents except the catch-all answer 'Other'.
# .copy() makes the column assignment below act on an independent frame
# rather than on a slice of df.
df_country_map = df.loc[df['3.Country'] != 'Other', ['3.Country']].copy()
df_country_map['3.Country'] = df_country_map['3.Country'].replace(MAP_COUNTRY_NAMES)

# Respondents per country. Columns are named explicitly so the result does not
# depend on how the installed pandas version labels value_counts() output.
df_country_map2 = (
    df_country_map['3.Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='People')
)

# World map coloured by the number of respondents per country
choropleth_trace = go.Choropleth(
    locations=df_country_map2['Country'],
    z=df_country_map2['People'],
    text=df_country_map2['Country'],
    locationmode='country names',
    colorscale='Agsunset',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='#e3e3e3',
    marker_line_width=0.1,
    marker_opacity=0.8,
    colorbar_title='People',
    colorbar_showticklabels=False,
)
fig_map = go.Figure(data=choropleth_trace)

map_layout = dict(
    title_text="<b>Kagglers | </b> Distribution",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    showlegend=False,
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
    font=dict(family="Lato", size=16, color='#474747'),
    width=800,
    height=550,
)
fig_map.update_layout(**map_layout)

# Footer credit, positioned in paper coordinates below the plot area
fig_map.add_annotation(
    font=dict(size=14),
    x=0,
    y=-0.15,
    showarrow=False,
    text='<b>@TrongDV | </b> Kaggle Survey 2021',
    xanchor='left',
    xref="paper",
    yref="paper",
)
iplot(fig_map)

**5. Education**

* In the figure below, the most common education level is a master's degree: men with a master's degree make up 30.8% of the sample, compared to 7.5% for women.

* The second is a bachelor's degree, with slightly lower shares: 30.5% for men and 7.1% for women.


In [None]:
# Abbreviations used as axis labels
EDU_LABELS = {
    'Master’s degree': 'Master',
    'Bachelor’s degree': 'Bachelor',
    'Doctoral degree': 'Doctorate',
    'No formal education past high school': 'No-formal Ed',
    'Professional doctorate': 'Pro-Doctorate',
    'Some college/university study without earning a bachelor’s degree': 'Self-study',
    'I prefer not to answer': 'No answer',
}

# Gender masks; 'other' is everyone who is neither 'Man' nor 'Woman' (including no answer)
edu_is_man = df['2.Gender'] == 'Man'
edu_is_woman = df['2.Gender'] == 'Woman'
edu_is_other = ~(edu_is_man | edu_is_woman)

# Overall counts per education level, smallest first. This order is the single
# reference order for every frame below.
edu_counts = df['4.Education'].value_counts().sort_values(ascending=True)
edu_answers = edu_counts.sum()


def _education_breakdown(answers):
    """Count `answers` per education level, aligned to the overall level order.

    `answers` is a Series of '4.Education' values. The result has one row per
    education level in the same order for every subgroup (levels absent from the
    subgroup get a count of 0), with the columns:
      Education - abbreviated level name
      Count     - respondents in this subgroup
      Share     - Count as a percentage of ALL answered responses (numeric)
      Percent   - Share formatted as a label such as '30.8%'
    """
    counts = answers.value_counts().reindex(edu_counts.index, fill_value=0)
    frame = counts.rename_axis('Education').reset_index(name='Count')
    frame['Share'] = (frame['Count'] / edu_answers * 100).round(1)
    frame['Percent'] = frame['Share'].astype(str) + '%'
    frame['Education'] = frame['Education'].replace(EDU_LABELS)
    return frame


# education by gender
df_edu = _education_breakdown(df['4.Education'])
df_edu_m = _education_breakdown(df.loc[edu_is_man, '4.Education'])
df_edu_w = _education_breakdown(df.loc[edu_is_woman, '4.Education'])
df_edu_o = _education_breakdown(df.loc[edu_is_other, '4.Education'])

# Stacked horizontal bars. Each trace takes its labels and values from the SAME
# frame, and x is numeric, so segments are correctly labelled and proportional.
fig_edu = go.Figure()
for edu_frame, gender_label, bar_color in (
    (df_edu_o, 'Other', '#50456E'),
    (df_edu_m, 'Man', '#694EA1'),
    (df_edu_w, 'Woman', '#C0369D'),
):
    fig_edu.add_trace(go.Bar(
        y=edu_frame['Education'],
        x=edu_frame['Share'],
        name=gender_label,
        orientation='h',
        marker=dict(color=bar_color),
        text=edu_frame['Percent'],
        opacity=0.85,
    ))

fig_edu.update_traces(
    texttemplate='%{text}',
    textposition='inside',
    textfont_size=14,
    textfont_color='#ffffff',
    hovertemplate='<b>%{label} </b> <br> Percent: %{value}%',
)

fig_edu.update_layout(
    barmode='stack',
    legend_x=0.3,
    legend_y=1.15,
    xaxis={'showticklabels': False},
    legend_orientation='h',
)

fig_edu.update_xaxes(showgrid=False)
fig_edu.update_yaxes(showgrid=False)

fig_edu.update_layout(
    title_text="<b>Education</b> |  Men - Women",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
)

# Footer credit, positioned in paper coordinates below the plot area
fig_edu.add_annotation(
    dict(
        font=dict(size=14),
        x=0,
        y=-0.2,
        showarrow=False,
        text='<b>@TrongDV | </b> Kaggle Survey 2021',
        xanchor='left',
        xref="paper",
        yref="paper",
    )
)

iplot(fig_edu)

**6. Relationship between education and age group and income**

* Apparently, master's degree holders have higher salaries than bachelor's degree holders, and the distribution shows this clearly.

* Besides that, the older the respondent, the higher the salary.

In [None]:
# Abbreviations for the education levels
EDU_AGE_LABELS = {
    'Master’s degree': 'Master',
    'Bachelor’s degree': 'Bachelor',
    'Doctoral degree': 'Doctorate',
    'No formal education past high school': 'No-formal Ed',
    'Professional doctorate': 'Pro-Doctorate',
    'Some college/university study without earning a bachelor’s degree': 'Self-study',
    'I prefer not to answer': 'No answer',
}

df_edu_age_y = df.loc[:, ['1.Age', '25.Yearly income', '4.Education']]

# Parse the income bracket text (e.g. '$0-999', '1,000-1,999', '>$1,000,000') into
# numeric lower / upper bounds. Missing answers become NaN in both columns.
income_bounds = (
    df['25.Yearly income']
    .astype(str)
    .str.replace(r'[,$>]', '', regex=True)
    .str.split('-', n=1, expand=True)
    .reindex(columns=[0, 1])            # always two columns, even if no value contains '-'
    .apply(pd.to_numeric, errors='coerce')
)
income_lower = income_bounds[0]
# An open-ended bracket has no upper bound; fall back to its lower bound so these
# respondents are not silently dropped from the averages.
income_upper = income_bounds[1].fillna(income_lower)

# Representative income per respondent: the bracket midpoint, except for the
# lowest bracket (lower bound 0), where the upper bound is used.
median_income = np.where(
    income_lower == 0,
    income_upper,
    income_lower / 2 + income_upper / 2,
).round(0)

df_edu_age_y = (
    df_edu_age_y
    .assign(median_income=median_income)
    .drop(columns=['25.Yearly income'])
    .rename(columns={'1.Age': 'age', '4.Education': 'education'})
)
df_edu_age_y['education'] = df_edu_age_y['education'].replace(EDU_AGE_LABELS)

# One row per (age, education): number of respondents and their average income
# (the mean ignores respondents without an income answer).
df_edu_age_y = (
    df_edu_age_y
    .groupby(['age', 'education'])
    .agg(count=('median_income', 'size'), avg_income=('median_income', 'mean'))
    .reset_index()
    .sort_values(['education', 'age'])
)

# Bubble size grows slightly faster than the respondent count
df_edu_age_y['size'] = df_edu_age_y['count'] ** 1.3

# Education levels in legend order, and the rows of df_edu_age_y belonging to each
education_name = ['Master', 'Bachelor', 'Doctorate', 'Self-study', 'No answer','No-formal Ed','Pro-Doctorate']
education_data = {
    level: df_edu_age_y[df_edu_age_y['education'] == level]
    for level in education_name
}

fig_edu_age_y = go.Figure()

# One bubble trace per education level. The loop variables have their own names so
# the `education_name` list is not overwritten, and `text` comes from the same
# subset as x / y so it lines up point by point.
for trace_label, level_rows in education_data.items():
    fig_edu_age_y.add_trace(go.Scatter(
        x=level_rows['avg_income'].round(0),
        y=level_rows['age'],
        name=trace_label,
        text=level_rows['education'],
        marker_size=level_rows['size'],
        hovertemplate=
            'Age: %{y}<br>'+
            'Yearly Income: %{x}<br>'+
            '<extra></extra>',
    ))

fig_edu_age_y.update_traces(
    mode='markers',
    marker=dict(
        sizemode='area',
        sizeref=10,
        line=dict(width=0),
    ),
    opacity=0.9,
)

fig_edu_age_y.update_layout(
    xaxis={
        'title': 'Yearly Income',
        'type': 'log'},
    yaxis={'title': 'Age'},
)

fig_edu_age_y.update_layout(legend=dict(bordercolor='rgb(100,100,100)'))

fig_edu_age_y.update_layout(
    title_text="<b>Education - Age - Income </b> |  The triangle",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
)

# Footer credit, positioned in paper coordinates below the plot area
fig_edu_age_y.add_annotation(
    dict(
        font=dict(size=14),
        x=0,
        y=-0.2,
        showarrow=False,
        text='<b>@TrongDV | </b> Kaggle Survey 2021',
        xanchor='left',
        xref="paper",
        yref="paper",
    )
)

iplot(fig_edu_age_y)

**7. Education - Continents - Income by TreeMap**

**Total**

In Asia, the top 5 countries are:
1. India
2. Japan
3. China
4. Russia
5. Pakistan

In Europe, the top 5 are:
1. United Kingdom
2. Germany
3. Spain
4. France
5. Italy

In America:
1. USA
2. Canada
3. Brazil
4. Mexico
5. Colombia

In Africa:
1. Nigeria
2. Egypt
3. Kenya
4. South Africa
5. Morocco

In Oceania:
1. Australia

**Top 35 smallest sample:**

In Asia, the top 5 countries are:
1. Singapore
2. Malaysia
3. Israel
4. Thailand
5. UAE

In Europe, the top 5 are:
1. Netherlands
2. Portugal
3. Greece
4. Ireland
5. Sweden

In Africa:
1. Tunisia
2. Ghana
3. Uganda
4. Algeria
5. Ethiopia

In America:
1. Peru
2. Chile
3. Ecuador


* Looking at the treemap, we see that high salaries are concentrated in areas such as North America, Europe and some other developed countries.

* In addition, a person holding a doctorate has a higher average salary than the others.



In [None]:
# Country -> continent lookup table (columns used here: 'Country', 'Continent')
df_continents = pd.read_csv('https://raw.githubusercontent.com/trodiva/IS_608/master/NanosatDB_munging/Countries-Continents.csv')

# Survey country names rewritten to the spelling joined against the lookup table.
# 'Hong Kong (S.A.R.)' is blanked; blank countries are filtered out of the
# largest / smallest sample rankings below.
TREEMAP_COUNTRY_NAMES = {
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Hong Kong (S.A.R.)': '',
    'Iran, Islamic Republic of...': 'Iran',
    'Viet Nam': 'Vietnam',
    'South Korea': 'Korea, South',
    'Russia': 'Russian Federation',
}

# Abbreviations for the education levels
TREEMAP_EDU_LABELS = {
    'Master’s degree': 'Master',
    'Bachelor’s degree': 'Bachelor',
    'Doctoral degree': 'Doctorate',
    'No formal education past high school': 'No-formal Ed',
    'Professional doctorate': 'Pro-Doctorate',
    'Some college/university study without earning a bachelor’s degree': 'Self-study',
    'I prefer not to answer': 'No answer',
}

survey_slice = df.loc[:, ['3.Country', '4.Education', '25.Yearly income']].copy()
survey_slice['3.Country'] = survey_slice['3.Country'].replace(TREEMAP_COUNTRY_NAMES)
survey_slice['4.Education'] = survey_slice['4.Education'].replace(TREEMAP_EDU_LABELS)

# Attach the continent to every respondent (left join keeps unmatched countries with NaN)
country_rows = (
    pd.merge(survey_slice, df_continents, how='left', left_on=['3.Country'], right_on=['Country'])
    .drop(columns='Country')   # keyword form: the positional axis argument was removed in pandas 2.0
    .rename(columns={'3.Country': 'Country', '4.Education': 'Education', '25.Yearly income': 'yearly_income'})
)

# Parse the income bracket text (e.g. '$0-999', '1,000-1,999', '>$1,000,000') into
# numeric lower / upper bounds. Missing answers become NaN in both columns.
income_bounds = (
    country_rows['yearly_income']
    .astype(str)
    .str.replace(r'[,$>]', '', regex=True)
    .str.split('-', n=1, expand=True)
    .reindex(columns=[0, 1])            # always two columns, even if no value contains '-'
    .apply(pd.to_numeric, errors='coerce')
)
country_rows['min_income'] = income_bounds[0]
# An open-ended bracket has no upper bound; fall back to its lower bound so these
# respondents are not silently dropped from the averages.
country_rows['max_income'] = income_bounds[1].fillna(income_bounds[0])

# Representative income per respondent: the bracket midpoint, except for the
# lowest bracket (lower bound 0), where the upper bound is used.
country_rows['median_income'] = np.where(
    country_rows['min_income'] == 0,
    country_rows['max_income'],
    country_rows['min_income'] / 2 + country_rows['max_income'] / 2,
).round(0)

country_rows['Id'] = country_rows.index + 1

# Respondents whose country is not blank.
# NOTE: the names say "top20" but the rankings below keep 35 countries.
top20 = country_rows[country_rows['Country'].str.strip().astype(bool)]
country_counts = top20['Country'].dropna().value_counts()

# the 35 countries with the largest / smallest samples, as lists of names
top20_largest = country_counts.sort_values(ascending=False).head(35).index.to_list()
top20_smallest = country_counts.sort_values(ascending=True).head(35).index.to_list()


def _summarise_income(rows):
    """Aggregate respondents per (Continent, Country, Education).

    Returns a flat DataFrame with the columns Continent, Country, Education,
    'count' (respondents) and 'avgIncome' (mean of median_income). Groups with
    no income answer at all get the smallest avgIncome of the table.
    """
    summary = (
        rows
        .groupby(['Continent', 'Country', 'Education'])
        .agg(count=('median_income', 'size'), avgIncome=('median_income', 'mean'))
        .reset_index()
    )
    summary['avgIncome'] = summary['avgIncome'].fillna(summary['avgIncome'].min())
    return summary


# largest samples
df_country_edu_income_l = _summarise_income(country_rows[country_rows['Country'].isin(top20_largest)])

# smallest samples
df_country_edu_income_s = _summarise_income(country_rows[country_rows['Country'].isin(top20_smallest)])

# all countries
df_country_edu_income = _summarise_income(country_rows)

def _income_treemap(summary, root_label):
    """Build a Continent > Country > Education treemap from an income summary.

    `summary` needs the columns Continent, Country, Education, 'count' and
    'avgIncome'. Tile area is the respondent count; tile colour is the average
    income, centred on the count-weighted mean income of `summary`.
    `root_label` is the text of the single root tile.
    """
    fig = px.treemap(
        summary,
        path=[px.Constant(root_label), 'Continent', 'Country', 'Education'],
        values='count',
        color='avgIncome',
        color_continuous_scale='deep',
        color_continuous_midpoint=np.average(summary['avgIncome'], weights=summary['count']),
        range_color=[999, 200000],
    )
    fig.update_layout(margin=dict(t=5, l=0, r=0, b=0), height=600)
    fig.update_traces(textinfo='label+percent parent')
    return fig


# total
fig_country_edu_income = _income_treemap(df_country_edu_income, 'Total country')
iplot(fig_country_edu_income)

# the largest sample (rendered once: iplot only, like the other two figures)
fig_country_edu_income_l = _income_treemap(df_country_edu_income_l, 'Top 35 the largest sample')
iplot(fig_country_edu_income_l)

# the smallest sample
fig_country_edu_income_s = _income_treemap(df_country_edu_income_s, 'Top 35 the smallest sample')
iplot(fig_country_edu_income_s)

**8. Role**

* Students are still the largest group in the sample.
* The second largest role is Data Scientist.
* The third is Software Engineer.

* Surprisingly, the "Other" role ranks 4th. (Maybe my own role falls into this group.)
* The 5th is Data Analyst.



In [None]:
# Survey columns used in this cell.
occupation_col = '5.Occupation'
gender_col = '2.Gender'


def _count_occupations(responses):
    """Return one row per occupation with its respondent count, smallest first.

    The output columns are named explicitly ('occupation', 'count') through
    rename_axis / reset_index(name=...), so the result does not depend on how
    the installed pandas version labels the output of value_counts().
    """
    return (
        responses[occupation_col]
        .value_counts()
        .rename_axis('occupation')
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=True)
    )


# Gender masks. "Other" is everyone who is neither 'Man' nor 'Woman'
# (this includes rows with a missing gender value).
is_man = df[gender_col] == 'Man'
is_woman = df[gender_col] == 'Woman'
is_other = ~(is_man | is_woman)

# Overall counts define both the bar order and the percentage denominator.
df_occupation = _count_occupations(df.loc[:, [occupation_col]])
occupation_order = df_occupation['occupation'].tolist()
total_respondents = df_occupation['count'].sum()


def _gender_breakdown(mask):
    """Count occupations for the rows selected by ``mask``, aligned to the overall order.

    Returns a frame with one row per occupation in ``occupation_order``
    (occupations absent from the subset get a count of 0) and the columns:
    'occupation', 'count', 'share' (numeric percent of all respondents, used
    as the bar length) and 'percent' (the same value formatted as text, used
    as the bar label).
    """
    breakdown = (
        _count_occupations(df.loc[mask, [occupation_col]])
        .set_index('occupation')
        .reindex(occupation_order, fill_value=0)
        .rename_axis('occupation')
        .reset_index()
    )
    breakdown['share'] = (breakdown['count'] / total_respondents * 100).round(1)
    breakdown['percent'] = breakdown['share'].astype('str') + '%'
    return breakdown


df_occupation_m = _gender_breakdown(is_man)
df_occupation_w = _gender_breakdown(is_woman)
df_occupation_o = _gender_breakdown(is_other)

# One stacked horizontal bar per occupation; every trace reads its own frame
# and all frames share the same occupation order, so segments line up.
fig_occupation = go.Figure()
for trace_name, breakdown, bar_color in (
    ('Other', df_occupation_o, '#50456E'),
    ('Man', df_occupation_m, '#694EA1'),
    ('Woman', df_occupation_w, '#C0369D'),
):
    fig_occupation.add_trace(go.Bar(
        y=breakdown['occupation'],
        x=breakdown['share'],
        name=trace_name,
        orientation='h',
        marker=dict(color=bar_color),
        text=breakdown['percent'],
        opacity=0.85,
    ))

fig_occupation.update_traces(
    texttemplate='%{text}',
    textposition='inside',
    textfont_size=14,
    textfont_color='#ffffff',
    hovertemplate='<b>%{label} </b> <br> Percent: %{value}%',
)

fig_occupation.update_layout(
    barmode='stack',
    legend_x=0.3,
    legend_y=1.1,
    legend_orientation='h',
    xaxis={'showticklabels': False},
)

fig_occupation.update_xaxes(showgrid=False)
fig_occupation.update_yaxes(showgrid=False)

fig_occupation.update_layout(
    title_text="<b>Role</b> |  Men - Women - others",
    title_font_size=30,
    title_font_color='#474747',
    title_x=0.5,
    plot_bgcolor='#e0e0e0',
    paper_bgcolor='#e0e0e0',
    font=dict(family="Lato", size=16, color='#474747'),
    height=600,
)

# Author credit, anchored below the plot area in paper coordinates.
fig_occupation.add_annotation(
    dict(
        font=dict(size=14),
        x=0,
        y=-0.15,
        showarrow=False,
        text='<b>@TrongDV | </b> Kaggle Survey 2021',
        xanchor='left',
        xref="paper",
        yref="paper",
    )
)

iplot(fig_occupation)

**9. Salaries classified by yearly income and countries.**

The roles with the highest salaries are listed below:

* Product Manager / Project Manager
* Research Scientist
* ML engineer
* Data scientist
* Data analyst

In [None]:
df_continents = pd.read_csv('https://raw.githubusercontent.com/trodiva/IS_608/master/NanosatDB_munging/Countries-Continents.csv')

# Work on an explicit copy so the column assignments below never write into a view of df.
df_country_occ_income = df.loc[:, ['3.Country', '5.Occupation', '25.Yearly income']].copy()

# Rename survey country labels (whole-value replacement) before joining on the
# continent lookup table.
# NOTE(review): the first pair maps a label to itself and is therefore a no-op;
# confirm which spelling the lookup table uses for the United States.
country_renames = (
    ('United States of America', 'United States of America'),
    ('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom'),
    ('Hong Kong (S.A.R.)', ''),
    ('Iran, Islamic Republic of...', 'Iran'),
    ('Viet Nam', 'Vietnam'),
    ('South Korea', 'Korea, South'),
    ('Russia', 'Russian Federation'),
)
for old_label, new_label in country_renames:
    df_country_occ_income['3.Country'] = df_country_occ_income['3.Country'].replace(old_label, new_label)

# Left join keeps every respondent; countries missing from the lookup get a NaN continent.
df_country_occ_income = (
    pd.merge(df_country_occ_income, df_continents, how='left', left_on=['3.Country'], right_on=['Country'])
    .drop(columns='Country')
    .rename(columns={'3.Country': 'Country', '5.Occupation': 'occupation', '25.Yearly income': 'yearly_income'})
)

# Split the income bracket text into numeric lower / upper bounds.
# Values that cannot be parsed (missing answers, or a bracket without a "-")
# become NaN through errors='coerce'.
income_bounds = (
    df_country_occ_income['yearly_income']
    .astype(str)
    .str.replace(r'[,$>]', '', regex=True)
    .str.split('-', n=1, expand=True)
    .apply(pd.to_numeric, errors='coerce')
)
df_country_occ_income['min_income'] = income_bounds[0]
df_country_occ_income['max_income'] = income_bounds[1]

# Bracket midpoint; when the lower bound is 0 the upper bound is used instead.
# NOTE(review): a bracket with no upper bound yields NaN here and is ignored by
# the mean below - confirm that this is intended.
df_country_occ_income['median_income'] = (
    np.where(
        df_country_occ_income['min_income'] == 0,
        df_country_occ_income['max_income'],
        df_country_occ_income['min_income'] / 2 + df_country_occ_income['max_income'] / 2,
    )
    .round(0)
)

# Row identifier, counted per group in the summaries below.
df_country_occ_income['Id'] = df_country_occ_income.index + 1

# Respondents with a non-empty country label.
top20 = df_country_occ_income[df_country_occ_income['Country'].str.strip().astype(bool)]

# The 35 countries with the most / fewest respondents (the variable names keep
# their historical "top20" prefix).
sample_size = 35
country_counts = top20['Country'].dropna().value_counts()
top20_largest = country_counts.nlargest(sample_size).index.to_list()
top20_smallest = country_counts.nsmallest(sample_size).index.to_list()


def _occ_income_summary(responses):
    """Aggregate respondents per (Continent, Country, occupation).

    Returns a flat frame with the grouping columns plus 'count' (number of
    respondents) and 'avgIncome' (mean of 'median_income'). Missing values are
    filled with the smallest 'avgIncome' in the summary.
    """
    summary = (
        pd.pivot_table(
            responses[['Country', 'occupation', 'Continent', 'median_income', 'Id']],
            index=['Continent', 'Country', 'occupation'],
            aggfunc={'Id': 'count', 'median_income': 'mean'},
        )
        .rename(columns={'Id': 'count', 'median_income': 'avgIncome'})
        .reset_index()
    )
    return summary.fillna(summary['avgIncome'].min())


def _occ_income_treemap(summary, root_label):
    """Build a Continent > Country > occupation treemap coloured by average income."""
    fig = px.treemap(
        summary,
        path=[px.Constant(root_label), 'Continent', 'Country', 'occupation'],
        values='count',
        color='avgIncome',
        color_continuous_scale='deep',
        # Centre the colour scale on the respondent-weighted mean income.
        color_continuous_midpoint=np.average(summary['avgIncome'], weights=summary['count']),
        range_color=[999, 200000],
    )
    fig.update_layout(margin=dict(t=5, l=0, r=0, b=0), height=600)
    fig.update_traces(textinfo='label+percent parent')
    return fig


# The subset summaries are computed first, because the row-level frame is
# replaced by its own summary right after.
df_country_occ_income_l = _occ_income_summary(
    df_country_occ_income[df_country_occ_income['Country'].isin(top20_largest)]
)
df_country_occ_income_s = _occ_income_summary(
    df_country_occ_income[df_country_occ_income['Country'].isin(top20_smallest)]
)
df_country_occ_income = _occ_income_summary(df_country_occ_income)

# total
fig_country_occ_income = _occ_income_treemap(df_country_occ_income, 'Total country')
iplot(fig_country_occ_income)

# the largest sample
fig_country_occ_income_l = _occ_income_treemap(df_country_occ_income_l, 'Top 35 the largest sample')
iplot(fig_country_occ_income_l)

# the smallest sample
fig_country_occ_income_s = _occ_income_treemap(df_country_occ_income_s, 'Top 35 the smallest sample')
iplot(fig_country_occ_income_s)


**10. Gender distribution by continent and years of experience**


In [None]:
df_continents = pd.read_csv('https://raw.githubusercontent.com/trodiva/IS_608/master/NanosatDB_munging/Countries-Continents.csv')

# Work on an explicit copy so the column assignments below never write into a view of df.
df_continent_gen_yoe = df.loc[:, ['2.Gender', '3.Country', '6.YoE']].copy()

# Rename survey country labels (whole-value replacement) before joining on the
# continent lookup table.
# NOTE(review): the first pair maps a label to itself and is therefore a no-op;
# confirm which spelling the lookup table uses for the United States.
country_renames = (
    ('United States of America', 'United States of America'),
    ('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom'),
    ('Hong Kong (S.A.R.)', ''),
    ('Iran, Islamic Republic of...', 'Iran'),
    ('Viet Nam', 'Vietnam'),
    ('South Korea', 'Korea, South'),
    ('Russia', 'Russian Federation'),
)
for old_label, new_label in country_renames:
    df_continent_gen_yoe['3.Country'] = df_continent_gen_yoe['3.Country'].replace(old_label, new_label)

# Collapse the less frequent gender answers into a single 'Other' bucket.
for gender_label in ('Prefer not to say', 'Nonbinary', 'Prefer to self-describe'):
    df_continent_gen_yoe['2.Gender'] = df_continent_gen_yoe['2.Gender'].replace(gender_label, 'Other')

# Attach the continent, then drop both country columns: only the continent is
# used from here on.
df_continent_gen_yoe = (
    pd.merge(df_continent_gen_yoe, df_continents, how='left', left_on=['3.Country'], right_on=['Country'])
    .drop(columns=['3.Country', 'Country'])
    .rename(columns={'2.Gender': 'gender', '6.YoE': 'yoe'})
)

# Number of respondents per (Continent, yoe, gender); rows with a missing key
# are left out by the grouping.
df_continent_gen_yoe = (
    df_continent_gen_yoe
    .groupby(['Continent', 'yoe', 'gender'])
    .size()
    .reset_index(name='count')
)

# total
fig_continent_gen_yoe = px.treemap(
    df_continent_gen_yoe,
    path=[px.Constant('World'), 'Continent', 'yoe', 'gender'],
    values='count',
    color_discrete_sequence=px.colors.diverging.Tealrose,
)
fig_continent_gen_yoe.update_layout(margin=dict(t=5, l=0, r=0, b=0), height=600)
fig_continent_gen_yoe.update_traces(textinfo='label+percent parent')
iplot(fig_continent_gen_yoe)


# Updating...