In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This dataset makes a great project for improving your matplotlib and pandas skills!

Here I present only a few ways to draw nice-looking plots, which I learned from other Kagglers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec

from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the survey responses; the path is relative to the notebook's working directory.
survey_csv = '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
data = pd.read_csv(survey_csv)

Check the first two rows

In [None]:
# Show the first two rows to see what the file looks like.
data.iloc[:2]

It turns out that the first row holds the question text rather than a response, so it is not data we need; we keep only the rows after it.

In [None]:
# Remove the question-text row, which read_csv loaded under index label 0.
# Dropping by label (and ignoring a missing label) keeps this cell idempotent:
# re-running it does not remove a real respondent, unlike slicing with iloc[1:].
data = data.drop(index=0, errors='ignore')

# Q1(age) and Q2(gender) distribution

If we just want a quick look at the distribution, we can simply use the one-liner below.

In [None]:
# Quick look: number of respondents per age bucket as a horizontal bar chart.
age_counts = data['Q1'].value_counts().sort_index()
age_counts.plot.barh()

If we want a plot that is polished enough for a presentation, we need to do some more work.

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Respondents per age bucket, ordered by bucket label.
dataQ1 = data.Q1.value_counts().sort_index()

ax.bar(dataQ1.index, dataQ1, width=0.55,
       edgecolor='darkgray', color='#d4dddd',
       linewidth=0.7)

for i in dataQ1.index:
    # va/ha='center' anchor the text at its own centre.
    # ha (horizontal) takes right/left/center; va (vertical) takes top/center/baseline/bottom.
    # NOTE(review): alpha is the bar's share of all respondents, so every label
    # is drawn quite faintly -- confirm this is the intended look.
    ax.annotate(dataQ1[i], xy=(i, dataQ1[i]+100),
               va='center', ha='center',
               fontweight='bold', fontfamily='serif', fontsize=14, alpha=dataQ1[i]/dataQ1.sum(),
               color='black')
for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Pin the tick positions before assigning labels: set_yticklabels on its own
# just relabels whatever ticks the automatic locator happens to pick, so the
# text could stop matching the real values.
y_ticks = np.arange(0, 4500, 500)
ax.set_yticks(y_ticks)
ax.set_ylim(0, 4200)
ax.set_xticklabels(dataQ1.index, fontfamily='serif', fontsize=14)
ax.set_yticklabels(y_ticks, fontfamily='serif', fontsize=14)

fig.text(0.1, 0.95, 'Age Distribution', fontfamily='serif', fontsize=16, fontweight='bold')

ax.grid(axis='y', linestyle='-', alpha=0.4)
plt.show()

Before we plot, we sometimes need to preprocess the data.

In [None]:
# List the distinct answers given to the gender question.
data['Q2'].unique()

As we can see, there are some answers other than 'Man' and 'Woman', so we relabel all of them uniformly as 'Unlabel'.

In [None]:
# Keep 'Man' and 'Woman' untouched; every other answer (missing values included)
# is collapsed into the single label 'Unlabel'.
is_man_or_woman = data['Q2'].isin(['Man', 'Woman'])
data['Q2'] = data['Q2'].where(is_man_or_woman, 'Unlabel')
data['Q2'].unique()

In [None]:
# Age bucket (rows) x gender (columns) counts, restricted to 'Man' / 'Woman' answers.
gendered = data.loc[data['Q2'] != 'Unlabel']
dataQ1Q2 = gendered.groupby('Q1')['Q2'].value_counts().unstack().sort_index()

# Row total and each gender's share of it, rounded to two decimals.
dataQ1Q2['sum'] = dataQ1Q2['Man'] + dataQ1Q2['Woman']
for gender in ('Man', 'Woman'):
    dataQ1Q2[f'Per_{gender}'] = (dataQ1Q2[gender] / dataQ1Q2['sum']).round(2)
dataQ1Q2

In [None]:
# Short aliases used by the age/gender plot below.
man, woman = dataQ1Q2['Man'], dataQ1Q2['Woman']
# Shares expressed as percentages (0-100) instead of fractions (0-1).
man_per = dataQ1Q2['Per_Man'] * 100
woman_per = dataQ1Q2['Per_Woman'] * 100
indexQ1Q2 = dataQ1Q2.index

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Mirrored bars: men are drawn upward from zero, women downward.
ax.bar(indexQ1Q2, man, width=0.55, color='#004c70', alpha=0.8, label='Male')
ax.bar(indexQ1Q2, -woman, width=0.55, color='#990000', alpha=0.8, label='Female')
ax.set_ylim(-1200, 3500)

# Shared text style for the raw-count labels.
count_style = dict(va='center', ha='center',
                   fontweight='light', fontsize=14, fontfamily='serif',
                   color='#4a4a4a')

for i in dataQ1Q2.index:
    # Raw counts just beyond the tip of each bar.
    ax.annotate(man[i], xy=(i, man[i]+100), **count_style)
    ax.annotate(woman[i], xy=(i, -woman[i]-150), **count_style)

    # Male share, centred inside the male bar. Format explicitly: man_per is a
    # rounded fraction multiplied by 100, so str() can print float artefacts
    # such as '56.99999999999999%'.
    ax.annotate(f'{man_per[i]:.0f}%', xy=(i, man[i]/2),
               va='center', ha='center',
               fontweight='light', fontsize=10, fontfamily='serif',
               color='white')

for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)

ax.set_xticklabels(indexQ1Q2, fontsize=14, fontfamily='serif')
ax.set_yticks([])
legend = ax.legend(loc='upper right', fontsize=14, frameon=False, title='Sex')
legend.get_title().set_fontsize(fontsize=20)  # change the font size of the legend title ('Sex')

fig.text(0.1, 0.95, "Age/Gender distribution", fontsize=20, fontfamily='serif', fontweight='bold')
plt.show()

# Q2(gender) and Q3(country)

In [None]:
# Country (rows) x gender (columns) counts, without the catch-all 'Other' country.
# fill_value=0 keeps a country with no respondents of some gender at 0 instead of
# NaN, so the shares and bar widths computed below stay numeric.
dataQ2Q3 = (
    data[['Q2', 'Q3']]
    .groupby('Q3').Q2.value_counts()
    .unstack(fill_value=0)
    .drop('Other')
)
dataQ2Q3['sum'] = dataQ2Q3.sum(axis=1)

# Keep the ten countries with the most respondents and shorten two long names.
dataQ2Q3 = dataQ2Q3.sort_values('sum', ascending=False).head(10).rename(
    index={'United States of America':'USA', 
           'United Kingdom of Great Britain and Northern Ireland': 'UK'})

# Each gender's share of the country's respondents (fractions between 0 and 1).
dataQ2Q3['per_man'] = dataQ2Q3.Man / dataQ2Q3['sum']
dataQ2Q3['per_woman'] = dataQ2Q3.Woman / dataQ2Q3['sum']
dataQ2Q3['per_unlabel'] = dataQ2Q3.Unlabel / dataQ2Q3['sum']

# Ascending order, so the largest country is the last row of the table.
dataQ2Q3 = dataQ2Q3.sort_values('sum', ascending=True)
dataQ2Q3

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# 100% stacked horizontal bars: man | woman | unlabel, each starting where the previous ends.
ax.barh(dataQ2Q3.index, dataQ2Q3['per_man'], label='Man',
        color='#004c70', alpha=0.7)
ax.barh(dataQ2Q3.index, dataQ2Q3['per_woman'], left=dataQ2Q3['per_man'], label='Woman',
       color='#990000', alpha=0.7)
ax.barh(dataQ2Q3.index, dataQ2Q3['per_unlabel'], 
        left=dataQ2Q3['per_man']+dataQ2Q3['per_woman'], label='Unlabel',
       color='#4a4a4a', alpha=0.7)

ax.set_xticks([])
ax.set_yticklabels(dataQ2Q3.index, fontsize=14, fontfamily='serif')

# Shared text style for the percentage labels inside the bars.
share_style = dict(va='center', ha='center', fontsize=9,
                   fontweight='light', fontfamily='serif', color='white')

for country in dataQ2Q3.index:
    man_share = dataQ2Q3.loc[country, 'per_man']
    woman_share = dataQ2Q3.loc[country, 'per_woman']
    # Fixed one-decimal formatting: the previous ':.3' meant three significant
    # digits, which gave uneven decimals (7.85 vs 78.5) and '1e+02' for 100.
    ax.annotate(f"{man_share*100:.1f}%",
                xy=(man_share/2, country), **share_style)
    ax.annotate(f"{woman_share*100:.1f}%",
                xy=(man_share + woman_share/2, country), **share_style)


for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)

fig.text(0.13, 0.95, 'Top10 Country : Gender Distribution', fontsize=15, fontweight='bold', fontfamily='serif')   
fig.text(0.131, 0.91, 'Percent Stacked Bar Chart', fontsize=12,fontfamily='serif')      

ax.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.06), fontsize=14)

plt.show()

# Q4(education)

In [None]:
# Display order for the education answers used by the plots below.
q4_order = [
    'I prefer not to answer',
    'No formal education past high school',
    'Professional degree',
    'Some college/university study without earning a bachelor’s degree',
    'Bachelor’s degree',
    'Master’s degree',
    'Doctoral degree',
]

# Respondents per education level, arranged in the order above.
dataQ4 = data['Q4'].value_counts().loc[q4_order]
dataQ4

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

ax.bar(dataQ4.index, dataQ4, width=0.55,
      color=sns.color_palette('husl', 7), linewidth=0.7)

# Iterate by position with enumerate instead of dataQ4[i]: integer keys on a
# string-labelled Series rely on positional fallback, which pandas has deprecated.
total = dataQ4.sum()
for pos, count in enumerate(dataQ4):
    # Raw count just above the bar.
    ax.annotate(f'{count}',
               xy=(pos, count+150),
               va='center', ha='center',
               fontsize=14, fontfamily='serif',
               color='gray')
    # Share of all answers, one line higher. Fixed one-decimal formatting
    # replaces ':0.3', which meant three significant digits.
    ax.text(pos, count+550, f'{count/total*100:.1f}%', 
            va='center', ha='center',
           fontsize=14, fontfamily='serif',
           color='pink')
    
    
for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)

ax.grid(axis='y', alpha=0.3)

ax.set_xticklabels(dataQ4.index, fontfamily='serif', fontsize=14, rotation=90)
ax.set_ylim(0, 9000)
fig.text(0.1, 0.95, 'Education Distribution', fontfamily='serif', fontsize=14, fontweight='bold')

plt.show()

# Q6(experience in writing code)

In [None]:
# Display order for the coding-experience answers.
q6_order = [
    'I have never written code',
    '< 1 years',
    '1-2 years',
    '3-5 years',
    '5-10 years',
    '10-20 years',
    '20+ years',
]

# Respondents per experience bucket, arranged in the order above.
data_q6 = data['Q6'].value_counts().loc[q6_order]

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.bar(data_q6.index, data_q6,
       width=0.55, linewidth=0.6,
       edgecolor='darkgray', color=sns.color_palette("Blues", 7))

# Write each count slightly above its bar.
label_style = dict(va='center', ha='center', fontweight='light',
                   fontfamily='serif', color='#4a4a4a')
for bucket, count in data_q6.items():
    ax.annotate(f"{count}", xy=(bucket, count + 100), **label_style)

# Keep only the bottom axis line.
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)

ax.set_xticklabels(data_q6.index, fontfamily='serif', rotation=90)

fig.text(0.09, 0.95, 'Experience in writing code distribution', fontsize=15, fontweight='bold', fontfamily='serif')
ax.grid(axis='y', linestyle='-', alpha=0.4)
plt.show()

## Q4(education) and Q6(experience)

In [None]:
# Helper column of ones: summing it inside a pivot table counts respondents.
# Later cells also pivot on this column.
data['count'] = 1

# Coding experience (rows) x education (columns) respondent counts.
# - aggfunc='sum' replaces np.sum, which newer pandas flags as deprecated here.
# - fillna(0).astype(int) keeps every cell an integer even when a combination
#   has no respondents, which the heatmap's fmt='d' annotations require.
# - .loc[rows, cols] puts both axes into the presentation order.
dataQ4Q6 = (
    pd.pivot_table(data, values='count', index='Q6', columns='Q4', aggfunc='sum')
    .fillna(0)
    .astype(int)
    .loc[q6_order, q4_order]
)
dataQ4Q6

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 12))

# Annotated heatmap of the education x experience counts; no colour bar.
heatmap_options = dict(cmap='YlGnBu', square=True, linewidth=2.5, cbar=False,
                       annot=True, fmt='d', annot_kws={'size': 16})
sns.heatmap(dataQ4Q6, ax=ax, **heatmap_options)
ax.spines['top'].set_visible(True)

fig.text(0.48, 1, 'Pivot Table : Education & Experience', fontweight='bold', fontfamily='serif', fontsize=15)
plt.tight_layout()

# Re-apply the existing tick labels with a serif font at size 14.
tick_font = dict(fontfamily='serif', fontsize=14)
ax.set_yticklabels(ax.get_yticklabels(), **tick_font)
ax.set_xticklabels(ax.get_xticklabels(), **tick_font)

# Set the y-limits to run from 0 up to the number of rows.
# NOTE(review): seaborn draws heatmaps with an inverted y axis, so this probably
# flips the row order on screen -- confirm that is intended.
row_count = len(ax.get_yticklabels())
ax.set_ylim(0, row_count)

plt.show()

# Q2(gender), Q5(title) and Q15(experience in machine learning)

In [None]:
# Display order for job titles (Q5).
q5_order = [
    'Business Analyst',
    'Data Analyst',
    'Data Engineer',
    'Data Scientist',
    'DBA/Database Engineer',
    'Machine Learning Engineer',
    'Product/Project Manager',
    'Research Scientist',
    'Software Engineer',
    'Statistician',
    'Student',
    'Currently not employed',
    'Other',
]

# Display order for machine-learning experience (Q15).
q15_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]


def _count_q5_by_q15(frame):
    """Count respondents per (job title, ML experience) pair.

    Parameters
    ----------
    frame : pandas.DataFrame
        Survey rows with columns 'Q5', 'Q15' and the helper column 'count'
        (all ones) that is summed to obtain the counts.

    Returns
    -------
    pandas.Series
        Integer counts indexed by (Q5, Q15), ordered by q5_order and then
        q15_order. Pairs without respondents are 0.
    """
    table = pd.pivot_table(frame, values='count', index=['Q5'], columns=['Q15'],
                           aggfunc='sum')
    # reindex (rather than .loc) also copes with a subset that is missing a
    # whole job title or experience bucket: the gap becomes a row/column of 0.
    table = table.reindex(index=q5_order, columns=q15_order)
    return table.fillna(0).astype(int).stack()


# The same table for all respondents, for men only and for women only.
data_q5q15 = _count_q5_by_q15(data)
data_q5q15_man = _count_q5_by_q15(data[data['Q2'] == 'Man'])
data_q5q15_woman = _count_q5_by_q15(data[data['Q2'] == 'Woman'])

In [None]:
def drawPieMarker(xs, ys, ratios, sizes, colors, ax, alpha=0.7):
    """Draw small pie charts as scatter markers.

    Every slice of the pie is turned into a custom wedge-shaped marker and
    plotted with one ``ax.scatter`` call, so each (x, y) position shows the
    same pie.

    Parameters
    ----------
    xs, ys : sequence
        Positions of the pies; passed to ``ax.scatter`` unchanged.
    ratios : sequence of float
        Fraction of the full circle taken by each slice, in drawing order.
    sizes : sequence of float
        Marker sizes, one per position; forwarded as the ``s`` argument.
    colors : sequence
        Face colour of each slice, paired with ``ratios``.
    ax : object with a ``scatter`` method
        Axes to draw on.
    alpha : float, default 0.7
        Opacity applied to every slice.

    Notes
    -----
    Slices whose ratio is NaN, infinite, zero or negative are skipped. They
    have no area, and a NaN ratio (for example from a 0/0 share) would
    otherwise make every following wedge NaN as well.
    """
    sizes = np.asarray(sizes)
    wedges = []
    start_angle = 0.0

    # Build one wedge-shaped vertex array per slice.
    for color, ratio in zip(colors, ratios):
        if not np.isfinite(ratio) or ratio <= 0:
            continue
        end_angle = start_angle + 2 * np.pi * ratio
        # Arc sampled once and reused for both coordinates.
        arc = np.linspace(start_angle, end_angle, 30)
        # The wedge outline runs centre -> arc -> centre.
        x = [0] + np.cos(arc).tolist() + [0]
        y = [0] + np.sin(arc).tolist() + [0]
        xy = np.column_stack([x, y])
        start_angle = end_angle
        # Scale the size by the wedge's largest absolute coordinate, squared.
        wedges.append({'marker': xy,
                       's': np.abs(xy).max() ** 2 * sizes,
                       'facecolor': color})

    # One scatter call per slice; together the slices form the pie.
    for wedge in wedges:
        ax.scatter(xs, ys, **wedge, alpha=alpha)

In [None]:
fig = plt.figure(figsize=(16, 16))
gs = fig.add_gridspec(5, 5)

# Main panel: one gender pie per (ML experience, job title) pair.
ax_plot = fig.add_subplot(gs[1:4, 0:4])

for q5_idx in q5_order[::-1]:  # [::-1] walks the job titles in reverse order
    for q15_idx in q15_order:
        # Loop-local names: 'man' / 'woman' are the Series used by the
        # age/gender plot earlier in the notebook and must not be overwritten,
        # otherwise re-running that cell breaks.
        n_man = data_q5q15_man[q5_idx][q15_idx]
        n_woman = data_q5q15_woman[q5_idx][q15_idx]
        tot = data_q5q15[q5_idx][q15_idx]

        n_gendered = n_man + n_woman
        if n_gendered == 0:
            # No man/woman respondents in this pair: the shares would be 0/0.
            continue

        drawPieMarker([q15_idx], [q5_idx],
                      [n_man/n_gendered, n_woman/n_gendered],
                      [tot*2.5],
                      ['#004c70', '#990000'],
                      ax=ax_plot)
ax_plot.grid(linewidth=0.2)
ax_plot.set_xticklabels(q15_order, rotation=90)


# Top margin: respondents per ML-experience bucket (Q15), men stacked on women.
# (The axis is named ax_pos, but it shows the experience totals.)
ax_pos = fig.add_subplot(gs[0, :4], sharex=ax_plot)
data_q15_woman = data[data['Q2']=='Woman']['Q15'].value_counts()[q15_order]
ax_pos.bar(data_q15_woman.index, data_q15_woman,
          width=0.45, alpha=0.7, color='#990000')

data_q15_man = data[data['Q2']=='Man']['Q15'].value_counts()[q15_order]
ax_pos.bar(data_q15_man.index, data_q15_man,
          width=0.45, alpha=0.7, color='#004c70', bottom=data_q15_woman)

plt.setp(ax_pos.get_xticklabels(), visible=False)

# Right margin: respondents per job title (Q5), men stacked to the right of women.
# (The axis is named ax_exp, but it shows the job-title totals.)
# The Series are reversed to follow the reversed job-title loop above.
ax_exp = fig.add_subplot(gs[1:4, 4], sharey=ax_plot)
data_q5_woman = data[data['Q2']=='Woman']['Q5'].value_counts()[q5_order]
ax_exp.barh(data_q5_woman.index[::-1], data_q5_woman[::-1],
          height=0.55, alpha=0.7, color='#990000')

data_q5_man = data[data['Q2']=='Man']['Q5'].value_counts()[q5_order]
ax_exp.barh(data_q5_man.index[::-1], data_q5_man[::-1],
          height=0.55, alpha=0.7, color='#004c70', left=data_q5_woman[::-1])
plt.setp(ax_exp.get_yticklabels(), visible=False)

for s in ['top', 'bottom', 'left', 'right']:
    ax_plot.spines[s].set_visible(False)
    ax_pos.spines[s].set_visible(False)
    ax_exp.spines[s].set_visible(False)

plt.show()

# Q24(salary)

In [None]:
# Salary buckets (Q24) from lowest to highest.
q24_order = [
    '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', '4,000-4,999',
    '5,000-7,499', '7,500-9,999', '10,000-14,999', '15,000-19,999',
    '20,000-24,999', '25,000-29,999', '30,000-39,999', '40,000-49,999',
    '50,000-59,999', '60,000-69,999', '70,000-79,999', '80,000-89,999',
    '90,000-99,999', '100,000-124,999', '125,000-149,999', '150,000-199,999',
    '200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000',
]

# Salary bucket (rows) x job title (columns) respondent counts, rows in salary order.
# aggfunc='sum' replaces np.sum, which newer pandas flags as deprecated here.
data_q24q5 = (
    pd.pivot_table(data, values='count', index='Q24', columns='Q5', aggfunc='sum')
    .fillna(0)
    .astype(int)
    .loc[q24_order, :]
)
data_q24q5

In [None]:
# Turn the counts into shares within each job title (each column divided by its
# own total), then reshape into a Series keyed by (job title, salary bucket).
title_totals = data_q24q5.sum(axis=0)
data_q24q5 = data_q24q5.div(title_totals, axis='columns').T.stack()
data_q24q5

This is a way to check whether the calculation is correct.

In [None]:
# Sanity check: the salary shares of each job title should add up to 1.
# (Dividing by the grand total and taking a cumulative sum always ends at 1,
# whatever the data, so it could not reveal a mistake.)
data_q24q5.groupby(level=0).sum()

In [None]:
# Job titles that have salary data (the row labels of the unstacked table).
list(data_q24q5.unstack().index)

In [None]:
# One small bar chart of salary shares per job title, laid out in two columns.
titles = data_q24q5.unstack().index.tolist()
n_cols = 2
# Enough rows to hold every title (6 rows for 11 titles), instead of a
# hard-coded grid that only fits one specific number of titles.
n_rows = max(1, (len(titles) + n_cols - 1) // n_cols)

colors = sns.light_palette('seagreen', len(q24_order))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 22), sharex=False, squeeze=False)
panels = axes.ravel()  # row-major order: left to right, then top to bottom

for panel, name in zip(panels, titles):
    shares = data_q24q5[name]
    panel.bar(shares.index, shares,
              edgecolor='black', linewidth=0.4, width=1,
              color=colors)

    panel.set_yticks([])
    panel.set_xticklabels([])
    panel.spines['top'].set_visible(False)

    # Grey header strip above the panel that carries the job title.
    divider = make_axes_locatable(panel)
    cax = divider.append_axes('top', size='18%', pad=0)
    cax.get_xaxis().set_visible(False)
    cax.get_yaxis().set_visible(False)
    cax.spines['bottom'].set_visible(False)
    cax.set_facecolor('lightgray')

    at = AnchoredText(name, loc=10, 
                      prop=dict(backgroundcolor='lightgray',
                                size=13, color='white', weight='bold'))
    cax.add_artist(at)

# Hide any grid cell that did not receive a job title.
for panel in panels[len(titles):]:
    panel.set_visible(False)

# Show the salary labels only under the lowest used panel of each column.
for col in range(n_cols):
    used_rows = [r for r in range(n_rows) if r * n_cols + col < len(titles)]
    if used_rows:
        axes[used_rows[-1], col].set_xticklabels(
            q24_order, rotation=90, fontfamily='serif', fontsize=12)

#plt.tight_layout()
plt.show()

# End~