In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec 

from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable

import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# any pandas / matplotlib warnings raised by the cells below.
warnings.filterwarnings(action='ignore')
plt.rcParams['figure.dpi'] = 200 #high resolution
# gc is used further down to reclaim memory after intermediate frames are deleted.
import gc

In [None]:
# Load the raw survey export. The first data row is kept separately as
# `questions` (presumably the question text for each column -- confirm against
# the CSV) and is excluded from the responses.
survey_raw = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
questions = survey_raw.iloc[0, :]
# .copy() makes `data` an independent frame, so later column assignments
# (e.g. recoding Q2) do not write into a slice of the raw frame.
data = survey_raw.iloc[1:, :].copy()

In [None]:
# Preview the first five response rows.
data.head(n=5)

In [None]:
import missingno

# Bar chart of non-missing counts for columns Q1..Q6.
first_six_questions = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6']
missingno.bar(data[first_six_questions], sort='ascending', figsize=(15, 5))

In [None]:
# Q2 vs. Q1: mirrored bar chart with 'Man' counts above the axis and 'Woman'
# counts below it. Every other Q2 value (including missing answers) is
# recoded to 'ETC' in place; later cells read the recoded column.
binary_genders = ['Man', 'Woman']
data['Q2'] = data['Q2'].where(data['Q2'].isin(binary_genders), 'ETC')

respondents = data[data['Q2'] != 'ETC']
data_q1q2 = respondents.groupby(['Q2'])['Q1'].value_counts().unstack().sort_index()

man = data_q1q2.loc['Man']
woman = -data_q1q2.loc['Woman']  # negated so these bars extend downwards

fig, ax = plt.subplots(1, 1, figsize=(15, 6))
for counts, legend_label in ((man, 'Male'), (woman, 'Female')):
    ax.bar(counts.index, counts, width=0.55, alpha=0.8, label=legend_label)
ax.set_ylim(-1200, 3500)

ax.legend()
fig.text(0.16, 0.95, 'Age / Gender Distribution', fontsize=12)
plt.show()

In [None]:
# Q1 counts, one panel per Q2 value: how many respondents of each gender
# fall into each age group.
sns.catplot(
    kind='count',
    x='Q1',
    col='Q2',
    data=data.sort_index(),
)

Fewer women responded to the survey than men.<br>
Male and female response numbers across age groups are similar.


In [None]:
# Q3: number of responses per country, most frequent first.
data_q3 = data['Q3'].value_counts()

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
bar_style = dict(
    width=0.55,
    linewidth=0.7,
    edgecolor='darkgray',
    color=sns.color_palette("coolwarm", 7),  # 7 colours, repeated across the bars
)
ax.bar(data_q3.index, data_q3, **bar_style)
ax.set_xticklabels(data_q3.index, rotation=90)
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.text(0.09, 0.95, 'Responses by Country', fontsize=12)
plt.show()

In [None]:
# Response count per Q3 value, as a table.
data['Q3'].value_counts()

In [None]:
# Q4: number of responses per education level, most frequent first.
data_q4 = data['Q4'].value_counts()

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
coolwarm_7 = sns.color_palette("coolwarm", 7)
ax.bar(
    data_q4.index,
    data_q4,
    width=0.55,
    color=coolwarm_7,
    edgecolor='darkgray',
    linewidth=0.7,
)
ax.set_xticklabels(data_q4.index, fontfamily='serif', rotation=90)
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.text(0.09, 0.95, 'Education Distribution', fontsize=12)
plt.show()

In [None]:
# Q3 values with more than 400 responses.
data_q3[data_q3.gt(400)]

In [None]:
# Q3 responses per country again (same chart as earlier in the notebook),
# shown next to the commentary that follows.
data_q3 = data['Q3'].value_counts()
country_names = data_q3.index

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.bar(
    country_names,
    data_q3,
    width=0.55,
    color=sns.color_palette("coolwarm", 7),
    edgecolor='darkgray',
    linewidth=0.7,
)
ax.set_xticklabels(country_names, rotation=90)
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.text(0.09, 0.95, 'Responses by Country', fontsize=12)
plt.show()

India seems to have the largest number of respondents to the survey followed by the United States of America

In [None]:

# Q5 counts, one panel per selected Q3 value.
selected_q3 = ['India', 'United States of America', 'Other', 'Brazil', 'Japan']
g = sns.catplot(
    data=data.loc[data['Q3'].isin(selected_q3)],
    x="Q5",
    col="Q3",
    col_wrap=3,
    kind="count",
    aspect=.8,
)
g.set_xticklabels(rotation=90)


Among the top 5 response groups by country, India seems to have the highest number of students and the highest ratio of students to other titles.

In [None]:
# Q24 salary bands, listed from lowest to highest so the x-axis reads in order.
q24_order = [
    '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', '4,000-4,999',
    '5,000-7,499', '7,500-9,999', '10,000-14,999', '15,000-19,999',
    '20,000-24,999', '25,000-29,999', '30,000-39,999', '40,000-49,999',
    '50,000-59,999', '60,000-69,999', '70,000-79,999', '80,000-89,999',
    '90,000-99,999', '100,000-124,999', '125,000-149,999', '150,000-199,999',
    '200,000-249,999', '250,000-299,999', '300,000-500,000', '> $500,000',
]

# Response count per salary band, arranged in q24_order.
data_q24 = data['Q24'].value_counts().loc[q24_order]
salary_bands = data_q24.index

fig, ax = plt.subplots(1, 1, figsize=(20, 6), dpi=200)
grey_ramp = sns.light_palette("black", len(data_q24))
ax.bar(salary_bands, data_q24, width=0.55,
       color=grey_ramp, edgecolor='black', linewidth=0.6)

# Thin line through the bar tops.
ax.plot(salary_bands, data_q24, c='k', linewidth=0.6)

ax.set_xticklabels(salary_bands, rotation=90)
plt.show()

In [None]:
# Stacked bar chart: for every Q24 salary band, how many respondents fall into
# each Q6 coding-experience level.
#
# Fixes relative to the earlier version of this cell:
#   * the '5-10 years' level was never drawn (and not included in the stack),
#     so the last two legend entries were attached to the wrong bars;
#   * missing (experience, salary) combinations were NaN, which propagated into
#     `bottom` and silently dropped bars -- they are now 0;
#   * salary bands are placed in `q24_order` (defined in the salary
#     distribution cell above) instead of alphabetical order.
experience_order = ['I have never written code', '< 1 years', '1-2 years',
                    '3-5 years', '5-10 years', '10-20 years', '20+ years']
Experience_labels = ['Never Coded', 'Less than 1 Year', '1-2 years',
                     '3-5 years', '5-10 years', '10-20 years', '20+ years']

# Rows: experience level, columns: salary band, values: respondent counts.
# Bands or levels absent from the data come back as all-zero rows/columns.
data_q24 = (
    data.groupby('Q6')['Q24'].value_counts()
    .unstack(fill_value=0)
    .reindex(index=experience_order, columns=q24_order, fill_value=0)
)

N = len(data_q24.columns)
ind = np.arange(N)
width = 0.4

fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Draw one layer per experience level, each sitting on the running total of
# the layers below it.
stack_base = np.zeros(N)
for level, legend_label in zip(experience_order, Experience_labels):
    level_counts = data_q24.loc[level].to_numpy()
    ax.bar(ind, level_counts, width, bottom=stack_base, alpha=0.5, label=legend_label)
    stack_base = stack_base + level_counts

ax.legend()
ax.set_xticks(ind)
ax.set_xticklabels(data_q24.columns, rotation=90)

plt.show()

Many professionals work at a lower salary for up to 5 years; thereafter they either drop out or their salary increases.<br>
We see a shrinkage of the first band as the salaries go higher.<br>
Once an employee is at 50K, they generally stay in the system up to the $100,000 salary range.

In [None]:
# Stacked bar chart: for every Q20 company-size bucket, how many respondents
# fall into each Q6 coding-experience level.
#
# Fixes relative to the earlier version of this cell:
#   * bars were drawn in the alphabetical column order produced by unstack()
#     but labelled with `companysize_labels`, so tick labels did not match
#     their bars -- the columns are now reindexed to `companysize_labels`;
#   * the '5-10 years' level was never drawn (and not included in the stack),
#     so the last two legend entries were attached to the wrong bars;
#   * missing (experience, size) combinations were NaN, which propagated into
#     `bottom` and silently dropped bars -- they are now 0.
companysize_labels = ['0-49 employees', '50-249 employees', '250-999 employees',
                      '1000-9,999 employees', '10,000 or more employees']
experience_order = ['I have never written code', '< 1 years', '1-2 years',
                    '3-5 years', '5-10 years', '10-20 years', '20+ years']
Experience_labels = ['Never Coded', 'Less than 1 Year', '1-2 years',
                     '3-5 years', '5-10 years', '10-20 years', '20+ years']

# Rows: experience level, columns: company size, values: respondent counts.
# Sizes or levels absent from the data come back as all-zero rows/columns.
data_q20 = (
    data.groupby('Q6')['Q20'].value_counts()
    .unstack(fill_value=0)
    .reindex(index=experience_order, columns=companysize_labels, fill_value=0)
)

N = len(companysize_labels)
ind = np.arange(N)
width = 0.4

fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Draw one layer per experience level, each sitting on the running total of
# the layers below it.
stack_base = np.zeros(N)
for level, legend_label in zip(experience_order, Experience_labels):
    level_counts = data_q20.loc[level].to_numpy()
    ax.bar(ind, level_counts, width, bottom=stack_base, alpha=0.5, label=legend_label)
    stack_base = stack_base + level_counts

ax.legend()
ax.set_xticks(ind)
ax.set_xticklabels(companysize_labels, rotation=90)
plt.show()

Smaller companies hire more employees with less experience (the 1 to 5 years segment).<br>
This segment also contributes the largest pool of resources.<br>
The 3 to 5 year segment is hired by companies of all sizes, and is the largest segment of employees in each company-size segment.

In [None]:
# Display the Q6 x Q20 count table built in the previous cell.
data_q20

**Is the Gender Ratio changing with New Students entering into Data Science**

In [None]:
# Compare the Q2 (gender) mix of respondents whose Q5 is "Student" with those
# whose Q5 is "Data Scientist". Each bar is that group's share of responses.
studentdata = data.loc[data.Q5 == "Student"]
dsdata = data.loc[data.Q5 == "Data Scientist"]

# Explicit axes instead of the pyplot state machine; a shared y-axis makes the
# two panels directly comparable.
fig, (ax_students, ax_ds) = plt.subplots(1, 2, sharey=True)

(studentdata['Q2'].value_counts() / len(studentdata['Q2'])).plot.bar(ax=ax_students)
ax_students.set_title('Students by Gender')

(dsdata['Q2'].value_counts() / len(dsdata['Q2'])).plot.bar(ax=ax_ds)
ax_ds.set_title('Data Scientists by Gender')
plt.show()

# Free the two subsets; they are not needed after the plot.
del studentdata
del dsdata
gc.collect()


There is a slight (10%) increase in the share of women among students entering data science compared with the current population of data scientists.

In [None]:
# Compare the Q2 (gender) mix of respondents whose Q5 is "Software Engineer"
# with those whose Q5 is "Data Scientist". Each bar is that group's share of
# responses.
#
# The earlier version plotted `studentdata` here, which has already been
# deleted by the preceding cell (NameError on a fresh run) and is the wrong
# group for this comparison; `sedata` is what this cell selects.
sedata = data.loc[data.Q5 == "Software Engineer"]
dsdata = data.loc[data.Q5 == "Data Scientist"]

# A shared y-axis makes the two panels directly comparable.
fig, (ax_se, ax_ds) = plt.subplots(1, 2, sharey=True)

(sedata['Q2'].value_counts() / len(sedata['Q2'])).plot.bar(ax=ax_se)
ax_se.set_title('Software Engineers by Gender')

(dsdata['Q2'].value_counts() / len(dsdata['Q2'])).plot.bar(ax=ax_ds)
ax_ds.set_title('Data Scientists by Gender')
plt.show()

Are Software Engineers catching up with Data Science?

In [None]:
# Q5 values other than "Student", each as a fraction of ALL rows in `data`
# (the denominator still includes students and blank answers).
non_student_roles = data.loc[data['Q5'] != "Student", 'Q5']
(non_student_roles.value_counts() / len(data['Q5'])).plot.bar()

Excluding students, Software Engineers form the second largest response group in the survey.

In [None]:
# Q6 distribution among respondents whose Q5 is "Software Engineer".
is_software_engineer = data['Q5'] == "Software Engineer"
data.loc[is_software_engineer, "Q6"].value_counts().plot.bar()

It appears that respondents consider themselves Software Engineers at around 3+ years of experience.<br>
There are very few respondents with less than one year of experience who consider themselves to be Software Engineers.

# Reference 
https://www.kaggle.com/subinium/kaggle-2020-visualization-analysis