In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the survey export. Row 0 is not a response (presumably the question
# text -- TODO confirm), so it is kept in `qcode`, keyed by the original
# column names, and then removed from the frame.
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
qcode = df.iloc[0].to_dict()
df = df.drop(index=0)

# Shorten the first column's label to 'Time'; all other labels are unchanged.
df.columns = ['Time', *df.columns[1:]]

# Collapse Q2 to a binary label: 'Man' -> 'Male', anything else (including
# missing values) -> 'Non-male'.
df['Q2'] = df['Q2'].eq('Man').map({True: 'Male', False: 'Non-male'})

# Derived grouping column: 'India' when Q3 is 'India', otherwise 'World'.
df['region'] = df['Q3'].eq('India').map({True: 'India', False: 'World'})
df.head()

In [None]:
# Histogram of completion times (presumably seconds -- TODO confirm unit).
# Values of 20000 or more are discarded before binning, and the x-axis is
# clipped to 0-5000 so the bulk of the distribution is readable.
t = df['Time'].astype(float).loc[lambda elapsed: elapsed < 20000]
ax = t.hist(bins=100)
ax.set_xlim(0, 5000)

If many respondents had rushed through the survey, we would have dropped their responses, but there do not seem to be many such people. Most respondents took about 15 minutes to answer the survey, which seems reasonable.

In [None]:
# Partition respondents on Q3: rows equal to 'India' vs every other row
# (rows with a missing Q3 land in `world`).
india = df.loc[df['Q3'].eq('India')]
world = df.loc[df['Q3'].ne('India')]

In [None]:
# Side-by-side pie charts of the recoded Q2 labels, one panel per subset.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
for axis, subset, heading in ((ax1, india, 'India'), (ax2, world, 'World')):
    subset['Q2'].value_counts().plot(kind='pie', ax=axis)
    axis.set_title(heading)
fig.suptitle('Gender Ratios - India and the World')

India seems to have only slightly better gender representation.

In [None]:
# Group the recoded gender labels (Q2) by each respondent's Q3 value (country).
groups = df.groupby('Q3')['Q2']
def myagg(g):
    """Return the share of 'Male' labels among the non-null values of ``g``.

    Parameters
    ----------
    g : pandas.Series
        Gender labels for one group.

    Returns
    -------
    float
        Fraction of non-null entries equal to 'Male'. Returns 0.0 when the
        group contains no 'Male' entry (or is empty) instead of raising
        ``KeyError``, so a single all-non-male group cannot abort the whole
        aggregation.
    """
    # value_counts(normalize=True) ignores NaN, so the denominator is the
    # number of non-null answers; .get() supplies 0.0 when 'Male' is absent.
    return g.value_counts(normalize=True).get('Male', 0.0)
# Male share per Q3 group, sorted ascending: the first ten entries have the
# lowest male share, the last ten the highest.
gr = groups.agg(myagg).sort_values()
fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(10, 6))
panels = (
    (ax1, gr.iloc[:10], 'Best Gender Ratios'),
    (ax2, gr.iloc[-10:], 'Worst Gender Ratios'),
)
for axis, male_share, heading in panels:
    male_share.plot.barh(ax=axis)
    axis.set_title(heading)
plt.tight_layout()

Surprisingly, Muslim-majority countries appear to have the best gender ratios?

In [None]:
# Turn the age brackets (Q1) into an ordered categorical so the bars appear in
# bracket order. Missing values are dropped first: NaN cannot be sorted next
# to strings and is not a valid category.
# NOTE(review): assumes the bracket labels sort correctly as plain strings.
age_levels = sorted(df['Q1'].dropna().unique())
ages = pd.api.types.CategoricalDtype(categories=age_levels, ordered=True)
df['Q1'] = df['Q1'].astype(ages)

# normalize='columns' makes each region's bars sum to 1, so the two age
# profiles are comparable even though the regions differ in sample size
# (normalize=True would divide every cell by the grand total instead).
pd.crosstab(df['Q1'], df['region'], normalize='columns').plot(kind='bar')

India is significantly younger than the rest of the world.

In [None]:
# Keep only respondents who disclosed an education level (Q4). The explicit
# .copy() makes `xdf` an independent frame, so the column assignment below
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
xdf = df.dropna(subset=['Q4'])
xdf = xdf[xdf['Q4'] != 'I prefer not to answer'].copy()

# Display order for the education levels. Answers not listed here become NaN
# when cast and therefore drop out of the crosstab.
qual_levels = [
    'No formal education past high school',
    'Some college/university study without earning a bachelor’s degree',
    'Professional degree',
    'Bachelor’s degree',
    'Master’s degree',
    'Doctoral degree'
]
qual = pd.api.types.CategoricalDtype(categories=qual_levels, ordered=True)
xdf['Q4'] = xdf['Q4'].astype(qual)

# Both series come from the same filtered frame, and normalize='columns'
# makes each region's bars sum to 1 so the regions are directly comparable.
pd.crosstab(xdf['Q4'], xdf['region'], normalize='columns').plot(kind='barh')

Lots of grads and not enough postgrads.

In [None]:
# Rebuild the regional subsets from the derived `region` column, then chart
# the five most frequent Q5 answers among the India rows.
india = df.loc[df['region'].eq('India')]
world = df.loc[df['region'].eq('World')]
india['Q5'].value_counts().iloc[:5].plot.bar()

In [None]:
# Five most frequent Q5 answers among the World rows.
world['Q5'].value_counts().iloc[:5].plot.bar()

India is flooded with students.

In [None]:
def _mcq_option_label(column):
    """Return the display label for one option column.

    The label is the most frequent non-null value in the column; when the
    column has no answers at all, the column name is used instead.
    """
    answers = column.dropna()
    if answers.empty:
        return column.name
    return answers.mode().iloc[0]


def _mcq_option_shares(prefix, df):
    """Build the per-region selection-rate table for the ``prefix`` columns."""
    option_cols = [c for c in df.columns if c != 'region' and c.startswith(prefix)]
    if not option_cols:
        raise ValueError('no columns start with {!r}'.format(prefix))

    # A non-null cell means the respondent selected that option.
    selected = df[option_cols].notna()
    region = df['region']
    shares = pd.DataFrame({
        'World': selected[region == 'World'].mean(),
        'India': selected[region == 'India'].mean(),
    })
    # Labels are derived once from the whole frame, so an option that nobody
    # in one region picked still gets the same label for both regions.
    shares.index = [_mcq_option_label(df[c]) for c in option_cols]
    return shares


def process_mcq(prefix, df):
    """Plot India-vs-World selection rates for a multi-select question.

    Every column whose name starts with ``prefix`` is treated as one answer
    option, where a non-null cell means the respondent selected it. For each
    option the share of respondents selecting it is computed separately for
    rows with ``region == 'India'`` and ``region == 'World'``, and the two
    shares are drawn as a grouped bar chart.

    Parameters
    ----------
    prefix : str
        Column-name prefix identifying the question's option columns,
        e.g. ``'Q7_'``.
    df : pandas.DataFrame
        Survey responses; must contain a ``'region'`` column.

    Returns
    -------
    None
        The chart is drawn as a side effect.

    Raises
    ------
    ValueError
        If no column name starts with ``prefix``.
    """
    _mcq_option_shares(prefix, df).plot(kind='bar')

# Bar chart of India vs World selection rates for the Q7_* option columns.
process_mcq('Q7_', df)

We use a lot more C, C++ and Java than the rest of the world, and a lot less bash.

In [None]:
# Bar chart of India vs World selection rates for the Q10_* option columns.
process_mcq('Q10_', df)

Significantly higher use of hosted notebook products, presumably because students can't afford expensive hardware. TODO: repeat this comparison for professionals only, and do the same for all the remaining questions.

In [None]:
# Q11 answer distribution within each region. normalize='columns' makes each
# region's bars sum to 1, so the regions are comparable despite their
# different sample sizes (normalize=True would divide by the grand total).
pd.crosstab(df['Q11'], df['region'], normalize='columns').plot(kind='barh')

In [None]:
# Bar chart of India vs World selection rates for the Q14_* option columns.
process_mcq('Q14_', df)

Slightly less relative usage of web-based visualization tools.

In [None]:
# Respondents who answered Q15. The .copy() makes `xdf` an independent frame,
# so later column assignments do not write into a slice of `df`
# (avoids SettingWithCopyWarning).
xdf = df.dropna(subset=['Q15']).copy()

In [None]:
# Display order for the machine-learning experience answers (Q15).
ml_exp_levels = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]
ml_exp = pd.api.types.CategoricalDtype(categories=ml_exp_levels, ordered=True)

# .assign returns a new frame, so this never writes into a slice of `df`
# regardless of how `xdf` was produced.
xdf = xdf.assign(Q15=xdf['Q15'].astype(ml_exp))

# normalize='columns' makes each region's bars sum to 1, so the regions are
# comparable despite their different sample sizes.
pd.crosstab(xdf['Q15'], xdf['region'], normalize='columns').plot(kind='barh')

In [None]:
# Bar chart of India vs World selection rates for the Q16_* option columns.
process_mcq('Q16_', df)

In [None]:
# Bar chart of India vs World selection rates for the Q19_* option columns.
process_mcq('Q19_', df)

In [None]:
# Respondents who answered the company-size question (Q20). The .copy() makes
# `xdf` an independent frame so the assignment below does not write into a
# slice of `df` (avoids SettingWithCopyWarning).
xdf = df.dropna(subset=['Q20']).copy()

# Display order, smallest to largest.
co_size_levels = [
    '0-49 employees',
    '50-249 employees',
    '250-999 employees',
    '1000-9,999 employees',
    '10,000 or more employees'
]
co_size = pd.api.types.CategoricalDtype(categories=co_size_levels, ordered=True)
xdf['Q20'] = xdf['Q20'].astype(co_size)

# normalize='columns' makes each region's bars sum to 1, so the regions are
# comparable despite their different sample sizes.
pd.crosstab(xdf['Q20'], xdf['region'], normalize='columns').plot(kind='barh')

In [None]:
# Respondents who answered Q21. The .copy() makes `xdf` an independent frame
# so the assignment below does not write into a slice of `df`
# (avoids SettingWithCopyWarning).
xdf = df.dropna(subset=['Q21']).copy()

# Display order for the size buckets, smallest to largest.
team_size_levels = [
    '0',
    '1-2',
    '3-4',
    '5-9',
    '10-14',
    '15-19',
    '20+'
]
team_size = pd.api.types.CategoricalDtype(categories=team_size_levels, ordered=True)
xdf['Q21'] = xdf['Q21'].astype(team_size)

# normalize='columns' makes each region's bars sum to 1, so the regions are
# comparable despite their different sample sizes.
pd.crosstab(xdf['Q21'], xdf['region'], normalize='columns').plot(kind='barh')

In [None]:
# Respondents who answered Q22 with something other than 'I do not know'.
# The .copy() is taken after the last filter so that `xdf` is an independent
# frame when its Q22 column is reassigned below; copying before the filter
# would leave `xdf` as a slice again and still trigger SettingWithCopyWarning.
xdf = df.dropna(subset=['Q22'])
xdf = xdf[xdf['Q22'] != 'I do not know'].copy()

# Long survey wording -> short axis label. The insertion order of this dict
# also defines the category order on the chart.
q22_map = {
    'No (we do not use ML methods)': 'None',
    'We are exploring ML methods (and may one day put a model into production)': 'Exploring',
    'We use ML methods for generating insights (but do not put working models into production)': 'For insight generation',
    'We recently started using ML methods (i.e., models in production for less than 2 years)': '< 2 years in prod',
    'We have well established ML methods (i.e., models in production for more than 2 years)': '> 2 years in prod'
}
# The explicit lookup raises KeyError on an unexpected answer rather than
# silently dropping it.
xdf['Q22'] = xdf['Q22'].map(lambda x: q22_map[x])

# Pass a concrete list rather than a dict view as the categories.
ml_maturity = pd.api.types.CategoricalDtype(categories=list(q22_map.values()), ordered=True)
xdf['Q22'] = xdf['Q22'].astype(ml_maturity)

# normalize='columns' makes each region's bars sum to 1, so the regions are
# comparable despite their different sample sizes.
pd.crosstab(xdf['Q22'], xdf['region'], normalize='columns').plot(kind='barh')

In [None]:
# Bar chart of India vs World selection rates for the Q23_* option columns.
process_mcq('Q23_', df)

In [None]:
# Bar chart of India vs World selection rates for the Q28_A_* option columns.
process_mcq('Q28_A_', df)

In [None]:
# Bar chart of India vs World selection rates for the Q31_A_* option columns.
process_mcq('Q31_A_', df)

In [None]:
# Bar chart of India vs World selection rates for the Q33_A_* option columns.
process_mcq('Q33_A_', df)

In [None]:
# Bar chart of India vs World selection rates for the Q34_A_* option columns.
process_mcq('Q34_A_', df)