In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
import seaborn as sns
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the survey export. Later cells skip row 0 with [1:], presumably because it
# carries the question text rather than a respondent — confirm against the CSV.
survey_csv = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
input_data = pd.read_csv(survey_csv)
print(input_data.columns.tolist())

## Exploring Basic Age and Gender Variations 

Table 

In [None]:
# Number of respondents per age bracket (Q1); row 0 is skipped as in the rest of the notebook.
age_counts = input_data['Q1'][1:].value_counts()
age_distribution = (
    age_counts
    # value_counts() orders by frequency; order by bracket label instead so that the
    # running total below accumulates over age. Labels such as '18-21' < '22-24'
    # sort lexicographically in age order (assumed for all brackets — verify).
    .sort_index()
    .rename_axis('age_bracket')
    .reset_index(name='counts')
)
# Normalise by the number of answers actually counted instead of a hard-coded total.
age_distribution['cum_sum'] = age_distribution['counts'].cumsum() / age_distribution['counts'].sum()
age_distribution

#### About 79% of all respondents are younger than 40, and about 56% are younger than 30.

##### Let us explore the same by gender :- Male and Female.

In [None]:
# Per age bracket (Q1), count the respondents who answered 'Man' / 'Woman' to Q2.
# Row 0 is skipped, as in the rest of the notebook.
gender_by_age = input_data[['Q1', 'Q2']][1:]
age_sex_distribution = pd.DataFrame({
    'Male': gender_by_age['Q2'].eq('Man').astype('int64').groupby(gender_by_age['Q1']).sum(),
    'Female': gender_by_age['Q2'].eq('Woman').astype('int64').groupby(gender_by_age['Q1']).sum(),
})
# Running share of each gender's respondents up to and including every bracket.
for gender, share_column in (('Male', 'male_cum_sum'), ('Female', 'female_cum_sum')):
    gender_counts = age_sex_distribution[gender]
    age_sex_distribution[share_column] = gender_counts.cumsum() / gender_counts.sum()
age_sex_distribution

### Women are on average younger than men: about 77% of men are younger than 40, compared with 85% of women.

In [None]:
# Line plot of the cumulative share of respondents per age bracket, one line per gender.
fig, ax = plt.subplots(figsize=(9, 7))
cumulative_shares = age_sex_distribution[['male_cum_sum', 'female_cum_sum']]
sns.lineplot(data=cumulative_shares, ax=ax)
ax.set_xlabel('Age Bracket')
ax.set_ylabel('Normalised cumulative sum of number of respondents')
ax.legend(['Males', 'Females'])
plt.show()

## Adding Country as a variable
The respondents are located in 67 countries, with more than 7,000 responses from India and just 43 from Iraq. Let us analyze the top 5 countries.

In [None]:
# Responses per country (Q3). Row 0 is skipped, consistent with every other cell,
# so that it is not tallied as if it were a country.
input_data['Q3'][1:].value_counts()

In [None]:
# value_counts() already returns counts in descending order, so the six largest
# groups are simply its first six rows. Naming the column explicitly keeps the
# output independent of the pandas-version-specific default name of the counts.
input_data[1:]['Q3'].value_counts().head(6).to_frame(name='Q3')

We have our top 5 countries as: India, USA, Japan, China and Brazil ('Other' is not considered). Let us consider the demographic mix of these countries, starting with the age-sex distribution.

In [None]:
# The five countries with the most responses ('Other' excluded).
top_5 = [
    "India",
    "United States of America",
    "Japan",
    "China",
    "Brazil",
]

In [None]:
## Returns age-sex distribution for a country
def age_sex_distribution_by_country(country, responses=None):
    """Return the age-by-gender breakdown of the respondents of one country.

    Parameters
    ----------
    country : str
        Value of the ``Q3`` column to select.
    responses : pandas.DataFrame, optional
        Frame with ``Q1`` (age bracket), ``Q2`` (gender) and ``Q3`` (country)
        columns. Defaults to the notebook-level ``input_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Age Bracket'`` with the columns ``Male`` and ``Female``
        (number of ``'Man'`` / ``'Woman'`` answers) and ``male_cum_sum`` /
        ``female_cum_sum`` (running share of each gender's respondents).
    """
    if responses is None:
        responses = input_data

    # Selecting on the country already restricts the frame to matching rows, so
    # no leading row is sliced off afterwards; doing so would silently discard
    # the first respondent of the country.
    country_rows = responses.loc[responses['Q3'] == country, ['Q1', 'Q2']]

    gender_flags = pd.DataFrame({
        'Q1': country_rows['Q1'],
        'Male': (country_rows['Q2'] == 'Man').astype('int64'),
        'Female': (country_rows['Q2'] == 'Woman').astype('int64'),
    })
    distribution = gender_flags.groupby('Q1')[['Male', 'Female']].sum().rename_axis('Age Bracket')

    distribution['male_cum_sum'] = distribution['Male'].cumsum() / distribution['Male'].sum()
    distribution['female_cum_sum'] = distribution['Female'].cumsum() / distribution['Female'].sum()
    return distribution

In [None]:
age_sex_distribution_by_country('India')

##### The respondents from India are younger than the world average with 92% of males and 95% of females younger than 40.

In [None]:
age_sex_distribution_by_country('China')

##### The Chinese respondents are even younger than their Indian counterparts, with 96% of males and 98% of females younger than 40.

In [None]:
age_sex_distribution_by_country('Japan')

In [None]:
age_sex_distribution_by_country('United States of America')

In [None]:
age_sex_distribution_by_country('Brazil')

##### Japanese, American and Brazilian respondents are much older than their counterparts in India and China. At this point two things are worth exploring: the top countries by female participation, and the top countries by female participation among younger respondents (< 35).

##### Let us find the top 20 countries by female participation. We only consider countries with more than 100 male and female respondents combined.

In [None]:
# Count 'Man' / 'Woman' answers (Q2) per country (Q3); row 0 is skipped.
gender_by_country = input_data[['Q1', 'Q2', 'Q3']][1:]
female_participation_by_country = pd.DataFrame({
    'Male': gender_by_country['Q2'].eq('Man').astype('int64').groupby(gender_by_country['Q3']).sum(),
    'Female': gender_by_country['Q2'].eq('Woman').astype('int64').groupby(gender_by_country['Q3']).sum(),
}).rename_axis("country")
# Share of women among the respondents who answered 'Man' or 'Woman'.
gendered_total = female_participation_by_country['Male'] + female_participation_by_country['Female']
female_participation_by_country['female_part_ratio'] = female_participation_by_country['Female'] / gendered_total
# Top 20 by that share, restricted to countries with more than 100 such respondents.
female_participation_by_country[gendered_total > 100].sort_values(by='female_part_ratio', ascending=False).head(20)

##### Thus we observe that, among countries with more than 100 respondents, Tunisia has the highest female participation.
##### Let us see if this trend holds up when we consider only the young population (< 35), which it likely should.

In [None]:
# Restrict to the four youngest age brackets. The bracket filter selects matching
# respondents only, so no leading row is sliced off afterwards — an extra [1:]
# here would discard a real respondent. (Row 0, which other cells skip, is
# assumed not to carry one of these bracket labels — verify.)
young_brackets = ['18-21', '22-24', '25-29', '30-34']
young_respondents = input_data.loc[input_data['Q1'].isin(young_brackets), ['Q1', 'Q2', 'Q3']]

# Count 'Man' / 'Woman' answers (Q2) per country (Q3).
female_participation_by_country_young = pd.DataFrame({
    'Male': young_respondents['Q2'].eq('Man').astype('int64').groupby(young_respondents['Q3']).sum(),
    'Female': young_respondents['Q2'].eq('Woman').astype('int64').groupby(young_respondents['Q3']).sum(),
}).rename_axis("country")

young_total = female_participation_by_country_young['Male'] + female_participation_by_country_young['Female']
female_participation_by_country_young['female_part_ratio'] = female_participation_by_country_young['Female'] / young_total

# NOTE(review): the minimum-size cut-off of 78 is kept unchanged; its rationale is
# not stated anywhere in the notebook — confirm.
female_participation_by_country_young[young_total > 78].sort_values(by='female_part_ratio', ascending=False).head(20)

##### The younger population has more females with Tunisia leading the way.

## Time Taken to fill the survey
##### Let us try and understand if there is any pattern in time taken across variables.

In [None]:
# Responses (row 0 skipped) whose recorded duration exceeds 20000 seconds.
responses = input_data[1:]
seconds_taken = responses['Time from Start to Finish (seconds)'].astype('int')
responses[seconds_taken > 20000]

##### It seems the survey may have been filled in over multiple sessions by many respondents, as there are many responses with durations of several hours.

In [None]:
# Mean completion time per country (Q3), shortest first; show the first 21 rows.
time_col = 'Time from Start to Finish (seconds)'
mean_time_by_country = input_data[1:].groupby(by='Q3').agg({time_col: lambda secs: secs.astype('int').mean()})
mean_time_by_country.sort_values(by=time_col).head(21)

In [None]:
# Mean completion time per country (Q3), longest first; show the first 21 rows.
time_col = 'Time from Start to Finish (seconds)'
mean_time_by_country = input_data[1:].groupby(by='Q3').agg({time_col: lambda secs: secs.astype('int').mean()})
mean_time_by_country.sort_values(by=time_col, ascending=False).head(21)

##### So respondents from Sweden took the least time on average, while those from Uganda took the most time to fill in the survey.

#### Let us now explore the profession and education of respondents

In [None]:
# Pie chart of the answers to Q4 (row 0 skipped).
# NOTE(review): the short labels are matched to the counts purely by position, i.e.
# they assume value_counts() returns the categories in exactly this order — confirm
# against the data whenever the input changes.
labels = [
    "Master's",
    "Bachelor's",
    "Doctoral",
    "DropOut",
    "Prefer No answer",
    "High School",
    "Professional Doctorate",
]
values = input_data['Q4'][1:].value_counts().tolist()
fig, ax = plt.subplots(figsize=(14, 8))
ax.pie(values, labels=labels, autopct='%1.1f%%')
plt.show()

A significant number of respondents hold a master's degree.

In [None]:
# Respondents per profession (Q5, row 0 skipped), drawn on an explicitly created axes.
# No plt.clf() beforehand: with no open figure it would only create an extra, empty one.
profession_counts = input_data['Q5'][1:].value_counts().rename_axis('Profession').reset_index(name='counts')
fig, ax = plt.subplots(figsize=(12, 8))
# seaborn's keyword for the target axes is `ax`.
sns.barplot(y='Profession', x='counts', data=profession_counts, ax=ax, color='tab:blue')
ax.set_xlabel("No. of Respondents")
plt.show()

##### Let me see what my country looks like

In [None]:
# Respondents per profession (Q5) for India only.
# The country filter selects matching respondents, so no leading row is sliced off
# afterwards — an extra [1:] here would discard the first respondent from India.
india_professions = input_data.loc[input_data['Q3'] == 'India', 'Q5']
india_profession_counts = india_professions.value_counts().rename_axis('Profession').reset_index(name='counts')
# No plt.clf() beforehand: with no open figure it would only create an extra, empty one.
fig, ax = plt.subplots(figsize=(12, 8))
# seaborn's keyword for the target axes is `ax`.
sns.barplot(y='Profession', x='counts', data=india_profession_counts, ax=ax, color='tab:blue')
ax.set_xlabel("No. of Respondents")
plt.show()

##### For India, students clearly dominate the survey.

##### Let us see how the students in India are distributed across degrees.

In [None]:
# Rows whose country (Q3) is India and whose profession (Q5) is Student.
is_india_student = (input_data['Q3'] == 'India') & (input_data['Q5'] == 'Student')
input_data[is_india_student]

In [None]:
## Students in India
# Count of each Q4 answer among Indian students. The boolean filter selects
# matching respondents only, so no leading row is sliced off afterwards — an
# extra [1:] here would discard a real respondent.
india_students = input_data[(input_data['Q3'] == 'India') & (input_data['Q5'] == 'Student')]
sns.countplot(y='Q4', data=india_students)

In [None]:
##Students overall
# Count of each Q4 answer among all students. The boolean filter selects matching
# respondents only, so no leading row is sliced off afterwards — an extra [1:]
# here would discard a real respondent.
all_students = input_data[input_data['Q5'] == 'Student']
sns.countplot(y='Q4', data=all_students)

##### In India, a larger proportion of students are in a Bachelor's degree than the global average. Also, while respondents with Bachelor's and Master's degrees are roughly equal in number overall, there is a significant difference within the student community. This is possibly because of the enthusiasm of younger people for this field.