In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns




# **Introduction**

**The objective of this notebook is to examine the differences between men and women in the Kaggle community.**

In [None]:
# Location of the survey responses, kept in one named constant so it is easy to repoint.
SURVEY_CSV = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"

# Load every response into a single frame and preview its first rows.
df = pd.read_csv(SURVEY_CSV)
df.head()

# Gender Distribution

There is a very large gap between men and women in survey participation: about 80% of the respondents are men and only about 19% are women.

In [None]:
# Tally the Q2 (gender) answers once, skipping the first row of the frame.
# NOTE(review): presumably the first row holds the question text rather than a
# response - confirm against the CSV.
gender_counts = df['Q2'].iloc[1:].value_counts()

# One slice per answer, labelled with the answer and its percentage share.
gender_pie = go.Pie(
    labels=gender_counts.index,
    values=gender_counts.values,
    textinfo='label+percent',
)

fig = go.Figure(data=[gender_pie])
fig.update_layout(title_text='Gender Distribution', showlegend=False)
fig.show()

# Age distribution by gender

Looking at the breakdown by age, the share of women among respondents in their twenties is about 22%, which is about 5% above the overall share of women; the share of women then decreases as age increases.

In [None]:
# Respondent counts per age bracket (Q1), split by gender answer (Q2).
man = df.loc[df['Q2'] == 'Man', 'Q1'].value_counts()
woman = df.loc[df['Q2'] == 'Woman', 'Q1'].value_counts()

# value_counts() sorts each Series by its own frequencies, so the two results
# need not list the categories in the same order, nor contain the same ones.
# Align both on one shared category list before comparing them: the men's
# order is kept, and any category answered only by women is appended.
categories = man.index.append(woman.index.difference(man.index))
man = man.reindex(categories, fill_value=0)
woman = woman.reindex(categories, fill_value=0)

# Percentage share of each gender within a category (men + women only).
# Every category has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', x=man.index, y=man.values, text=textonbar_man),
    go.Bar(name='Woman', x=woman.index, y=woman.values, text=textonbar_woman),
])
# Print the share inside each bar segment, followed by a literal percent sign.
fig.update_traces(texttemplate='%{text:.3s}%', textposition='inside')
fig.update_layout(barmode='stack', title_text='Age distribution by gender', xaxis_title='Age', yaxis_title='Counts')
fig.show()

# Country distribution by gender

By country, we found that India and the USA are the countries with the highest proportion of women (and also the highest number of survey participants).

In [None]:
# Respondent counts per country (Q3), split by gender answer (Q2).
man = df.loc[df['Q2'] == 'Man', 'Q3'].value_counts()
woman = df.loc[df['Q2'] == 'Woman', 'Q3'].value_counts()

# value_counts() sorts each Series by its own frequencies, so the two results
# need not list the countries in the same order, nor contain the same ones.
# Align both on one shared category list before comparing them: the men's
# order is kept, and any country answered only by women is appended.
categories = man.index.append(woman.index.difference(man.index))
man = man.reindex(categories, fill_value=0)
woman = woman.reindex(categories, fill_value=0)

# Percentage share of each gender within a country (men + women only).
# Every category has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', y=man.index, x=man.values, text=textonbar_man, orientation='h'),
    go.Bar(name='Woman', y=woman.index, x=woman.values, text=textonbar_woman, orientation='h'),
])
# Print the share inside each bar, followed by a literal percent sign.
fig.update_traces(texttemplate='%{text:.3s}%', textposition='inside')
fig.update_layout(barmode='group', title_text='Country distribution by gender', xaxis_title='Counts', yaxis_title='Country')
# Limit the visible y range to the first ten category positions (0-9), i.e.
# the ten countries that come first in the men's frequency ordering.
fig.update_yaxes(range=(-.5, 9.5))
fig.show()

# Highest level of formal education of Kaggler by gender

An interesting finding: across the answers to the education question, the share of women is highest among respondents who preferred not to answer (24% of them are women) and lowest among respondents with no formal education (only 14% of them are women).
This is somewhat inconsistent with the high share of women at young ages; it may be that young women are ashamed to report having no formal education and prefer not to answer.

In [None]:
# Respondent counts per education level (Q4), split by gender answer (Q2).
man = df.loc[df['Q2'] == 'Man', 'Q4'].value_counts()
woman = df.loc[df['Q2'] == 'Woman', 'Q4'].value_counts()

# value_counts() sorts each Series by its own frequencies, so the two results
# need not list the categories in the same order, nor contain the same ones.
# Align both on one shared category list before comparing them: the men's
# order is kept, and any category answered only by women is appended.
categories = man.index.append(woman.index.difference(man.index))
man = man.reindex(categories, fill_value=0)
woman = woman.reindex(categories, fill_value=0)

# Percentage share of each gender within a category (men + women only).
# Every category has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', y=man.index, x=man.values, text=textonbar_man, orientation='h'),
    go.Bar(name='Woman', y=woman.index, x=woman.values, text=textonbar_woman, orientation='h'),
])
# Print the share inside each bar segment, followed by a literal percent sign.
fig.update_traces(texttemplate='%{text:.3s}%', textposition='inside')
fig.update_layout(barmode='stack', title_text='Highest level of formal education of Kaggler by gender', xaxis_title='Counts', yaxis_title='Level of education')
fig.show()

# Coding Experience distribution by gender

This matches the high percentage of women at young ages: the share of women is highest among respondents with less than 3 years of experience in writing code.

In [None]:
# Respondent counts per coding-experience answer (Q6), split by gender answer (Q2).
man = df.loc[df['Q2'] == 'Man', 'Q6'].value_counts()
woman = df.loc[df['Q2'] == 'Woman', 'Q6'].value_counts()

# value_counts() sorts each Series by its own frequencies, so the two results
# need not list the categories in the same order, nor contain the same ones.
# Align both on one shared category list before comparing them: the men's
# order is kept, and any category answered only by women is appended.
categories = man.index.append(woman.index.difference(man.index))
man = man.reindex(categories, fill_value=0)
woman = woman.reindex(categories, fill_value=0)

# Percentage share of each gender within a category (men + women only).
# Every category has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', y=man.index, x=man.values, text=textonbar_man, orientation='h'),
    go.Bar(name='Woman', y=woman.index, x=woman.values, text=textonbar_woman, orientation='h'),
])
# Print the share inside each bar, followed by a literal percent sign.
fig.update_traces(texttemplate='%{text:.3s}%', textposition='inside')
fig.update_layout(barmode='group', title_text='Coding Experince distribution by gender', xaxis_title='Counts', yaxis_title='Coding Experince')
fig.show()

# Current role of kaggler by gender

This also matches the high percentage of women at young ages: the highest proportion of women is found among students.

In [None]:
# Respondent counts per current role (Q5), split by gender answer (Q2).
man = df.loc[df['Q2'] == 'Man', 'Q5'].value_counts()
woman = df.loc[df['Q2'] == 'Woman', 'Q5'].value_counts()

# value_counts() sorts each Series by its own frequencies, so the two results
# need not list the categories in the same order, nor contain the same ones.
# Align both on one shared category list before comparing them: the men's
# order is kept, and any category answered only by women is appended.
categories = man.index.append(woman.index.difference(man.index))
man = man.reindex(categories, fill_value=0)
woman = woman.reindex(categories, fill_value=0)

# Percentage share of each gender within a category (men + women only).
# Every category has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', y=man.index, x=man.values, text=textonbar_man, orientation='h'),
    go.Bar(name='Woman', y=woman.index, x=woman.values, text=textonbar_woman, orientation='h'),
])
# Print the share inside each bar, followed by a literal percent sign.
fig.update_traces(texttemplate='%{text:.3s}%', textposition='inside')
fig.update_layout(barmode='group', title_text='Current role of kaggler by gender', xaxis_title='Counts', yaxis_title='Current role')
fig.show()