# Kaggle Survey - 2021 Analysis Plotly

![](https://miro.medium.com/max/846/1*vdNc_eV8fY9OxikxGW9nWg.png)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from warnings import filterwarnings
# Suppress every warning so library notices do not clutter the notebook output.
filterwarnings('ignore')

# Shared colour palette (hex codes) that the charts in this notebook index or slice into.
colors = ['#B1EDED','#B1B2ED','#1DE7ED','#1DA5ED','#1D50ED','#16548E']

## Load Data

In [None]:
# Location of the survey responses CSV, relative to the notebook's working directory.
responses_csv = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
df = pd.read_csv(responses_csv)

# Preview the first rows of the loaded table as the cell output.
df.head()

## Gender Distribution

In [None]:
# Tally the answers in column Q2, skipping the first row
# (presumably the question-text row rather than a response — TODO confirm).
gender_counts = df['Q2'][1:].value_counts()

# One pie slice per distinct answer, labelled with the answer and its percentage.
gender_pie = go.Pie(
    labels=gender_counts.index,
    values=gender_counts.values,
    textinfo='label+percent',
)

fig = go.Figure(data=[gender_pie])
fig.update_traces(marker=dict(colors=colors[2:]))
fig.update_layout(title_text='Gender Distribution', showlegend=False)
fig.show()

#### Almost 80% of Kagglers are men and only 19% are women.

In [None]:
# Stacked bar chart of respondents per age group (Q1), split by gender (Q2).
#
# value_counts() orders each Series by its own frequency, so the two Series must be
# aligned on a common age-group index before their values are compared position by
# position; otherwise the percentages would mix counts from different age groups.
man = df[df['Q2'] == 'Man']['Q1'].value_counts()
woman = df[df['Q2'] == 'Woman']['Q1'].value_counts()

# Union of the age groups seen for either gender, in sorted label order;
# a group missing for one gender gets a count of 0.
age_groups = man.index.union(woman.index)
man = man.reindex(age_groups, fill_value=0)
woman = woman.reindex(age_groups, fill_value=0)

# Share of each gender within every age group, as a percentage with one decimal.
# Every group in the union has at least one respondent, so the total is never zero.
total = man + woman
textonbar_man = (man / total * 100).round(1).tolist()
textonbar_woman = (woman / total * 100).round(1).tolist()

fig = go.Figure(data=[
    go.Bar(name='Man', x=man.index, y=man.values, text=textonbar_man, marker_color=colors[2]),
    go.Bar(name='Woman', x=woman.index, y=woman.values, text=textonbar_woman, marker_color=colors[3])
])
# Fixed-point format: the SI-prefix format ('.3s') would render values below 1 as e.g. '500m'.
fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')
fig.update_layout(barmode='stack', title_text='Age distribution by gender', xaxis_title='Age', yaxis_title='Counts')
fig.show()

## Country-wise distribution

In [None]:
# Choropleth of respondent counts per country (column Q3).
#
# Strip the ', Islamic Republic of...' suffix from the country name.
# regex=False makes the dots literal on every pandas version; without it the default
# has differed between versions and '...' could be read as three regex wildcards.
df['Q3'] = df['Q3'].str.replace(', Islamic Republic of...', '', regex=False)

# Count respondents per country once, skipping the first row
# (presumably the question-text row rather than a response — TODO confirm).
country_counts = df['Q3'][1:].value_counts()
map_df = pd.DataFrame({
    'country': country_counts.index,
    'count': country_counts.values,
})

fig = px.choropleth(map_df, locations="country", locationmode='country names',
                    hover_name="country", color="count", color_continuous_scale=colors,
                    title='Country wise Distribution of Kagglers')
fig.show()

#### We can see that most Kagglers are from India and the USA.

In [None]:
# Horizontal bar chart of respondents per current role (column Q5).
#
# Count roles once, skipping the first row (presumably the question-text row — TODO confirm),
# then reverse so the most frequent role ends up last in the arrays.
role_counts = df['Q5'][1:].value_counts()
x = role_counts.values[::-1]
y = role_counts.index[::-1]

# Percentage of all counted answers per role; the total is computed once
# instead of being re-summed for every element.
total = x.sum()
percent = [round((i / total) * 100, 1) for i in x]

fig = go.Figure(data=[go.Bar(
            x=x,
            y=y,
            text=percent,
            textposition='inside',
            # Fixed-point format: the SI-prefix format ('.3s') would show 0.4 as '400m'.
            texttemplate='%{text:.1f}%',
            orientation='h',
            marker_color=colors[2]
        )])


fig.update_layout(title='Current role of kagglers', xaxis_title='counts', yaxis_title='Current role')
fig.show()

#### Overall, 26% of Kagglers are students and 14% are data scientists. I think most of the students use Kaggle for learning and want to build a career in data science and machine learning.

In [None]:
# Tally the answers in column Q4, skipping the first row
# (presumably the question-text row rather than a response — TODO confirm).
education_counts = df['Q4'][1:].value_counts()

# One pie slice per distinct answer, labelled with the answer and its percentage.
education_pie = go.Pie(
    labels=education_counts.index,
    values=education_counts.values,
    textinfo='label+percent',
)

fig = go.Figure(data=[education_pie])
fig.update_traces(marker=dict(colors=colors[2:]))
fig.update_layout(title_text='Formal Education attained or plan to attain in next 2 year', showlegend=False)
fig.show()

#### Most Kagglers are pursuing a master's or bachelor's degree, or plan to pursue one in the next two years. It is great to see that 11% of Kagglers have a doctoral degree.

In [None]:
# Funnel-area chart of the answers in column Q6 (titled as coding experience).
#
# Count the answers once, skipping the first row
# (presumably the question-text row rather than a response — TODO confirm).
experience_counts = df['Q6'][1:].value_counts()

fig = px.funnel_area(
    names=experience_counts.index,
    values=experience_counts.values,
    title='Coding Experience',  # spelling of the displayed title corrected
)
# Apply the shared palette in reverse order.
fig.update_traces(marker=dict(colors=colors[::-1]))
fig.show()

#### The largest group of Kagglers (30%) has 1-3 years of coding experience, and 22% have less than 1 year of experience.

In [None]:
# Grouped bar chart: respondents who selected Python (Q7_Part_1) or R (Q7_Part_2),
# broken down by age group (Q1).
df_py = df[(df['Q7_Part_1'] == 'Python')]
df_r = df[(df['Q7_Part_2'] == 'R')]

# value_counts() orders by frequency; sort by the age-group label instead so the
# bars are listed in label order rather than by popularity.
py_by_age = df_py['Q1'].value_counts().sort_index()
r_by_age = df_r['Q1'].value_counts().sort_index()

fig = go.Figure(data=[
    go.Bar(name='Python', x=py_by_age.index, y=py_by_age.values, marker_color=colors[2]),
    go.Bar(name='R', x=r_by_age.index, y=r_by_age.values, marker_color=colors[3])
])

fig.update_layout(barmode='group', title='Kagglers using Python and R on regular basis by Age', xaxis_title='Age', yaxis_title='Counts')
# Keep the categories in label order even if one trace lacks an age group the other has.
fig.update_xaxes(categoryorder='category ascending')
fig.show()

#### Most of the Python users are aged 18-21, while most of the R users are aged 25-29.

In [None]:
# Treemap of the most frequent answer (and its count) in each of the columns at
# positions 21..33 — presumably one column per development-environment option;
# verify the positions against the CSV header.
env_rows = []
for col in df.columns[21:34]:
    # Skip the first row (presumably the question-text row — TODO confirm) and
    # count each column once instead of once for the label and once for the count.
    answers = df[col][1:].value_counts()
    if answers.empty:
        # A column with no non-null answers has no label to plot; indexing [0]
        # on it would raise IndexError.
        continue
    env_rows.append((answers.index[0], answers.values[0]))

df_env = pd.DataFrame(env_rows, columns=['dev_env', 'counts'])
df_env.sort_values(by='counts', ascending=False, inplace=True)

fig = px.treemap(df_env, path=[px.Constant("all"),'dev_env'], values='counts', color='counts', color_continuous_scale=colors)
fig.update_traces(root_color="lightgrey")
fig.update_layout(title='Development environment used by kagglers')
fig.show()

#### About 16k Kagglers use Jupyter Notebook and about 10k use Visual Studio Code on a regular basis.

In [None]:
# Grouped bar chart comparing, for the columns at positions 34..50 (presumably one
# column per cloud-notebook option — verify against the CSV header), how many
# students and how many data scientists gave the most frequent answer.
df_std = df[df['Q5'] == 'Student']
df_ds = df[df['Q5'] == 'Data Scientist']


def _top_answer(responses):
    """Return (label, count) of the most frequent non-null value, or (None, 0) if there is none."""
    counts = responses.value_counts()
    if counts.empty:
        return None, 0
    return counts.index[0], counts.values[0]


note_rows = []
for col in df.columns[34:51]:
    # df_std / df_ds are already filtered by role, so every row is a respondent:
    # no leading row is sliced off here (doing so would drop one real answer).
    std_name, std_count = _top_answer(df_std[col])
    ds_name, ds_count = _top_answer(df_ds[col])
    if std_name is None and ds_name is None:
        # Nobody in either group answered this column; nothing to plot.
        continue
    # If only one group answered, reuse its label so the other group shows a zero bar.
    note_rows.append((
        std_name if std_name is not None else ds_name,
        std_count,
        ds_name if ds_name is not None else std_name,
        ds_count,
    ))

df_note = pd.DataFrame(note_rows, columns=['std_notebook', 'std_counts', 'ds_notebook', 'ds_counts'])
df_note.sort_values(by=['std_counts', 'ds_counts'], ascending=False, inplace=True)

fig = go.Figure(data=[
    go.Bar(name='Student', x=df_note['std_notebook'], y=df_note['std_counts'], marker_color=colors[2]),
    go.Bar(name='Data Scientist', x=df_note['ds_notebook'], y=df_note['ds_counts'], marker_color=colors[3])
])


fig.update_layout(barmode='group', title='Cloud notebooks used by kagglers', xaxis_title='Notebook', yaxis_title='Counts',
                 height=600, width=900)
fig.show()

#### Most students and data scientists use Colab notebooks. Students outnumber data scientists for many of the notebooks, but for Azure Notebooks, Amazon SageMaker Studio Notebooks and Databricks Collaborative Notebooks the data scientists are ahead.

## Thank you 