# Visualization of responses using interactive treemaps.

In [None]:
# packages
import numpy as np
import pandas as pd

import plotly.express as px

In [None]:
# Load only the first data row of the survey file; it is used below as the
# source of the question descriptions.
df_desc = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    nrows=1,
    low_memory=False,
)
# Display the descriptions.
df_desc

In [None]:
# Load the complete survey and discard the row labelled 0, which carries the
# detailed question descriptions instead of an actual response.
df = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    low_memory=False,
).drop(index=0)
df.head()

# Frequency evaluations

In [None]:
# Start with age (question Q1) and print the wording of the question.
sel_question = 'Q1'
print(df_desc.loc[0, sel_question])

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Age (Q1)',
)
fig.show()

In [None]:
# Gender is question Q2; print the wording of the question.
sel_question = 'Q2'
print(df_desc.loc[0, sel_question])

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Gender (Q2)',
)
fig.show()

In [None]:
# Country is question Q3; print the wording of the question.
sel_question = 'Q3'
print(df_desc.loc[0, sel_question])

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Country (Q3)',
)
fig.show()

In [None]:
# Education is question Q4; print the wording of the question.
sel_question = 'Q4'
print(df_desc.loc[0, sel_question])

In [None]:
# A little issue: this question has missing values.
# Count them with the vectorized Series method rather than the built-in sum(),
# which would iterate over the column element by element in Python.
print('Missing entries:', df[sel_question].isna().sum())
# The missing entries have to be imputed, otherwise the plot won't work;
# they are replaced by the literal string 'NA'.
df[sel_question] = df[sel_question].fillna('NA')

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Education (Q4)',
)
fig.show()

In [None]:
# Role title is question Q5; print the wording of the question.
sel_question = 'Q5'
print(df_desc.loc[0, sel_question])

In [None]:
# Missing value handling: report how many answers are missing, using the
# vectorized Series method rather than the built-in sum() over the column.
print('Missing entries:', df[sel_question].isna().sum())
# Replace the missing answers with the literal string 'NA'.
df[sel_question] = df[sel_question].fillna('NA')

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Role title (Q5)',
)
fig.show()

In [None]:
# Coding experience is question Q6; print the wording of the question.
sel_question = 'Q6'
print(df_desc.loc[0, sel_question])

In [None]:
# Missing value handling: report how many answers are missing, using the
# vectorized Series method rather than the built-in sum() over the column.
print('Missing entries:', df[sel_question].isna().sum())
# Replace the missing answers with the literal string 'NA'.
df[sel_question] = df[sel_question].fillna('NA')

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Coding Experience (Q6)',
)
fig.show()

In [None]:
# Yearly compensation is question Q24; print the wording of the question.
sel_question = 'Q24'
print(df_desc.loc[0, sel_question])

In [None]:
# Missing value handling; here we have quite a lot of them.
# Count with the vectorized Series method rather than the built-in sum(),
# which would iterate over the column element by element in Python.
print('Missing entries:', df[sel_question].isna().sum())
# Replace the missing answers with the literal string 'NA'.
df[sel_question] = df[sel_question].fillna('NA')

In [None]:
# Treemap showing how often each answer to the selected question occurs.
fig = px.treemap(
    data_frame=df,
    path=[sel_question],
    title='Yearly Compensation (Q24)',
)
fig.show()

# Combined view on two features

### Let's try to combine two features in one plot in a hierarchical way:

In [None]:
# Two-level hierarchy: education (Q4) as the outer level, gender (Q2) nested inside.
fig = px.treemap(
    data_frame=df,
    path=['Q4', 'Q2'],
    title='Gender by Education',
)
fig.show()

#### Click into the plot to drill down. Double-click to "zoom out" again.

In [None]:
# Two-level hierarchy: role (Q5) as the outer level, gender (Q2) nested inside.
fig = px.treemap(
    data_frame=df,
    path=['Q5', 'Q2'],
    title='Gender by Role',
)
fig.show()

In [None]:
# Two-level hierarchy: role (Q5) as the outer level, education (Q4) nested inside.
fig = px.treemap(
    data_frame=df,
    path=['Q5', 'Q4'],
    title='Education by Role',
)
fig.show()

In [None]:
# Two-level hierarchy: role (Q5) as the outer level, country (Q3) nested inside.
fig = px.treemap(
    data_frame=df,
    path=['Q5', 'Q3'],
    title='Country by Role',
)
fig.show()

#### At first glance, it seems that in every role category most respondents are from India. There is one exception, however: the Statisticians.

In [None]:
# Keep only the respondents whose role (Q5) is 'Statistician' and show
# how they are distributed over countries (Q3).
df_stats = df.loc[df['Q5'] == 'Statistician']
fig = px.treemap(
    data_frame=df_stats,
    path=['Q3'],
    title='Statisticians by Country',
)
fig.show()

# Three levels

### We can also easily arrange three features in one plot. However, you will need the drill-down feature to really benefit from these plots.

In [None]:
# Three-level hierarchy: role (Q5) => age (Q1) => coding experience (Q6).
fig = px.treemap(
    data_frame=df,
    path=['Q5', 'Q1', 'Q6'],
    title='Role | Age | Coding Experience',
)
fig.show()

In [None]:
# Three-level hierarchy: country (Q3) => role (Q5) => yearly compensation (Q24).
fig = px.treemap(
    data_frame=df,
    path=['Q3', 'Q5', 'Q24'],
    title='Country | Role | Yearly Compensation',
)
fig.show()

If you are interested in an evaluation of the multiple-choice questions, such as the usage of programming languages, ML frameworks, IDEs, etc., please also check out the following notebook: [https://www.kaggle.com/docxian/2020-kaggle-survey-multiple-choice-correlations](https://www.kaggle.com/docxian/2020-kaggle-survey-multiple-choice-correlations)