In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

# Load the raw survey export. dtype=object turns off pandas' type inference,
# so every column keeps its values exactly as they appear in the CSV.
df = pd.read_csv(
    filepath_or_buffer='../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    dtype=object,
)

In [None]:
# Row 0 is kept apart as `questions`; every remaining row is treated as an answer.
questions = df.iloc[:1]
answers = df.iloc[1:]

# Give the demographic columns readable names, then convert the completion
# time from seconds (stored as text) to hours rounded to two decimals.
response = (
    answers
    .rename(columns={
        'Time from Start to Finish (seconds)': 'time_hours',
        'Q1': 'age',
        'Q2': 'gender',
        'Q3': 'country',
        'Q4': 'education',
        'Q5': 'role',
        'Q6': 'code_exp',
    })
    .assign(time_hours=lambda frame: (frame['time_hours'].astype(int) / 3600).round(2))
)

* The __Time from Start to Finish (seconds)__ column has a widely varying distribution, with some very large values.
* The total time taken is converted into hours (rounded to two decimals) and stored as `time_hours` for aggregation.
* A __time_category__ column is created by binning `time_hours` into three equal-width bins labelled low, medium and high.

In [None]:
# Work on a separate frame that carries an extra `time_category` column:
# `time_hours` split into three equal-width bins labelled low / medium / high.
response_time = response.assign(
    time_category=pd.cut(
        response['time_hours'],
        bins=3,
        labels=['low', 'medium', 'high'],
    )
)

In [None]:
# Response with the largest time_hours inside the 'low' bucket
(
    response_time[response_time['time_category'].eq('low')]
    .sort_values('time_hours', ascending=False)
    .iloc[:1]
)

In [None]:
# Response with the largest time_hours inside the 'medium' bucket
(
    response_time[response_time['time_category'].eq('medium')]
    .sort_values('time_hours', ascending=False)
    .iloc[:1]
)

In [None]:
# Response with the largest time_hours inside the 'high' bucket
(
    response_time[response_time['time_category'].eq('high')]
    .sort_values('time_hours', ascending=False)
    .iloc[:1]
)

In [None]:
# How many responses landed in each time_category bucket (low / medium / high)
response_time.time_category.value_counts()

## Time taken to complete the survey by different age groups

In [None]:
# Box plot of completion time per age group, one coloured box per time bucket
px.box(
    data_frame=response_time,
    x='age',
    y='time_hours',
    color='time_category',
    color_discrete_sequence=px.colors.qualitative.Safe,
    template="ggplot2",
)

## Time taken to complete the survey by different roles

In [None]:
# Box plot of completion time per role, one coloured box per time bucket
px.box(
    data_frame=response_time,
    x='role',
    y='time_hours',
    color='time_category',
    color_discrete_sequence=px.colors.qualitative.Safe,
    template="ggplot2",
)

## Time taken by various demographics to complete the survey

In [None]:
# 2x2 grid of box plots of completion time (hours) split by time bucket:
#   (1,1) age group   (1,2) gender
#   (2,1) role        (2,2) education
# shared_yaxes=True makes the subplots in each row share one y-axis.
fig = make_subplots(
    rows=2, cols=2,
    row_heights=[0.5, 0.5],
    column_widths=[0.5, 0.5],
    shared_yaxes=True,
    vertical_spacing=0.1)

# (column in response_time, legend suffix, subplot row, subplot col).
# The order here fixes the trace order and therefore the legend order.
panels = [
    ('age', 'age group', 1, 1),
    ('role', 'roles', 2, 1),
    ('gender', 'gender', 1, 2),
    ('education', 'education', 2, 2),
]

# One trace per (demographic, time bucket). Building the legend name from the
# bucket keeps the label and the plotted data in sync; the hand-written
# version labelled the low-time gender trace "high time: gender".
for column, label, row, col in panels:
    for category in ('high', 'medium', 'low'):
        subset = response_time.loc[response_time['time_category'] == category]
        fig.add_trace(
            go.Box(
                x=subset[column],
                y=subset['time_hours'],
                name=f"{category} time: {label}"),
            row=row, col=col)

# Set theme, figure height and margins in layout
fig.update_layout(
    template="ggplot2",
    height=800,
    margin=dict(r=5, t=25, b=40, l=5))
# Keep the category tick labels visible on every subplot's x-axis
fig.update_xaxes(showticklabels=True)
# Fixed y-range (in hours) applied to all subplots
fig.update_yaxes(range=[0, 900])
fig.show()