# Importing Libraries & Datasets

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# low_memory=False makes pandas parse the file in one pass, so each column's
# dtype is inferred from all rows rather than chunk by chunk.
raw_data = pd.read_csv(
    '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    low_memory=False,
)
raw_data.shape

In [None]:
# Preview the first three rows of the raw file (row 0 is later split off as
# the per-column description).
raw_data.head(n=3)

In [None]:
# Row 0 is kept as the per-column description (presumably the question text);
# the '- Selected Choice' marker is removed literally and the text trimmed.
data_description = (
    raw_data.loc[0]
    .str.replace('- Selected Choice', '', regex=False)
    .str.strip()
)

# Every row from label 1 onwards is treated as a survey response.
data = raw_data.loc[1:]

# 1. Columns by answer type

In [None]:
# A column whose name contains an underscore is treated as one part of a
# multiple-answer question.
multiple_columns = [col for col in data.columns if '_' in col]

# The group name is the text before the first underscore; dict.fromkeys
# removes duplicates while keeping first-seen order.
multiple_groups = list(dict.fromkeys(col.split('_')[0]
                                     for col in multiple_columns))

# Everything else is a single-answer column.
single_columns = [col for col in data.columns if '_' not in col]

In [None]:
# Report how many columns / groups ended up in each category.
print("Single columns:", len(single_columns))
print("Multiple groups:", len(multiple_groups))
print("Multiple columns:", len(multiple_columns))

In [None]:
# Show the columns that belong to the first multiple-answer group.
print(f"The group of answers to question '{multiple_groups[0]}'\n")
data.filter(like=multiple_groups[0]).head()

# 2. Duration of answers (are there outliers?)

In [None]:
# Cast the duration column to integers (it must be convertible, otherwise
# astype raises), then express it in minutes with two decimals.
duration = data['Time from Start to Finish (seconds)'].astype(int)
duration_in_minutes = duration.div(60).round(2)

In [None]:
# Summary statistics with extra percentiles at both tails of the distribution.
duration_in_minutes.describe(
    percentiles=[0.01, 0.05, 0.10, 0.50, 0.70, 0.90, 0.95, 0.97, 0.99]
)

In [None]:
# Responses that took strictly longer than this many minutes are flagged slow.
threshold_duration = 40  # minutes
is_slowly = duration_in_minutes.gt(threshold_duration)

In [None]:
# Histogram of the responses that are NOT flagged as slow.
plt.figure(figsize=(12,4))
sns.histplot(duration_in_minutes[~is_slowly])
# The title is built from threshold_duration so it cannot drift out of sync
# with the cut-off used to compute is_slowly.
plt.title(f"Time from Start to Finish (in minutes), Less than {threshold_duration}")
plt.xlabel("")
plt.show()

In [None]:
# Summing a boolean mask counts its True entries.
print("Normal answers:", (~is_slowly).sum())
print("Slowly answers:", is_slowly.sum())

# 3. Columns with one answer

In [None]:
def single_answer_stats(column_name, show_slowly=True):
    """Print the answer distribution of one single-answer column.

    Reads the module-level ``data`` (responses), ``data_description``
    (per-column description) and ``is_slowly`` (boolean mask over ``data``).

    Parameters
    ----------
    column_name : str
        Column of ``data`` to summarise; its values are stripped and missing
        values are shown as '--- None ---'.
    show_slowly : bool, default True
        When truthy, add 'Normal' and 'Slowly' count columns split by
        ``is_slowly``.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    none_label = '--- None ---'
    max_len = 30

    answers = data[column_name].str.strip().fillna(none_label)
    answers_counted = answers.value_counts(dropna=False)

    result = pd.DataFrame(index=answers_counted.index)
    result['ALL %'] = (answers_counted / answers_counted.sum()) \
                          .mul(100).round(1).map("{} %".format)
    result['Count'] = answers_counted

    if show_slowly:
        def count_subset(mask):
            # An answer absent from the subset is a zero count, not a missing
            # value; reindexing keeps the column integer and free of NaN.
            return (answers[mask].value_counts()
                    .reindex(answers_counted.index, fill_value=0)
                    .astype(int))

        result['  |'] = '  |'
        result['Normal'] = count_subset(~is_slowly)
        result['|'] = '|'
        result['Slowly'] = count_subset(is_slowly)

    # Shorten long answer texts so the printed table stays readable.
    result.index = [label[:max_len] + " [...]" if len(label) > max_len
                    else label
                    for label in result.index.tolist()]
    result.index.name = 'Answers:'

    print()
    print(">>> ({}) {}\n".format(column_name, data_description.loc[column_name]))
    print(result)
    print()

## 3.1 Automatic check with/without the duration split (normal vs. slow)

In [None]:
# The first single column is skipped (presumably the duration column, which
# is analysed separately). Pass show_slowly=False to hide the split.
for check_column in single_columns[1:]:
    single_answer_stats(check_column, show_slowly=True)

## 3.2 Manual check

In [None]:
def single_answer_crosstab(ind_name, col_name='Q1', is_normalize=False):
    """Cross-tabulate two single-answer columns of the module-level ``data``.

    Parameters
    ----------
    ind_name : str
        Column whose answers become the rows of the table.
    col_name : str, default 'Q1'
        Column whose answers become the columns of the table.
    is_normalize : bool, default False
        When truthy, each column is normalised to sum to 100 and values are
        rounded to two decimals; otherwise raw counts plus an 'All' row total
        are returned.

    Returns
    -------
    (list of str, pandas.DataFrame)
        Two description lines (built from ``data_description``) and the table,
        with rows ordered by their total across columns, largest first.
    """
    fillna_none = "--- None ---"
    # Coerce once so the normalisation passed to crosstab and the
    # post-processing below can never disagree for truthy non-bool flags.
    normalize = bool(is_normalize)

    result_desc = [">>> ({}) {}".format(name, data_description.loc[name])
                   for name in (ind_name, col_name)]

    result = pd.crosstab(data[ind_name].fillna(fillna_none),
                         data[col_name].fillna(fillna_none),
                         normalize='columns' if normalize else False)
    # 'All' is needed for ordering in both modes.
    result['All'] = result.sum(axis=1)
    result = result.sort_values(by='All', ascending=False)

    if normalize:
        # A sum of per-column shares is not meaningful to display: drop it
        # and express the shares as percentages.
        result = result.drop(columns='All').mul(100).round(2)

    return result_desc, result

In [None]:
# Raw counts of Q2 answers broken down by the default column (Q1).
select_column = "Q2"
result_desc, result = single_answer_crosstab(ind_name=select_column)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
# Same table, expressed as percentages within each Q1 column.
select_column = "Q2"
result_desc, result = single_answer_crosstab(ind_name=select_column,
                                             is_normalize=True)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
# Share of "Woman" vs "Man" answers within each Q1 category; other Q2 answers
# are left out through hue_order.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=data.sort_values(by="Q1"), x="Q1", hue="Q2",
             hue_order=["Woman", "Man"],
             multiple="fill", shrink=.75, ax=ax)
ax.set_title("Distribution by Gender (Man and Woman) and Age")
ax.set_xlabel("")
plt.show()

In [None]:
# Q8 answers as percentages within each Q2 answer.
select_column = "Q8"
explain_by = "Q2"
result_desc, result = single_answer_crosstab(ind_name=select_column,
                                             col_name=explain_by,
                                             is_normalize=True)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
# Q5 answers as percentages within each Q2 answer.
select_column = "Q5"
explain_by = "Q2"
result_desc, result = single_answer_crosstab(ind_name=select_column,
                                             col_name=explain_by,
                                             is_normalize=True)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
# Raw counts of Q5 answers per Q1 category, rows ordered by their total.
select_column = "Q5"
explain_by = "Q1"
result_desc, result = single_answer_crosstab(ind_name=select_column,
                                             col_name=explain_by)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
max_roles = 10

# Rank the roles directly from the data instead of reusing whatever `result`
# the previous cell left behind: the cell then works on its own, and the
# '--- None ---' placeholder for missing answers can no longer take up one of
# the max_roles slots (value_counts ignores missing values).
top_roles = data["Q5"].value_counts().index[:max_roles].tolist()

plt.figure(figsize=(12, 12))
sns.histplot(x="Q1", hue="Q5", data=data.sort_values(by="Q1"),
             hue_order=top_roles,
             multiple="fill", shrink=.75)
plt.title("Distribution by Age and current role (or most recent title if retired)")
plt.xlabel("")
plt.show()

In [None]:
# Q3 answers as percentages within each Q2 answer.
select_column = "Q3"
explain_by = "Q2"
result_desc, result = single_answer_crosstab(ind_name=select_column,
                                             col_name=explain_by,
                                             is_normalize=True)

print("\n" + "\n".join(result_desc) + "\n")
result

In [None]:
# Put the 'Man' and 'Woman' columns of the current `result` side by side with
# their difference, smallest difference first.
share_man = result['Man']
share_woman = result['Woman']
pd.DataFrame({
    'Man': share_man,
    'Woman': share_woman,
    'Diff (Man - Woman)': share_man - share_woman,
}).sort_values(by='Diff (Man - Woman)')

# 4. Multiple answer columns

In [None]:
def multiple_answer_stats(group_name, is_sort=False):
    """Print how often each option of a multiple-answer group was selected.

    Reads the module-level ``data`` (responses) and ``data_description``
    (per-column description).

    Parameters
    ----------
    group_name : str
        Group prefix such as 'Q7'. Columns named exactly ``group_name`` or
        starting with ``group_name + '_'`` are used, so 'Q1' does not pick up
        'Q10_Part_1'.
    is_sort : bool, default False
        When truthy, rows are ordered by count, largest first; otherwise they
        keep the column order of ``data``.

    Returns
    -------
    None
        The table is printed, not returned.

    Raises
    ------
    ValueError
        If no column of ``data`` belongs to ``group_name``.
    """
    none_label = '--- None ---'
    max_len = 30

    # Match on the full prefix rather than on a substring.
    prefix = group_name + '_'
    group_columns = [col for col in data.columns
                     if col == group_name or str(col).startswith(prefix)]
    if not group_columns:
        raise ValueError(
            "No columns found for group '{}'".format(group_name))
    group_data = data[group_columns]

    # Each row is labelled with the column's most frequent answer, or with a
    # placeholder when nobody answered.
    labels = []
    for col in group_columns:
        answers = group_data[col].dropna()
        label = answers.value_counts().index[0] if len(answers) else none_label
        label = str(label).strip()
        labels.append(label[:max_len] + " [...]" if len(label) > max_len
                      else label)

    # count() yields integer counts directly, so the percentages below do not
    # depend on any implicit dtype conversion.
    counts = group_data.count()
    share = (counts / counts.sum()).mul(100).round(2).map(" {} %".format)

    result = pd.DataFrame({'code': group_columns,
                           'count': counts.to_numpy(),
                           ' % ': share.to_numpy()},
                          index=pd.Index(labels, name='Answers:'))

    if is_sort:
        result = result.sort_values(by='count', ascending=False)

    first_question = group_columns[0]
    group_title = data_description.loc[first_question]

    print()
    print(">>> ({}) {}\n".format(first_question, group_title))
    print(result)
    print()

In [None]:
# First five rows of the first multiple-answer group, for reference.
data.filter(like=multiple_groups[0]).head(n=5)

## 4.1 Automatic check with/without sorting

In [None]:
# Print one table per multiple-answer group; pass is_sort=True to order each
# table by count instead of by column order.
for x_group in multiple_groups:
    multiple_answer_stats(x_group, is_sort=False)

## 4.2 Manual check

In [None]:
def multiple_answer_plot(group_name, explain_by='Q1', n=10):
    """Plot and tabulate a multiple-answer group against another column.

    Reads the module-level ``data`` (responses) and ``data_description``
    (per-column description). Draws a filled histogram with ``explain_by`` on
    the x axis and the selected answers as hue, then prints the group title.

    Parameters
    ----------
    group_name : str
        Group prefix such as 'Q39'. Columns named exactly ``group_name`` or
        starting with ``group_name + '_'`` are used, so 'Q1' does not pick up
        'Q10_Part_1'.
    explain_by : str, default 'Q1'
        Column of ``data`` used for the x axis and for the table columns.
    n : int, default 10
        Number of most frequent answers passed to the plot as hue levels.

    Returns
    -------
    pandas.DataFrame
        Counts per answer (rows) and ``explain_by`` value (last column level),
        plus an 'All' total, ordered by total, largest first. Combinations
        that never occur hold 0.

    Raises
    ------
    ValueError
        If no column of ``data`` belongs to ``group_name``.
    """
    max_len = 30

    # Match on the full prefix rather than on a substring.
    prefix = group_name + '_'
    group_columns = [col for col in data.columns
                     if col == group_name or str(col).startswith(prefix)]
    if not group_columns:
        raise ValueError(
            "No columns found for group '{}'".format(group_name))

    group_data = data[group_columns].apply(lambda col: col.str.strip())
    first_question = group_columns[0]
    group_title = data_description.loc[first_question]

    # Long format: one row per (respondent, selected answer). Rows without an
    # answer or without an explain_by value are dropped.
    long_data = (group_data.join(data[explain_by])
                 .melt(id_vars=[explain_by],
                       value_vars=group_columns,
                       ignore_index=False)
                 .dropna()
                 .sort_values(by=explain_by)
                 .reset_index(drop=True))

    top_answers = long_data['value'].value_counts().nlargest(n).index.tolist()

    plt.figure(figsize=(12, 6))
    sns.histplot(x=explain_by, data=long_data,
                 hue='value', hue_order=top_answers,
                 multiple="fill", shrink=.75)
    plt.xlabel("")
    plt.show()

    # fill_value=0 keeps the counts integer: a combination that never occurs
    # is a zero count, not a missing value.
    result = (long_data.groupby([explain_by, 'value'])
              .agg(['count'])
              .unstack(level=0, fill_value=0))
    result['All'] = result.sum(axis=1)
    result = result.sort_values(by='All', ascending=False)

    # Shorten long answer texts so the table stays readable.
    result.index = [label[:max_len] + " [...]" if len(label) > max_len
                    else label
                    for label in result.index.str.strip().tolist()]
    result.index.name = 'Answers:'

    print()
    print(">>> ({}) {}\n".format(first_question, group_title))

    return result

In [None]:
# Q39 answers by Q1, limited to the four most frequent answers.
group_name = "Q39"
max_items = 4
multiple_answer_plot(group_name=group_name, explain_by='Q1', n=max_items)

In [None]:
# Q14 answers by Q1, limited to the five most frequent answers.
group_name = "Q14"
max_items = 5
multiple_answer_plot(group_name=group_name, explain_by='Q1', n=max_items)