# 1. Import & Load & Extract

In [None]:
import numpy as np
import pandas as pd
import json

# Let displayed text cells show up to 150 characters before being truncated.
pd.options.display.max_colwidth = 150

In [None]:
# Locations of the raw response exports, one CSV per survey year.
path_to_2021 = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
path_to_2020 = "../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
path_to_2019 = "../input/kaggle-survey-2019/multiple_choice_responses.csv"
path_to_2018 = "../input/kaggle-survey-2018/multipleChoiceResponses.csv"

# The files are read in the same order as the names on the left-hand side.
# low_memory=False turns off pandas' chunked parsing, which can otherwise
# infer mixed dtypes within a column.
data_2021, data_2020, data_2019, data_2018 = (
    pd.read_csv(survey_path, low_memory=False)
    for survey_path in (path_to_2021, path_to_2020, path_to_2019, path_to_2018)
)

# JSON file with column information for the merged dataset; its
# 'single columns' and 'group columns' entries are read later in the notebook.
path_to_cols_info = "../input/dataset-kaggle-survey-2018-2021/info_data/columns_info.json"

# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform's locale-dependent default encoding.
with open(path_to_cols_info, "r", encoding="utf-8") as cols_info_file:
    diff_cols = json.load(cols_info_file)

# 2. Check single-column differences

In [None]:
# Entry of the columns-info JSON stored under the 'single columns' key.
diff_sa_cols = diff_cols.get('single columns')
# Display it as a table, leaving out the first row.
pd.DataFrame(diff_sa_cols).iloc[1:]

In [None]:
# Frequency of each answer in column Q8 of the 2021 survey, shown as a table.
data_2021['Q8'].value_counts().to_frame()

In [None]:
# Frequency of each answer in column Q19 of the 2019 survey, shown as a table.
data_2019['Q19'].value_counts().to_frame()

In [None]:
# Frequency of each answer in column Q18 of the 2018 survey, shown as a table.
data_2018['Q18'].value_counts().to_frame()

In [None]:
# Answer counts for column Q41 of the 2021 survey; sort=False leaves the
# counts unsorted by frequency.
data_2021['Q41'].value_counts(sort=False).to_frame()

In [None]:
# Answer counts for column Q12_MULTIPLE_CHOICE of the 2018 survey;
# sort=False leaves the counts unsorted by frequency.
data_2018['Q12_MULTIPLE_CHOICE'].value_counts(sort=False).to_frame()

# 3. Check group-column differences

In [None]:
def show_ga_diff(num_group):
    """Return the stored differences of one column group as a DataFrame.

    Args:
        num_group: Key of the group in the module-level ``diff_ga_cols``
            mapping. Integers are converted to their string form first.

    Returns:
        A DataFrame built from the entry stored under that key, or ``False``
        when ``diff_ga_cols`` has no such key.
    """
    group_key = str(num_group) if isinstance(num_group, int) else num_group

    # Guard clause: unknown groups are reported with False, not an exception.
    if group_key not in diff_ga_cols:
        return False

    return pd.DataFrame(diff_ga_cols.get(group_key))

In [None]:
# Entry of the columns-info JSON stored under the 'group columns' key;
# show_ga_diff looks groups up in this mapping.
diff_ga_cols = diff_cols.get('group columns')
n_groups = len(diff_ga_cols)
print(n_groups)

In [None]:
# Differences recorded for group 0 (the integer is converted to the key "0").
show_ga_diff(num_group=0)

In [None]:
# Differences recorded for group 1 (the integer is converted to the key "1").
show_ga_diff(num_group=1)

In [None]:
# Differences recorded for group 20 (the integer is converted to the key "20").
show_ga_diff(num_group=20)

# 4. Merged data

In [None]:
def get_columns(df):
    """Classify the columns of ``df`` by the "SA" / "GA" markers in their names.

    Matching is by substring: any column whose label contains "SA" counts as
    a single-answer column and any label containing "GA" as a group column.

    Args:
        df: DataFrame whose column labels are inspected.

    Returns:
        A tuple ``(single_cols, multiple_cols, multiple_groups)`` where
        ``multiple_groups`` holds the distinct prefixes of ``multiple_cols``
        (the text before the first underscore) in first-seen order.
    """
    def _labels_containing(marker):
        return df.filter(like=marker).columns.to_list()

    single_cols = _labels_containing("SA")
    multiple_cols = _labels_containing("GA")

    # dict.fromkeys drops repeated prefixes while keeping first-seen order.
    group_prefixes = (label.partition("_")[0] for label in multiple_cols)
    multiple_groups = list(dict.fromkeys(group_prefixes))

    return single_cols, multiple_cols, multiple_groups

In [None]:
# Header file of the merged dataset: its first data row is turned into a
# {column name: text} lookup used by the title helpers.
path_to_merged_header = "../input/dataset-kaggle-survey-2018-2021/kaggle_survey_2018-2021_header.csv"
data_head = pd.read_csv(path_to_merged_header, nrows=1)
data_description = data_head.loc[0].to_dict()

# Merged 2018-2021 responses.
path_to_merged = "../input/dataset-kaggle-survey-2018-2021/kaggle_survey_2018-2021_data.csv"
data = pd.read_csv(path_to_merged, low_memory=False)

# Column names grouped by the "SA" / "GA" markers they contain.
single_cols, multiple_cols, multiple_groups = get_columns(data)

## 4.1. Single answers

In [None]:
def show_sa_title(col_name=None):
    """Print the description of single-answer columns.

    Args:
        col_name: When given (truthy), only the column with this name is
            printed; otherwise every column in ``single_cols`` is printed.

    Each entry is printed as a blank line followed by ``>>> (column) text``,
    with the text looked up in ``data_description``. A final blank line is
    always printed.
    """
    if col_name:
        selected_cols = [sa_col for sa_col in single_cols if sa_col == col_name]
    else:
        selected_cols = single_cols

    for sa_col in selected_cols:
        print()
        print(">>> ({}) {}".format(sa_col, data_description.get(sa_col)))

    print()


def show_sa_info(col_name, is_title=True, is_sort=False, is_norm=False):
    """Cross-tabulate the answers of one column of ``data`` by survey year.

    Args:
        col_name: Column of the module-level ``data`` frame to tabulate.
        is_title: When truthy, print the column's description via
            ``show_sa_title`` before returning.
        is_sort: When truthy, order the answer rows by the ``'All'`` column,
            largest first. The ``'All'`` totals row (present in the count
            table) is kept as the last row instead of being sorted to the top.
        is_norm: When truthy, normalise every column to sum to 100 and return
            percentages rounded to two decimals instead of raw counts.

    Returns:
        The crosstab as a DataFrame (rows: answers, columns: years plus
        ``'All'``), or ``False`` when ``col_name`` is not a column of ``data``.
    """
    if col_name not in data.columns:
        return False

    col_years = "Year"
    how_normalize = 'columns' if is_norm else False

    result = pd.crosstab(
        data[col_name],
        data[col_years],
        rownames=['Answer'],
        margins=True,
        margins_name='All',
        normalize=how_normalize,
    )

    if how_normalize:
        # Fractions -> percentages.
        result = result.mul(100).round(2)

    if is_sort and 'All' in result.columns:
        # Sort only the answer rows; the totals row always holds the largest
        # value and would otherwise float to the top of the table.
        answer_rows = result.drop(index='All', errors='ignore')
        row_order = answer_rows.sort_values('All', ascending=False).index.tolist()
        if 'All' in result.index:
            row_order.append('All')
        result = result.loc[row_order]

    if is_title:
        show_sa_title(col_name)

    return result

In [None]:
# List every single-answer column with its description.
show_sa_title(col_name=None)

In [None]:
# Raw answer counts per year for column SA1.
show_sa_info(col_name='SA1')

In [None]:
# Per-year percentages for column SA1, ordered by the 'All' column.
show_sa_info(col_name='SA1', is_sort=True, is_norm=True)

## 4.2. Group answers

In [None]:
def show_ga_title(group_name=None):
    """Print the shared question title of column groups.

    Args:
        group_name: When given (truthy), only this group is printed;
            otherwise every group in ``multiple_groups`` is printed.

    The title of a group is the description of its first column, cut off
    before the text ``(Select all that apply)``. Each entry is printed as a
    blank line followed by ``>>> (group) title``; a final blank line is
    always printed.
    """
    select_marker = '(Select all that apply)'

    for ga_group in multiple_groups:
        if not group_name or group_name == ga_group:
            member_cols = data.filter(like=(ga_group + "_")).columns.to_list()
            question_text = data_description.get(member_cols[0])
            # Keep only the part in front of the marker (the whole text if
            # the marker is absent).
            title = question_text.partition(select_marker)[0]
            print()
            print(">>> ({}) {}".format(ga_group, title))

    print()

        
def show_ga_info(group_name, is_title=True, is_sort=False, is_norm=False):
    """Cross-tabulate all answers of one column group by survey year.

    The values of every column whose name contains ``group_name + "_"`` are
    stacked into one long column before counting.

    Args:
        group_name: Group identifier; must be listed in ``multiple_groups``.
        is_title: When truthy, print the group's title via ``show_ga_title``
            before returning.
        is_sort: When truthy, order the answer rows by the ``'All'`` column,
            largest first. The ``'All'`` totals row (present in the count
            table) is kept as the last row instead of being sorted to the top.
        is_norm: When truthy, normalise every column to sum to 100 and return
            percentages rounded to two decimals instead of raw counts.
            NOTE(review): the denominator is the number of answers counted in
            that column, not the number of respondents — confirm this is the
            intended reading for multi-select questions.

    Returns:
        The crosstab as a DataFrame (rows: answers, columns: years plus
        ``'All'``), or ``False`` when ``group_name`` is not a known group.
    """
    if group_name not in multiple_groups:
        return False

    col_years = "Year"
    cols_list = data.filter(like=(group_name + "_")).columns.to_list()
    # Long format: one row per (respondent, group column) with the answer in
    # the 'value' column.
    melted = pd.melt(data, id_vars=[col_years], value_vars=cols_list)

    how_normalize = 'columns' if is_norm else False

    result = pd.crosstab(
        melted['value'],
        melted[col_years],
        rownames=['Answer'],
        margins=True,
        margins_name='All',
        normalize=how_normalize,
    )

    if how_normalize:
        # Fractions -> percentages.
        result = result.mul(100).round(2)

    if is_sort and 'All' in result.columns:
        # Sort only the answer rows; the totals row always holds the largest
        # value and would otherwise float to the top of the table.
        answer_rows = result.drop(index='All', errors='ignore')
        row_order = answer_rows.sort_values('All', ascending=False).index.tolist()
        if 'All' in result.index:
            row_order.append('All')
        result = result.loc[row_order]

    if is_title:
        show_ga_title(group_name)

    return result

In [None]:
# List every column group with its question title.
show_ga_title(group_name=None)

In [None]:
# Raw answer counts per year for group GA0.
x_group = "GA0"
show_ga_info(group_name=x_group)

In [None]:
# Per-year percentages for group GA0.
x_group = "GA0"
show_ga_info(group_name=x_group, is_norm=True)

In [None]:
# Per-year percentages for group GA21, ordered by the 'All' column.
x_group = "GA21"
show_ga_info(group_name=x_group, is_sort=True, is_norm=True)