# 1. Importing Libraries & Datasets

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
def clean_data(df):
    """Rename selected answer labels in ``df`` and return the same frame.

    For each of the columns Q2, Q4, Q5, Q8, Q15 and Q21 that exists in
    ``df``, the answers listed below are replaced by the mapped label;
    missing values in Q4 and Q15 become an explicit text answer.  Columns
    that are not present are skipped.

    Note that ``df`` is modified in place; the returned object is ``df``.
    """
    # old answer text -> replacement label, per question column
    answer_aliases = {
        'Q2': {'Male': 'Man', 'Female': 'Woman'},
        'Q4': {'Professional doctorate': 'Professional degree',
               np.nan: 'I prefer not to answer'},
        'Q5': {'Program/Project Manager': 'Product/Project Manager',
               'Product Manager': 'Product/Project Manager'},
        'Q8': {'Julia': 'Other', 'Bash': 'Other', 'Swift': 'Other',
               'TypeScript': 'Other'},
        'Q15': {'< 1 years': 'Under 1 year',
                '10-15 years': '10-20 years',
                '20+ years': '20 or more years',
                np.nan: 'I do not use machine learning methods'},
        'Q21': {'> 10,000 employees': '10,000 or more employees'},
    }

    for question in answer_aliases:
        if question not in df.columns:
            continue
        df[question] = df[question].replace(answer_aliases[question])

    return df

def get_data(data_list, is_clean_data=True):
    """Load several survey files and stack the questions they share.

    ``data_list`` is a sequence of ``(name, source)`` entries; ``source`` is
    handed to ``pandas.read_csv``.  Row 0 of every file is read as the
    question text of each column.  Only questions whose text occurs in every
    file are kept, and columns are matched between files by that text, not
    by column name.  The first entry is the reference: its column names are
    used for the combined frame.

    Returns ``(columns_description, data)`` where ``columns_description``
    maps column name -> question text (with '- Selected Choice' removed)
    and ``data`` holds the answer rows of all files plus a ``Year`` column
    carrying each entry's name.  String values are stripped of surrounding
    whitespace.  When ``is_clean_data`` equals ``True`` the frame is also
    passed through ``clean_data``.
    """
    loaded = [(entry[0], pd.read_csv(entry[1], low_memory=False))
              for entry in data_list]

    # Question texts present in every file.
    question_sets = [set(frame.loc[0, :].values) for _, frame in loaded]
    shared_questions = question_sets[0]
    for later_set in question_sets[1:]:
        shared_questions = shared_questions & later_set

    columns_description = {}
    matched_columns = {label: [] for label, _ in loaded}

    base_name, base_frame = loaded[0]
    for base_column in base_frame.columns:
        question = base_frame.loc[0, base_column]
        if question not in shared_questions:
            continue
        columns_description[base_column] = question

        # Collect, per file, the column(s) asking this same question, so the
        # per-file lists follow the reference file's column order.
        for label, frame in loaded:
            matched_columns[label].extend(
                column for column in frame.columns
                if frame.loc[0, column] == question
            )

    # Answer rows start at index 1; raw values are used so that the pieces
    # line up by position rather than by their differing column names.
    pieces = [pd.DataFrame(frame.loc[1:, matched_columns[label]].values)
              for label, frame in loaded]
    piece_labels = [label for label, _ in loaded]

    data = pd.concat(pieces, keys=piece_labels)
    data.columns = matched_columns[base_name]
    data = data.reset_index(level=0)      # outer key level -> 'level_0' column
    data = data.reset_index(drop=True)
    data = data.rename(columns={'level_0': 'Year'})

    columns_description['Year'] = "Year"
    columns_description = {
        column: text.replace('- Selected Choice', '').strip()
        for column, text in columns_description.items()
    }

    data = data.apply(lambda column: column.str.strip())

    if is_clean_data == True:  # equality kept on purpose: same truth table as before
        data = clean_data(data)

    return columns_description, data


def get_columns_data(df):
    """Split the columns of ``df`` into single and grouped ones.

    A column counts as part of a group when its name contains an
    underscore; the group name is the text before the first underscore.

    Returns a tuple ``(single_columns, multiple_groups, multiple_columns)``:
    column names without an underscore, the distinct group names in order
    of first appearance, and the column names with an underscore.
    """
    multiple_columns = df.filter(like='_').columns.tolist()

    # dict keys keep first-seen order, which gives an ordered de-duplication
    multiple_groups = list(dict.fromkeys(
        name.partition('_')[0] for name in multiple_columns
    ))

    grouped = set(multiple_columns)
    single_columns = [name for name in df.columns if name not in grouped]

    return single_columns, multiple_groups, multiple_columns

In [None]:
# (label, csv path) for every survey export to combine.  The first entry is
# the reference dataset: get_data names the combined columns after it.
data_list = [
    ("2021", "/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"),
    ("2020", "/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"),
    ("2019", "/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv"),
]

# data_description: column name -> question text; data: the stacked answers.
data_description, data = get_data(data_list)
print(data.shape)

> I take a list of named datasets (one per survey year), read the question text stored in the first row of each, find the questions common to all of them (the column names may differ between years) and, based on this list of questions, build the list of matching columns for each dataset.
> 
> Then I combine the selected columns from every dataset into a single dataset.

In [None]:
# Show 10 randomly sampled rows of the combined frame as a quick sanity check.
data.sample(10)

In [None]:
# Number of rows per value of the 'Year' column.
data['Year'].value_counts()

In [None]:
# Classify the combined columns once; the three lists drive the loops below.
single_cols, multiple_groups, multiple_cols = get_columns_data(data)

print("Single columns: {}".format(len(single_cols)))
print("Multiple groups: {}".format(len(multiple_groups)))
print("Multiple columns: {}".format(len(multiple_cols)))

> The dataset contains both single columns and grouped columns, so I build three lists: the single columns, the group names, and the grouped columns.
> 
> This makes it possible to check each type of column automatically in a loop.

# 2. Automatic check of single columns

In [None]:
def make_table_by(column_name, groupby_column=None):
    """Build a frequency table for one column of the module-level ``data``.

    Missing answers are counted under the label '--- None ---'.  The table
    has one row per answer with the columns 'ALL %' (share of all rows,
    one decimal, formatted as text) and 'Count'.  When ``groupby_column``
    is given and is a column of ``data``, a ' | ' separator column and one
    integer count column per group value are appended (0 where a group
    never gave that answer).  Answer labels longer than 30 characters are
    cut and suffixed with ' [...]'; the index is named 'Answers:'.
    """
    missing_label = '--- None ---'
    max_len = 30

    def shorten(label):
        return label[:max_len] + " [...]" if len(label) > max_len else label

    answers = data[column_name].fillna(missing_label)
    answers_counted = answers.value_counts(dropna=False)
    share = answers_counted / answers_counted.sum()

    result = pd.DataFrame()
    result['ALL %'] = share.mul(100).round(1).map("{} %".format)
    result['Count'] = answers_counted

    if groupby_column and groupby_column in data.columns:
        result[' | '] = " | "
        for group_label, group_rows in data.groupby(groupby_column):
            group_counts = group_rows[column_name].fillna(missing_label).value_counts()
            group_counts.name = group_label

            # Left join keeps every overall answer; answers a group never
            # gave come back as NaN and are turned into 0.
            result = result.join(group_counts)
            result[group_label] = result[group_label].fillna(0).astype(int)

    result.index = [shorten(label) for label in result.index.tolist()]
    result.index.name = 'Answers:'

    return result
    

def _prepare_plot_data(column_name, by_column, perc_other=None, is_notna=True):
    """Return a private copy of the columns a distribution plot needs.

    Only ``by_column`` and ``column_name`` are copied from the module-level
    ``data``, so nothing done here can leak back into the shared frame.

    With ``is_notna`` set, missing values are replaced by '--- None ---'.
    With ``perc_other`` set, every answer of ``column_name`` whose share
    (in percent of the non-missing answers) is below ``perc_other`` is
    replaced by '--- Other ---'.
    """
    # dict.fromkeys de-duplicates when both names refer to the same column
    needed = list(dict.fromkeys([by_column, column_name]))
    plot_data = data[needed].copy()

    if is_notna:
        plot_data = plot_data.fillna('--- None ---')

    # replacing values less than X percent with '--- Other ---'
    if perc_other:
        shares = (plot_data[column_name].value_counts() /
                  plot_data[column_name].count()
                 ).mul(100)
        rare_answers = shares[shares < perc_other].index.tolist()
        if rare_answers:
            plot_data[column_name] = plot_data[column_name].replace(rare_answers, '--- Other ---')

    return plot_data


def make_plot_by_age(column_name, age='Q1', size=(12,6), perc_other=None, is_notna=True, is_norm=True):
    """Plot the answers of ``column_name`` as a histogram over the ``age`` column.

    Parameters:
        column_name: column of the module-level ``data`` used as ``hue``.
        age: column used for the x axis.
        size: figure size passed to ``plt.figure``.
        perc_other: answers with a share below this percentage are merged
            into '--- Other ---'; ``None`` keeps all answers.
        is_notna: when true, missing values are shown as '--- None ---'.
        is_norm: when true the bars are drawn with ``multiple='fill'``,
            otherwise with ``multiple='layer'``.

    The shared ``data`` frame is never modified.  Previously, calling this
    with ``is_notna=False`` and ``perc_other`` set overwrote the column in
    ``data`` itself.
    """
    plot_data = _prepare_plot_data(column_name, age, perc_other, is_notna)
    multiple = 'fill' if is_norm else 'layer'

    plt.figure(figsize=size)
    sns.histplot(x=age, hue=column_name, data=plot_data.sort_values(by=age),
                 multiple=multiple, shrink=.75)
    plt.title("Distribution by Age")
    plt.xlabel("")
    plt.show()


def make_plot_by_year(column_name, size=(12,6), perc_other=None, is_notna=True, is_norm=True):
    """Plot the answers of ``column_name`` as a histogram over the 'Year' column.

    Parameters:
        column_name: column of the module-level ``data`` used as ``hue``.
        size: figure size passed to ``plt.figure``.
        perc_other: answers with a share below this percentage are merged
            into '--- Other ---'; ``None`` keeps all answers.
        is_notna: when true, missing values are shown as '--- None ---'.
        is_norm: when true the bars are drawn with ``multiple='fill'``,
            otherwise with ``multiple='layer'``.

    The shared ``data`` frame is never modified.  Previously, calling this
    with ``is_notna=False`` and ``perc_other`` set overwrote the column in
    ``data`` itself.
    """
    column_year = "Year"
    plot_data = _prepare_plot_data(column_name, column_year, perc_other, is_notna)
    multiple = 'fill' if is_norm else 'layer'

    plt.figure(figsize=size)
    sns.histplot(y=column_year, hue=column_name, data=plot_data.sort_values(by=column_year),
                 multiple=multiple, shrink=.75)
    plt.title("Distribution by Year")
    plt.ylabel("")
    plt.show()

## 2.1 Distribution by Age

In [None]:
# Walk over the single-answer columns, skipping the first two entries of
# single_cols (presumably 'Year' and 'Q1' - confirm against get_columns_data).
for col_name in single_cols[2:]:
    result = make_table_by(col_name, "Year")
    print("",
          ">>> ({}) {}\n".format(col_name, data_description.get(col_name)),
          result,
          "",
          sep="\n")
    col_age = "Q1"
    if col_name == col_age:
        # plotting the age column against itself would be meaningless
        continue
    make_plot_by_age(col_name, age=col_age, size=(12,6), perc_other=5)

## 2.2 Distribution by Year

In [None]:
# Same walk as for the age plots: a per-year table for every single-answer
# column, followed by its normalised distribution by year.
for col_name in single_cols[2:]:
    result = make_table_by(col_name, "Year")
    print("",
          ">>> ({}) {}\n".format(col_name, data_description.get(col_name)),
          result,
          "",
          sep="\n")
    make_plot_by_year(col_name, is_norm=True, perc_other=2)

# 3. Automatic check of multiple columns

In [None]:
def make_multiple_table_by(group_name, is_years=False, is_sort=False):
    """Summarise one group of columns from the module-level ``data``.

    A column belongs to the group when its name equals ``group_name`` or
    starts with ``group_name + '_'``.  (A plain substring match would also
    pull in unrelated groups, e.g. 'Q1' matching 'Q10_...'.)

    Parameters:
        group_name: group prefix, e.g. 'Q7'.
        is_years: when true, append a ' | ' separator column and one column
            of non-missing counts per distinct value of ``data['Year']``.
        is_sort: when true, sort the rows by 'count', largest first.

    Returns:
        DataFrame with one row per group column.  The index ('Answers:') is
        the column's most frequent value, or '--- None ---' if the column
        is entirely missing, cut to 15 characters plus ' [...]' when
        longer.  Columns: 'code' (column name), 'count' (non-missing
        values, integer), ' % ' (share of the group's total count as
        text), then the optional per-year integer counts.

    Raises:
        ValueError: if no column of ``data`` belongs to the group.
    """
    prefix = "{}_".format(group_name)
    in_group = [col == group_name or str(col).startswith(prefix)
                for col in data.columns]
    if not any(in_group):
        raise ValueError("No columns found for group '{}'".format(group_name))
    group_data = data.loc[:, in_group]

    def most_common_answer(column):
        # value_counts ignores missing values and sorts by frequency
        answer_counts = column.value_counts()
        return answer_counts.index[0] if len(answer_counts) else '--- None ---'

    # Counts are computed directly so that 'count' is always an integer
    # column, independent of any implicit dtype conversion.
    result = pd.DataFrame({
        'code': group_data.columns.to_numpy(),
        'count': group_data.count().to_numpy(),
    })
    result.index = [most_common_answer(group_data.iloc[:, position])
                    for position in range(group_data.shape[1])]

    result[' % '] = (result['count'] / result['count'].sum()) \
                            .mul(100).round(2).map(" {} %".format)

    if is_years:
        result[' | '] = " | "
        years_column = "Year"

        for select_year in sorted(data[years_column].unique()):
            year_mask = (data[years_column] == select_year).to_numpy()
            # Assigned by position (one value per group column, same order
            # as the rows), so duplicate or missing answer labels cannot
            # misalign the counts; a year with no answers yields 0.
            result[select_year] = group_data.loc[year_mask].count().to_numpy()

    max_len = 15

    def shorten(label):
        label = str(label).strip()
        return label[:max_len] + " [...]" if len(label) > max_len else label

    result.index = [shorten(label) for label in result.index]
    result.index.name = 'Answers:'

    if is_sort:
        result = result.sort_values(by='count', ascending=False)

    return result


def make_plot_multiple(group_name, column_name='Year', size=(12,6), perc_other=None, is_norm=True):
    """Plot the answers of one column group, split by another column.

    A column of the module-level ``data`` belongs to the group when its
    name equals ``group_name`` or starts with ``group_name + '_'``.  (A
    plain substring match would also pull in unrelated groups, e.g. 'Q1'
    matching 'Q10_...'.)  All group columns are stacked into a single
    'value' column, rows with missing values are dropped, and the values
    are drawn as a histogram along ``column_name``.

    Parameters:
        group_name: group prefix, e.g. 'Q7'.
        column_name: column of ``data`` used for the y axis.
        size: figure size passed to ``plt.figure``.
        perc_other: answers with a share below this percentage are merged
            into '--- Other ---'; ``None`` keeps all answers.
        is_norm: when true the bars are drawn with ``multiple='fill'``,
            otherwise with ``multiple='layer'``.
    """
    column_by = column_name
    prefix = "{}_".format(group_name)
    in_group = [col == group_name or str(col).startswith(prefix)
                for col in data.columns]
    group_data = data.loc[:, in_group]

    # Wide -> long: one row per (respondent, selected answer).
    plot_data = group_data.join(data[column_by]).melt(id_vars=[column_by],
                                                        value_vars=group_data.columns,
                                                        ignore_index=False) \
                                                            .drop('variable', axis=1) \
                                                            .dropna()

    multiple = 'fill' if is_norm else 'layer'

    value_name = 'value'
    # replacing values less than X percent with '--- Other ---'
    if perc_other:
        shares = (plot_data[value_name].value_counts() /
                  plot_data[value_name].count()
                 ).mul(100)
        rare_answers = shares[shares < perc_other].index.tolist()
        if rare_answers:
            plot_data[value_name] = plot_data[value_name].replace(rare_answers, '--- Other ---')

    plot_data = plot_data.reset_index(drop=True)  # to fix Kaggle's trouble

    plt.figure(figsize=size)
    sns.histplot(y=column_by, hue=value_name, data=plot_data.sort_values(by=column_by),
                 multiple=multiple, shrink=.75)
    plt.title("Distribution group '{}' by column '{}'".format(group_name, column_name))
    plt.ylabel("")
    plt.show()
    
    
def show_multiple_result(column_name, size=(12,6), perc_other=None, is_norm=True):
    """Print a table and draw a plot for every column group.

    Iterates over the module-level ``multiple_groups`` and skips those
    named in ``list_of_incomplete_groups``.  For each remaining group it
    prints the per-year table from ``make_multiple_table_by`` under a
    header built from the question text of the table's first row, then
    calls ``make_plot_multiple`` with ``column_name`` and the remaining
    arguments passed through unchanged.
    """
    select_marker = '(Select all that apply)'

    for group in multiple_groups:
        if group not in list_of_incomplete_groups:
            table = make_multiple_table_by(group, is_years=True, is_sort=True)

            # First cell of the table is the column code of the top row;
            # its description supplies the question text for the header.
            top_code = table.iloc[0, 0]
            title = data_description.get(top_code).split(select_marker)[0]

            print("",
                  ">>> Group question: ({}) {}- {}\n".format(group, title, select_marker),
                  table,
                  "",
                  sep="\n")
            make_plot_multiple(group, column_name, size, perc_other, is_norm)

In [None]:
# for datasets: 2021 + 2020 + 2019
# Column groups named here are skipped by show_multiple_result.
# NOTE(review): presumably these groups lack a complete set of matching
# columns across the three survey years - confirm before changing the list.
list_of_incomplete_groups = ['Q9', 'Q10', 'Q12', 'Q19', 'Q40', 'Q42']

## 3.1 Distribution by Year

In [None]:
# Table and plot per column group, with the plot split by the 'Year' column;
# answers with a share below 6 % are merged into '--- Other ---' in the plot.
show_multiple_result('Year', perc_other=6)

## 3.2 Distribution by Age

In [None]:
# Same report with the plot split by column 'Q1' and a taller figure;
# answers with a share below 6 % are merged into '--- Other ---' in the plot.
show_multiple_result('Q1', size=(12,8), perc_other=6)

## 3.3 Distribution by Gender

In [None]:
# Same report with the plot split by column 'Q2';
# answers with a share below 6 % are merged into '--- Other ---' in the plot.
show_multiple_result('Q2', perc_other=6)