In [None]:
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans

from random import randint

# Let pandas print up to 100 characters per cell before truncating.
pd.set_option('display.max_colwidth', 100)

# Notebook-wide options; the helper functions below read these as globals.
option_rs = 1234  # Option random state
option_cm = sns.light_palette('green', as_cmap=True)  # colormap for styled tables and heatmaps
option_color = "#5fba7d"  # bar color used by Styler.bar in table_amount
option_plot_width = 12  # figure width passed to plt.figure(figsize=...)

In [None]:
def table_amount(df, col_name, is_norm, is_sort, is_all, is_empty, is_style):
    """Cross-tabulate the answers in ``col_name`` against the survey year.

    The year column is looked up through the global ``res_cols_dict['year']``.

    Parameters
    ----------
    df : DataFrame containing ``col_name`` and the year column.
    col_name : column whose values become the table rows.
    is_norm : True -> every year column is normalized and shown in percent.
    is_sort : True -> rows are ordered by the 'All' margin, descending
        (only possible when that margin column exists).
    is_all : forwarded to ``pd.crosstab(margins=...)``.
    is_empty : False -> missing answers are counted under '-Empty-'.
    is_style : True -> return a Styler with a bar drawn in the 'All' column
        (only when that column exists).

    Returns
    -------
    DataFrame or Styler; False when a required column is missing or
    ``col_name`` is the year column itself.
    """
    if col_name not in df.columns:
        return False

    year_name = res_cols_dict.get("year")
    if not year_name or year_name not in df.columns:
        return False
    if col_name == year_name:
        return False

    answers = df[col_name]
    if is_empty == False:
        answers = answers.fillna('-Empty-')

    how_normalize = 'columns' if is_norm == True else False

    table = pd.crosstab(answers, df[year_name],
                        rownames=[col_name], colnames=[year_name],
                        margins=is_all, normalize=how_normalize)

    if how_normalize:
        # Fractions -> percentages with two decimals.
        table = table.mul(100).round(2)

    if is_sort == True and 'All' in table.columns:
        table = table.sort_values('All', ascending=False)

    # The 'All' margin row is only needed for the totals column, drop it.
    if is_all == True and 'All' in table.index:
        table = table.drop(['All'], axis=0)

    # Shorten long answer labels so the table stays readable.
    trim_to = 50
    table.index = [label if len(label) <= trim_to else label[:trim_to] + " [...]"
                   for label in table.index.astype(str)]

    # Name the index after the res_cols_dict key of this column, if it has one.
    aliases = [key for key, value in res_cols_dict.items() if col_name == value]
    table.index.name = aliases[-1] if aliases else "Answer"

    if is_style == True and 'All' in table.columns:
        return table.style.bar(subset=['All'], color=option_color)

    return table


def plot_amount(df, col_name, hue_name, is_empty, is_norm, is_sort):
    """Draw a horizontal histogram of ``col_name`` split by ``hue_name``.

    Parameters
    ----------
    df : DataFrame containing both columns; it is copied, not modified.
    col_name : column plotted on the y axis.
    hue_name : column used for the color split and named in the title.
    is_empty : False -> missing values of ``col_name`` become '-Empty-'.
    is_norm : True -> bars are normalized ('fill'), otherwise stacked.
    is_sort : True -> categories are ordered by descending frequency.

    Shows the figure and returns None.
    """
    plot_data = df.copy()

    max_len_value = 30
    plot_width = option_plot_width

    # Height follows the number of distinct answers, clamped to
    # [4, 1.5 * width].
    plot_height = int(plot_data[col_name].nunique() * 0.8)
    plot_height = min(max(plot_height, 4), int(plot_width * 1.5))

    multiple = 'fill' if is_norm == True else 'stack'

    values = plot_data[col_name]
    if is_empty == False:
        values = values.fillna('-Empty-')

    # Shorten long string labels; other values pass through unchanged.
    plot_data[col_name] = values.apply(
        lambda x: x[:max_len_value] + " [...]"
        if isinstance(x, str) and len(x) > max_len_value else x)

    if is_sort == True:
        by_frequency = plot_data[col_name].value_counts(ascending=False).index.to_list()
        plot_data[col_name] = plot_data[col_name].astype("category") \
                                                 .cat.set_categories(by_frequency)
        plot_data = plot_data.sort_values([col_name])

    # Most frequent hue value first.
    hue_order = plot_data[hue_name].value_counts(ascending=False).index.to_list()

    plt.figure(figsize=(plot_width, plot_height))
    sns.histplot(data=plot_data, y=col_name, hue=hue_name,
                 hue_order=hue_order, multiple=multiple, shrink=.75)
    plt.title("Distribution by {}".format(hue_name))
    plt.ylabel("")
    plt.xlabel("")
    plt.show()
    

def plot_age(df, col_name, is_norm):
    """Plot the age distribution, colored by the answers in ``col_name``.

    The age column is taken from the global ``res_cols_dict['age']``.

    Parameters
    ----------
    df : DataFrame containing the age column and ``col_name``.
    col_name : column used for the color split.
    is_norm : True -> bars are normalized ('fill'), otherwise stacked.

    Returns False when a required column is unavailable; otherwise shows the
    figure and returns None.
    """
    age_name = "age"
    age_col = res_cols_dict.get(age_name)

    if not age_col or col_name not in df.columns or age_col not in df.columns:
        return False

    plot_data = df[[age_col, col_name]].copy()

    max_len_value = 50
    plot_width = option_plot_width

    # Height follows the number of distinct answers, clamped to
    # [4, 1.5 * width].
    plot_height = int(plot_data[col_name].nunique() * 0.8)
    plot_height = min(max(plot_height, 4), int(plot_width * 1.5))

    multiple = 'fill' if is_norm == True else 'stack'

    # Shorten long string labels; other values pass through unchanged.
    plot_data[col_name] = plot_data[col_name].apply(
        lambda x: x[:max_len_value] + " [...]"
        if isinstance(x, str) and len(x) > max_len_value else x)

    # Most frequent answer first in the legend.
    hue_order = plot_data[col_name].value_counts(ascending=False).index.to_list()

    plt.figure(figsize=(plot_width, plot_height))
    sns.histplot(data=plot_data.sort_values(age_col), x=age_col, hue=col_name,
                 hue_order=hue_order, multiple=multiple, shrink=.75)
    plt.title("Distribution by {}".format(age_name))
    plt.ylabel("")
    plt.xlabel("")
    plt.show()
    
    
def agender(df, saga_name, type_ge=2):
    """Cross-tabulate answers against age and gender (Man / Woman only).

    The age, gender and country columns are resolved through the global
    ``res_cols_dict``. Rows whose gender is not 'Man' or 'Woman' are dropped.

    Parameters
    ----------
    df : DataFrame with the respondent columns and the answer column(s).
    saga_name : what to put on the rows:
        * ``"country"``: the country column;
        * a name containing 'Clusters' or 'SA': that single column;
        * a name containing 'GA': every column containing ``saga_name + "_"``,
          melted into one column.
    type_ge : column nesting; 1 -> age then gender, 2 -> gender then age.

    Returns
    -------
    Styler with a background gradient, or False when the request cannot be
    served (unsupported name, missing column, unknown ``type_ge``).
    """
    age_name = "age"
    gender_name = "gender"
    country_name = "country"

    age_col = res_cols_dict.get(age_name)
    gender_col = res_cols_dict.get(gender_name)
    country_col = res_cols_dict.get(country_name)

    if saga_name in [age_name, gender_name, age_col, gender_col]:
        return False

    # Both respondent columns are needed below; return False instead of
    # raising KeyError, as the other helpers in this notebook do.
    if age_col not in df.columns or gender_col not in df.columns:
        return False

    mask_gender = df[gender_col].isin(['Man', 'Woman'])
    data = df.loc[mask_gender, :].copy()

    if saga_name == country_name:
        if country_col not in data.columns:
            return False

        indx_table = data[country_col]
        indx_name = [country_name]

    elif 'Clusters' in saga_name or 'SA' in saga_name:
        # Single-column answers (cluster labels or single-answer questions).
        if saga_name not in data.columns:
            return False

        indx_table = data[saga_name]
        indx_name = [saga_name]

    elif 'GA' in saga_name:
        # Group answers: one column per option, melted into a single column.
        group_cols = data.filter(like=(saga_name + "_")).columns.to_list()

        if not group_cols:
            return False

        data = pd.melt(data, id_vars=[gender_col, age_col], value_vars=group_cols)
        data = data.drop('variable', axis=1).rename(columns={'value': saga_name})

        indx_table = data[saga_name]
        indx_name = [saga_name]

    else:
        return False

    if type_ge == 1:
        cols_table = [data[age_col], data[gender_col]]
        cols_names = [age_name, gender_name]

    elif type_ge == 2:
        cols_table = [data[gender_col], data[age_col]]
        cols_names = [gender_name, age_name]

    else:
        return False

    result = pd.crosstab(index=indx_table, columns=cols_table,
                         rownames=indx_name, colnames=cols_names,
                         margins=True)

    # Shorten long answer labels so the table stays readable.
    result.index = result.index.astype(str)
    trim_to = 45
    result.index = [x[:trim_to] + " [...]"
                        if len(x) > trim_to else x
                            for x in result.index]

    result.index.name = "Answer"

    # The margins are only used to order the rows; they are removed afterwards.
    all_col = "All"
    if all_col in result.columns:
        result = result.sort_values(all_col, ascending=False)

        result = result.drop(columns=all_col, level=0)

        if all_col in result.index:
            result = result.drop(index=all_col)

    result = result.style.background_gradient(cmap=option_cm, axis=None)

    return result


def edugender(df, col_name, type_ge):
    """Cross-tabulate answers against education degree and gender.

    Only respondents whose gender is 'Man' or 'Woman' and whose education
    (column "SA4") is a Bachelor's, Master's or Doctoral degree are counted.
    The gender column is resolved through the global ``res_cols_dict``.

    Parameters
    ----------
    df : DataFrame with the education, gender and answer columns.
    col_name : answer column placed on the rows.
    type_ge : column nesting; 1 -> education then gender,
        2 -> gender then education.

    Returns
    -------
    Styler with a background gradient, or False when a column is missing,
    ``col_name`` is the education/gender column, or ``type_ge`` is unknown.
    """
    edu_col = "SA4"
    edu_name = "Education"
    gender_name = "gender"
    gender_col = res_cols_dict.get(gender_name)

    if col_name not in df.columns:
        return False
    if col_name == edu_col or col_name == gender_col:
        return False
    if edu_col not in df.columns or gender_col not in df.columns:
        return False

    degree_list = ["Bachelor’s degree", "Master’s degree", "Doctoral degree"]
    mask_all = df[gender_col].isin(['Man', 'Woman']) & df[edu_col].isin(degree_list)

    data = df.loc[mask_all, [edu_col, gender_col, col_name]].copy()

    # Fix the degree order (Bachelor -> Master -> Doctoral) for the columns.
    data[edu_col] = data[edu_col].astype("category").cat.set_categories(degree_list)
    data = data.sort_values([edu_col])

    if type_ge == 1:
        nesting = [(edu_col, edu_name), (gender_col, gender_name)]
    elif type_ge == 2:
        nesting = [(gender_col, gender_name), (edu_col, edu_name)]
    else:
        return False

    cols_table = [data[source] for source, _ in nesting]
    cols_names = [label for _, label in nesting]

    table = pd.crosstab(index=data[col_name], columns=cols_table,
                        rownames=[col_name], colnames=cols_names,
                        margins=True)

    # Shorten long answer labels so the table stays readable.
    trim_to = 45
    table.index = [label if len(label) <= trim_to else label[:trim_to] + " [...]"
                   for label in table.index.astype(str)]
    table.index.name = "Answer"

    # The margins are only used to order the rows; they are removed afterwards.
    all_col = "All"
    if all_col in table.columns:
        table = table.sort_values(all_col, ascending=False)
        table = table.drop(columns=all_col, level=0)

        if all_col in table.index:
            table = table.drop(index=all_col)

    return table.style.background_gradient(cmap=option_cm, axis=None)


def transform_cols(df, method, is_drop_first=False):
    """Encode the object-dtype columns of ``df`` as numbers.

    Parameters
    ----------
    df : DataFrame or Series; only columns with dtype ``object`` are used.
        The input is never modified.
    method : one of
        * ``'onehot'``: ``pd.get_dummies`` with the column name as prefix;
        * ``'bin'``: 1 where a value is present, 0 where it is missing;
        * ``'cols'``: each value replaced by its relative frequency within
          its column, missing values become 0;
        * ``'rows'``: presence flags divided by the number of present values
          in the row (rows with no values become all 0).
    is_drop_first : only for ``'onehot'``; drop the first dummy of each
        column when True.

    Returns
    -------
    DataFrame with the encoded columns; ``None`` when ``df`` has no object
    columns; the list of valid method names when ``method`` is not valid.
    """
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)

    cols_list = [col for col in df.columns
                 if df[col].dtype == 'object']

    if not cols_list:
        return None

    methods = ['onehot', 'bin', 'cols', 'rows']

    if not method or method not in methods:
        return methods

    data = df[cols_list].copy()

    if method == 'onehot':
        return pd.get_dummies(data, prefix=cols_list,
                              drop_first=(is_drop_first == True))

    if method == 'bin':
        return data.notna().astype(int)

    if method == 'cols':
        for col in cols_list:
            frequencies = data[col].value_counts(normalize=True).to_dict()
            # map() is an exact lookup and always yields numbers here;
            # replace() with a dict relies on implicit downcasting and may
            # leave the column as object dtype.
            data[col] = data[col].map(frequencies).fillna(0).astype(float)
        return data

    # method == 'rows'
    present = data.notna().astype(int)
    # 0 / 0 for rows without any value gives NaN, which is reported as 0.
    return present.div(present.sum(axis=1), axis=0).fillna(0)

## What is this notebook about?

> It tells a data story about a subset of the data science community, as represented in the 2018-2021 industry surveys.

# 1. About Data

> In 2021, the survey was conducted from 09.01.2021 to 04.10.2021 and, after cleaning the data, it contains 25973 responses. In the other years (2018-2020) the number of responses is different.
> 
> To compare the data correctly over the period of interest, we need to understand what the surveys have in common and where they differ.

In [None]:
def get_columns(df, select_by=None):
    """Split the column names of ``df`` into single and multiple-choice ones.

    Parameters
    ----------
    df : DataFrame whose column names are inspected (data is not touched).
    select_by : naming scheme of the columns:
        * falsy (default): columns containing '_' are multiple-choice and
          are grouped by the part before '_Part' / '_OTHER'; the remaining
          columns containing "Q" are single-answer columns;
        * ``"SAGA"``: columns containing "SA" are single-answer, columns
          containing "GA" are multiple-choice, grouped by the part before
          the first '_'.

    Returns
    -------
    (single_cols, multiple_cols, multiple_groups) : three lists of names;
    ``multiple_groups`` keeps the order of first appearance.

    Raises
    ------
    ValueError
        If ``select_by`` is neither falsy nor ``"SAGA"``.
    """
    if not select_by:
        multiple_cols = df.filter(like='_').columns.tolist()

        single_cols = [col for col in df.columns
                          if col not in multiple_cols
                           and "Q" in col]  # skip Duration column

        def group_of(col):
            # A column without '_Part' / '_OTHER' forms its own group
            # instead of raising IndexError.
            return re.split('_Part|_OTHER', col, maxsplit=1)[0]

    elif select_by == "SAGA":
        single_cols = df.filter(like="SA").columns.to_list()
        multiple_cols = df.filter(like="GA").columns.to_list()

        def group_of(col):
            return col.split("_")[0]

    else:
        raise ValueError(
            "select_by must be None or 'SAGA', got {!r}".format(select_by))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    multiple_groups = list(dict.fromkeys(group_of(col) for col in multiple_cols))

    return single_cols, multiple_cols, multiple_groups

In [None]:
# Dataset 2018-2021
path_to_data = "../input/dataset-kaggle-survey-2018-2021/kaggle_survey_2018-2021_data.csv"
data = pd.read_csv(path_to_data, low_memory=False)

# Header file of the same dataset: its first row is kept as a
# {column name: text} mapping and later shown as the "Questions" column
# (presumably the question wording; verify against the CSV).
path_to_data_header = "../input/dataset-kaggle-survey-2018-2021/kaggle_survey_2018-2021_header.csv"
data_header = pd.read_csv(path_to_data_header)
data_description = data_header.loc[0].to_dict()

# Column groups of the combined dataset ("SA*" / "GA*" naming).
single_cols, multiple_cols, multiple_groups = get_columns(data, select_by='SAGA')

# Dataset 2021
path_to_data_21 = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
data_21 = pd.read_csv(path_to_data_21, low_memory=False)

# Column groups of the 2021 file ("Q*" / "_Part" / "_OTHER" naming).
single_cols_21, multiple_cols_21, multiple_groups_21 = get_columns(data_21)

## 1.1 Introduction

> This notebook uses previously downloaded and cleaned data, because in the original surveys the answers are stored in different columns and have different structures from year to year.
> 
> Let's see what we managed to collect together, how many single and group answers.

In [None]:
def df_diff():
    """Compare the column structure of the 2021 file and the combined dataset.

    Reads the global column lists produced by ``get_columns`` and reports,
    per dataset, the number of single columns, multiple-choice groups and
    multiple-choice columns. A dataset is included only when all three of its
    lists are non-empty.

    Returns
    -------
    DataFrame indexed by 'single columns', 'multiple groups' and
    'multiple columns', with a column 'data_21' and/or 'data'. When neither
    dataset qualifies, the frame has the index but no columns.
    """
    labels = ['single columns', 'multiple groups', 'multiple columns']
    counts = {}

    if single_cols_21 and multiple_cols_21 and multiple_groups_21:
        counts['data_21'] = [len(single_cols_21), len(multiple_groups_21),
                             len(multiple_cols_21)]

    if single_cols and multiple_cols and multiple_groups:
        counts['data'] = [len(single_cols), len(multiple_groups),
                          len(multiple_cols)]

    # Passing the index to the constructor keeps the labels valid even when
    # no column was added; assigning it afterwards fails on an empty frame.
    return pd.DataFrame(counts, index=labels)


def df_fullness(df, cols_list, is_stats=True):
    """Report how completely each column in ``cols_list`` is filled.

    Parameters
    ----------
    df : DataFrame with a 'Year' column and the columns in ``cols_list``.
    cols_list : columns to evaluate.
    is_stats : True -> also print the row count and the rows per year.

    Returns
    -------
    DataFrame indexed by column name with 'Fullness' (share of non-missing
    values as a percent string) and, when the global ``data_description`` is
    not empty, 'Questions' (its entry for that column).
    """
    if is_stats == True:
        print("\nRows: {}".format(df.shape[0]))
        print("\n{}\n".format(pd.DataFrame(df['Year'].value_counts(sort=False))))

    filled_share = df[cols_list].notna().mean().mul(100).round(2)
    fullness = filled_share.apply("{}%".format).to_frame('Fullness')

    if data_description:
        # Aligned on the index, i.e. on the column names.
        fullness['Questions'] = pd.Series(data_description)

    return fullness

In [None]:
df_diff()

> As a result, we have 20 single columns (one column for different answers) and 23 group columns (where there is a separate column for each answer).
> 
> In order to determine how to work with data, we will choose only single answers for evaluation. Let's look at these columns, what their names are, how complete they are, and what questions they have.

In [None]:
df_fullness(data, single_cols)

## 1.2 Check notna

> It can be seen that the fullness of the answer columns differs, because some questions were absent from the survey in some years.
> 
> If we look at the general statistics, we can see that in 2018-2020 the minimum number of cells (answers to questions) was 3 - they include age, gender and country, in 2021 the minimum number of cells (answers) is 6.

In [None]:
def get_notna(df, cols_list, less_than=None):
    """Count the non-missing values per row over ``cols_list``.

    Parameters
    ----------
    df : source DataFrame.
    cols_list : columns to count over.
    less_than : optional threshold; when truthy, a boolean mask
        ``count < less_than`` is returned instead of the counts.

    Returns
    -------
    Series aligned with ``df``: the counts, or the boolean mask.
    """
    answered = df[cols_list].notna().sum(axis=1)
    return (answered < less_than) if less_than else answered


def df_notna_stats(df, cols_list, per_list=None, is_style=True):
    """Describe, per year, how many of ``cols_list`` each respondent filled.

    Parameters
    ----------
    df : DataFrame with a 'Year' column and the columns in ``cols_list``.
    cols_list : columns whose non-missing values are counted per row.
    per_list : percentiles for ``describe``; defaults to
        [.1, .25, .5, .75, .9].
    is_style : True -> return a Styler with 'min' and 'max' highlighted.

    Returns
    -------
    DataFrame (or Styler) indexed by year with min, the percentiles and max,
    cast to int.
    """
    percentiles = per_list if per_list else [.1, .25, .5, .75, .9]

    answered = df[cols_list].notna().sum(axis=1)
    stats = answered.groupby(df['Year']).describe(percentiles=percentiles)
    stats = stats.drop(columns=['count', 'mean', 'std']).astype(int)

    if is_style == True:
        return stats.style.background_gradient(subset=['min', 'max'], cmap=option_cm)

    return stats


def df_notna_table(df, cols_list, is_norm=False, is_sort=False, is_all=False):
    """Cross-tabulate the year against the number of answered columns.

    Parameters
    ----------
    df : DataFrame with a 'Year' column and the columns in ``cols_list``.
    cols_list : columns whose non-missing values are counted per row.
    is_norm : True -> each year row is normalized and shown in percent.
    is_sort : True -> rows are ordered by the 'All' margin, descending
        (only possible when that margin column exists).
    is_all : forwarded to ``pd.crosstab(margins=...)``.

    Returns
    -------
    DataFrame with one row per year and one column per answer count
    (columns are named 'Number notna').
    """
    # Use the frame that was passed in, not the notebook-level ``data``.
    num_notna = df[cols_list].notna().sum(axis=1)
    year_data = df['Year']

    how_normalize = 'index' if is_norm == True else False

    result = pd.crosstab(year_data, num_notna, colnames=['Number notna'],
                         margins=is_all, normalize=how_normalize)

    if how_normalize:
        # Fractions -> percentages with two decimals.
        result = result.mul(100).round(2)

    if is_sort == True and 'All' in result.columns:
        result = result.sort_values('All', ascending=False)

    return result

In [None]:
df_notna_stats(data, single_cols)

> The filling of the cells (survey) by year also differs, in 2018-2019 most of the answers were for 11-12 questions from the respondent, in 2021 - 17 answers.

In [None]:
check_notna = df_notna_table(data, single_cols)
check_notna

In [None]:
plt.figure(figsize=(12, 4))
sns.heatmap(check_notna, cmap=option_cm)
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
sns.heatmap(check_notna, cmap=option_cm,
            vmin=0, vmax=1, linewidths=.5)
plt.show()

> Examples of those responses where there were only 3, only 4, or only 6 answers.

In [None]:
number_notna = get_notna(data, single_cols)

In [None]:
data.loc[number_notna == 3,
         single_cols].sample(3, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

In [None]:
data.loc[number_notna == 4,
         single_cols].sample(3, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

In [None]:
data.loc[number_notna == 6,
         single_cols].sample(3, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

In [None]:
# df_notna_table(data, single_cols, is_all=True)

> Based on this, it is possible to single out a segment of respondents, let's call them mute, and, if necessary, exclude them from further analysis.
> 
> The **threshold is less than 7 answers** from the respondent.

In [None]:
# Boolean mask of "mute" respondents: fewer than 7 answered single columns.
is_mute = get_notna(data, single_cols, less_than=7)

# Show five reproducible sample rows of that segment, missing cells in grey.
data.loc[is_mute,
         ["Year"] + single_cols].sample(5, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

> Let's look at these respondents by year. It can be seen that this segment did not affect the data of the 2021 survey.

In [None]:
data.loc[is_mute, "Year"].value_counts(sort=False)

## 1.3 Check duration

> The duration of filling out the questionnaire is different for each respondent, let's see the statistical data by year.
> 
> It can be seen that in 2018-2020 the minimum time was about 30 seconds. Considering that there are profiles with only 3 responses, this is not surprising.

In [None]:
def df_duration_stats(df, per_list=None, is_style=True):
    """Describe the 'Duration' column per survey year.

    Parameters
    ----------
    df : DataFrame with 'Year' and 'Duration' columns.
    per_list : percentiles for ``describe``; defaults to
        [.1, .25, .5, .75, .9].
    is_style : True -> return a Styler with the 'min' column highlighted.

    Returns
    -------
    DataFrame (or Styler) indexed by year with min, the percentiles and max.
    """
    percentiles = per_list if per_list else [.1, .25, .5, .75, .9]

    stats = df.groupby('Year')['Duration'].describe(percentiles=percentiles)
    stats = stats.drop(columns=['count', 'mean', 'std'])

    if is_style == True:
        return stats.style.background_gradient(subset=['min'], cmap=option_cm)

    return stats

In [None]:
df_duration_stats(data)

> Let's analyze the data on the graph, cut off the long tail after 40 minutes per questionnaire.
>
> It can be seen that every year the number of anomalies decreases - the graph is leveled.

In [None]:
# Keep only questionnaires that took less than 40 to cut off the long tail.
check_duration = data['Duration'] < 40  # minutes
plot_data = data.loc[check_duration]

# Histogram of the duration, colored by year, with bins of width 0.5.
plt.figure(figsize=(12, 6))
sns.histplot(data=plot_data, x="Duration", hue="Year", binwidth=.5)
plt.ylabel("")
plt.xlabel("")
plt.show()

In [None]:
g = sns.FacetGrid(plot_data, col="Year", col_wrap=2,
                  height=3.5, aspect=1.5)
g.map(sns.histplot, "Duration", binwidth=.5)
g.set_axis_labels("", "");

> Let's look at this data without the mute respondents segment. It can be seen that the minimum response time has increased.

In [None]:
df_duration_stats(data[~is_mute], is_style=False)

> To cut off those who rushed through the survey, we will single out a "fast" segment: respondents who spent less than 2 minutes on the questionnaire.
> 
> Examples of their answers, as well as data by year.

In [None]:
is_fast = (data['Duration'] < 2)  # less than X minutes

data.loc[is_fast,
         single_cols].sample(3, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

In [None]:
data.loc[is_fast, "Year"].value_counts(sort=False)

> After all this, we can create a segment of respondents, those who answered normally - by the number of responses and the speed of filling out the survey.
>
> For this we use the already created segments.

In [None]:
# "Normal" respondents are neither fast nor mute.
is_normal = (~is_fast & ~is_mute)

# Check not normal
# (seven reproducible sample rows of the excluded segment, missing cells in grey)
data.loc[~is_normal,
         ['Duration', 'Year'] + single_cols].sample(7, random_state=option_rs) \
        .style.highlight_null(null_color='grey')

## 1.4 Results

> Let's check what we got. The data that we have excluded and the data that we have left for analysis.

In [None]:
def df_segments():
    """Count respondents per year for every segment defined so far.

    Reads the globals ``data``, ``is_mute``, ``is_fast`` and ``is_normal``.

    Returns
    -------
    Styler over a table indexed by year with the columns 'Origin' (all
    rows), 'Mute', 'Fast', 'Not normal' and 'Normal'; the 'Not normal'
    column gets a text gradient.
    """
    years = data["Year"]

    # (column title, row mask); None means "all rows".
    segments = [('Origin', None),
                ('Mute', is_mute),
                ('Fast', is_fast),
                ('Not normal', ~is_normal),
                ('Normal', is_normal)]

    titles = [title for title, _ in segments]
    counts = [(years if mask is None else years.loc[mask]).value_counts(sort=False)
              for _, mask in segments]

    # Years absent from a segment show up as NaN after the concat -> 0.
    result = pd.concat(counts, keys=titles, axis=1).fillna(0).astype(int)

    if 'Not normal' in result.columns:
        result = result.style.text_gradient(subset=['Not normal'],
                                            cmap="brg", low=0.5, high=1.0)

    return result

In [None]:
df_segments()

# 2. About Respondents

> After we have decided on the characteristics of the dataset, let's look at those who answered the questions. We are interested in information about the respondent - age, gender and country.
> 
> For the convenience of working with this data, we will create a separate dataframe and add meaningful names for the required columns.

In [None]:
# Readable key -> dataset column name for the respondent attributes.
# The helper functions look columns up through this mapping, and
# res_amount() skips the first key when listing the available ones.
res_cols_dict = {'year': 'Year',
                 'age': 'SA1',
                 'gender': 'SA2',
                 'country': 'SA3'}

## 2.1 Introduction

> We will work with the res_data dataframe, which contains a segment of normal answers.

In [None]:
# Respondent columns (year, age, gender, country) of the "normal" segment.
# .copy() gives an independent frame, so later edits do not touch `data`.
res_cols = list(res_cols_dict.values())
res_data = data.loc[is_normal, res_cols].copy()

print(res_data.shape)

## 2.2 Check and modify

In [None]:
def res_amount(type_info=None, is_norm=False, is_sort=True, is_all=True, is_empty=True, is_style=True):
    """Tabulate one respondent attribute of ``res_data`` by year.

    Parameters
    ----------
    type_info : key of ``res_cols_dict``; when omitted, the available keys
        are returned (all keys except the first one).
    is_norm, is_sort, is_all, is_empty, is_style : forwarded unchanged to
        ``table_amount``.

    Returns
    -------
    The result of ``table_amount``; a list of keys when ``type_info`` is
    falsy; False when ``type_info`` is not a known key.
    """
    if not type_info:
        return list(res_cols_dict)[1:]

    col_name = res_cols_dict.get(type_info)
    if not col_name:
        return False

    return table_amount(res_data, col_name,
                        is_norm, is_sort, is_all, is_empty, is_style)


def res_trim_to(n_labels, type_info):
    """Reduce a respondent column of ``res_data`` to ``n_labels`` values.

    The ``n_labels - 1`` most frequent values (not counting an existing
    "Other") are kept; every remaining value is replaced by "Other".
    ``res_data`` is modified.

    Parameters
    ----------
    n_labels : target number of distinct values, including "Other".
    type_info : key of ``res_cols_dict`` (anything but 'year').

    Returns
    -------
    True when the column was changed, None when it already has at most
    ``n_labels`` distinct values, False when the key/column is not usable.
    """
    df = res_data

    if type_info == 'year':
        return False

    col_name = res_cols_dict.get(type_info)

    if not col_name or col_name not in df.columns:
        return False

    if n_labels >= df[col_name].nunique():
        return None

    other_name = "Other"

    # Frequencies in descending order, without the bucket we are filling.
    counts = df[col_name].value_counts().drop(other_name, errors='ignore')
    other_labels = counts.index[n_labels - 1:].to_list()

    # Assign the result back: replace(..., inplace=True) on a selected
    # column is a chained update and does not reach the frame when pandas
    # Copy-on-Write is active.
    df[col_name] = df[col_name].replace(other_labels, other_name)

    return True


def res_trim_after(x_age, type_info):
    """Merge every age bucket starting at ``x_age`` or above into one.

    The first two characters of each label are read as the bucket's lower
    bound (assumes labels start with a two-digit age, e.g. "25-29");
    buckets at or above ``x_age`` are replaced by ``"<x_age>+"``.
    ``res_data`` is modified.

    Parameters
    ----------
    x_age : lower bound of the merged bucket.
    type_info : key of ``res_cols_dict`` (anything but 'year').

    Returns
    -------
    True when the column was changed, None when the merged label already
    exists, False when the key/column is not usable.
    """
    df = res_data

    if type_info == 'year':
        return False

    col_name = res_cols_dict.get(type_info)

    if not col_name or col_name not in df.columns:
        return False

    replace_x_age = str(x_age) + "+"

    if replace_x_age in df[col_name].unique():
        return None

    # Missing or non-numeric labels become NaN, which compares as False,
    # so such rows are left untouched instead of raising in astype(int).
    lower_bound = pd.to_numeric(df[col_name].str.slice(stop=2), errors='coerce')
    mask_age = lower_bound.ge(x_age)

    # Assign the result back: mask(..., inplace=True) on a selected column
    # is a chained update and does not reach the frame when pandas
    # Copy-on-Write is active.
    df[col_name] = df[col_name].mask(mask_age, replace_x_age)

    return True


def res_agender(type_ge=2):
    """Country x age x gender table for ``res_data``.

    ``type_ge`` is forwarded to ``agender`` and selects the column nesting.
    Returns whatever ``agender`` returns.
    """
    return agender(res_data, "country", type_ge)

## Gender

> The answers include 5 options, of which two are basic - man and woman, which did not change much during the analyzed period.
> 
> For the convenience of visualizing and analyzing data, we will collect the rest of the answers in one group. As a result, we will have three options with which we will work.

In [None]:
res_amount('gender')

In [None]:
res_trim_to(3, 'gender')

In [None]:
res_amount('gender')

## Country

> There is a large number of answer options, among which there are two leaders - India and the USA - and there is also a group called Other.
> 
> For convenience, we will reduce the number of answer options by moving some of them into Other.

In [None]:
res_amount('country')

In [None]:
res_trim_to(10, 'country')

In [None]:
res_amount('country')

## Age

> Age is divided into small subgroups. It can be seen from the data that the number of respondents increases from 18 to 25, after 30 it begins to decline.
> 
> For the convenience of further work, we will reduce the number of age groups by merging all subgroups above a certain age into a single last group.

In [None]:
res_amount('age', is_sort=False)

In [None]:
res_trim_after(60, 'age')

In [None]:
res_amount('age', is_sort=False)

## Country & Age & Gender (Man / Woman)

In [None]:
res_agender()

## 2.3 Visualization

In [None]:
def res_plot_amount(type_info, hue_info, is_empty=True, is_norm=True, is_sort=False):
    """Plot one respondent attribute of ``res_data`` split by another.

    Parameters
    ----------
    type_info : ``res_cols_dict`` key plotted on the y axis.
    hue_info : ``res_cols_dict`` key used for the color split; must differ
        from ``type_info``.
    is_empty, is_norm, is_sort : forwarded unchanged to ``plot_amount``.

    Returns False when the keys are equal or unknown; otherwise the result
    of ``plot_amount``.
    """
    if type_info == hue_info:
        return False

    col_name = res_cols_dict.get(type_info)
    hue_col = res_cols_dict.get(hue_info)
    if not col_name or not hue_col:
        return False

    # Work on a copy labeled with the readable keys instead of column codes.
    plot_data = res_data[[col_name, hue_col]].copy()
    plot_data = plot_data.rename(columns={col_name: type_info, hue_col: hue_info})

    if type_info in ['age', 'year']:
        # These attributes have a natural order; keep it on the axis.
        plot_data = plot_data.sort_values(by=type_info)

    if type_info == 'year':
        plot_data[type_info] = plot_data[type_info].astype(str)

    return plot_amount(plot_data, type_info, hue_info, is_empty, is_norm, is_sort)

In [None]:
res_plot_amount('age', 'gender', is_norm=False)

> Answer options in some cases are distributed unevenly, it is better to visualize them with normalization.
> 
> If we look at the data for age and gender, we can clearly see that the older the age group, the higher the share of men.

In [None]:
res_plot_amount('age', 'gender')

> By age and country, it can be seen that the youngest respondents are in India, and the oldest are in the USA.
> 
> It can also be seen that two countries - China and Japan - vary greatly in audience composition with age.

In [None]:
res_plot_amount('age', 'country')

> We can see the difference between these countries and by gender. USA has the most diverse audience.

In [None]:
res_plot_amount('country', 'gender', is_sort=True)

> If we analyze the data by country and year, we can see that Nigeria has the largest growth in 2021. The largest decline (from 2018 to 2021) is in China.

In [None]:
res_plot_amount('country', 'year', is_sort=True)

# 3. About Answers (single)

> After we have decided on the respondents, let's see how they answered the questions. In this section, we analyze single-answer questions (one column per question). We are interested in a quantitative and qualitative analysis of these answers by age, gender, country and year.
> 
> For the convenience of working with this data, we will create a separate dataframe and add the previously processed data about the respondents.

In [None]:
def clean_money_cols(df, cols_list):
    """Strip the currency markers '$' and '(USD)' from text columns, in place.

    Parameters
    ----------
    df : DataFrame to clean; matching columns are overwritten.
    cols_list : candidate column names. Names missing from ``df`` and
        columns that do not hold text are skipped.

    Returns
    -------
    List of the columns that were changed (in ``cols_list`` order), or None
    when nothing had to be cleaned.
    """
    # Raw strings: "\$" and "\(" are invalid escapes in a normal literal.
    money_patterns = [r"\$", r"\(USD\)"]

    cleaned = []
    for col in cols_list:
        if col not in df.columns:
            continue

        # The .str accessor only exists for text-like columns.
        if not (pd.api.types.is_object_dtype(df[col])
                or pd.api.types.is_string_dtype(df[col])):
            continue

        for pattern in money_patterns:
            # na=False keeps missing cells from counting as a match.
            if df[col].str.contains(pattern, na=False).any():
                df[col] = df[col].str.replace(pattern, "", regex=True).str.strip()

                if col not in cleaned:
                    cleaned.append(col)

    if cleaned:
        return cleaned

    return None


def sa_info(df, cols_list, is_style=True):
    """Summarize missing and distinct values of the columns in ``cols_list``.

    Prints the total number of empty cells, then builds one row per column.

    Parameters
    ----------
    df : source DataFrame.
    cols_list : columns to summarize.
    is_style : True -> return a Styler with the 'Unique' column shaded.

    Returns
    -------
    DataFrame (or Styler) with 'Column', 'Empty' (share of missing values as
    a percent string), 'Unique' (number of distinct values) and, when the
    global ``data_description`` is not empty, 'Questions'.
    """
    answers = df.loc[:, cols_list]
    is_missing = answers.isna()

    print("\n{} empty cells\n".format(is_missing.sum().sum()))

    result = pd.DataFrame({
        'Empty': is_missing.mean().mul(100).round(2).apply("{}%".format),
        'Unique': answers.nunique(),
    })

    if data_description:
        # Aligned on the index, i.e. on the column names.
        result['Questions'] = pd.Series(data_description)

    # Turn the column names from the index into a regular 'Column' column.
    result = result.reset_index().rename(columns={'index': 'Column'})

    if is_style == True and 'Unique' in result.columns:
        result = result.style.background_gradient(subset=['Unique'],
                                                  cmap=option_cm)

    return result

In [None]:
def answers_corr(df, upd='cols', method=None, plot_width=12):
    """Draw a correlation heatmap of the encoded answer columns.

    Parameters
    ----------
    df : DataFrame (or Series) with the answer columns.
    upd : encoding passed to ``transform_cols``.
    method : correlation method, one of 'pearson' (default), 'kendall',
        'spearman'.
    plot_width : figure width; also the upper limit of the figure height.

    Returns
    -------
    None after drawing the heatmap. Without drawing anything:
    the list of valid correlation methods when ``method`` is invalid, the
    list of valid encodings when ``upd`` is invalid, and None when there is
    nothing to encode or correlate.
    """
    if not method:
        method = 'pearson'

    methods = ['pearson', 'kendall', 'spearman']

    if method not in methods:
        return methods

    data = transform_cols(df, upd)

    # transform_cols returns None when there are no object columns and a
    # list of valid encodings when `upd` is unknown; neither can be plotted.
    if data is None or isinstance(data, list):
        return data

    corr_data = data.corr(method) \
                    .dropna(axis=0, how='all').dropna(axis=1, how='all')

    if corr_data.empty:
        return None

    # Height follows the number of columns, clamped to [6, plot_width].
    min_height = 6
    max_height = plot_width
    plot_height = int(len(corr_data.columns) * 0.8)
    plot_height = min(max(plot_height, min_height), max_height)

    # Two stacked axes: the heatmap and a thin horizontal color bar below it.
    grid_kws = {"height_ratios": (.9, .05), "hspace": .2}

    _, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws,
                                    figsize=(plot_width, plot_height))

    sns.heatmap(corr_data, annot=True, linewidths=.5, cmap="YlGnBu",
                ax=ax, cbar_ax=cbar_ax,
                cbar_kws={"orientation": "horizontal"})

In [None]:
# Single-answer columns that are not already respondent attributes.
sa_cols = [col for col in single_cols
               if col not in res_data.columns]

# Processed respondent attributes plus the raw answers of the same rows.
sa_data = res_data.join(data.loc[res_data.index, sa_cols])

In [None]:
clean_money_cols(sa_data, sa_cols)  # deleted text: '$' or '(USD)'

> Let's check the presence of empty values with answers, as well as unique values - how many answer options there are for each question.
> 
> It can be seen that the longer the respondent answered the survey, the more gaps there were in the answers.

In [None]:
sa_info(sa_data, sa_cols)

In [None]:
# answers_corr(sa_data[sa_cols])

## Check and Visualisation

> Let's start analyzing the answers, for this we will create the required number of variables for each available question.
> 
> Columns 1-3 in the available dataset refer to respondents.
> 
> We will merge the answer options that have only a small number of responses into "Other".

In [None]:
def sa_amount(col_name, is_norm=False, is_sort=True, is_all=True, is_empty=True, is_style=True):
    """Tabulate one answer column of ``sa_data`` by year.

    Prints the column name with its entry in ``data_description`` and then
    delegates to ``table_amount`` with the remaining arguments unchanged.
    Returns False when ``col_name`` is not a column of ``sa_data``.
    """
    if col_name not in sa_data.columns:
        return False

    question = data_description.get(col_name)
    print("\n>>> ({}) {}\n".format(col_name, question))

    return table_amount(sa_data, col_name,
                        is_norm, is_sort, is_all, is_empty, is_style)


def sa_trim_to(n_labels, col_name):
    """Reduce an answer column of ``sa_data`` to ``n_labels`` values.

    The ``n_labels - 1`` most frequent values (not counting an existing
    "Other") are kept; every remaining value is replaced by "Other".
    ``sa_data`` is modified.

    Parameters
    ----------
    n_labels : target number of distinct values, including "Other".
    col_name : column of ``sa_data`` to trim.

    Returns
    -------
    True when the column was changed, None when it already has at most
    ``n_labels`` distinct values, False when the column does not exist.
    """
    df = sa_data

    if col_name not in df.columns:
        return False

    if n_labels >= df[col_name].nunique():
        return None

    other_name = "Other"

    # Frequencies in descending order, without the bucket we are filling.
    counts = df[col_name].value_counts().drop(other_name, errors='ignore')
    other_labels = counts.index[n_labels - 1:].to_list()

    # Assign the result back: replace(..., inplace=True) on a selected
    # column is a chained update and does not reach the frame when pandas
    # Copy-on-Write is active.
    df[col_name] = df[col_name].replace(other_labels, other_name)

    return True


def sa_plot_amount(col_name, type_info, is_empty=True, is_norm=True, is_sort=True):
    """Plot the answers of a single-answer column split by a respondent attribute.

    ``type_info`` is a key of ``res_cols_dict`` (the notebook calls this with
    'gender', 'country' and 'year'); the mapped column of ``sa_data`` is
    renamed to that key before the data is handed to ``plot_amount``.

    Returns:
        ``False`` if the column is missing from ``sa_data`` or ``type_info``
        has no mapping, otherwise the result of ``plot_amount``.
    """
    hue_col = res_cols_dict.get(type_info)

    if not hue_col or col_name not in sa_data.columns:
        return False

    subset = sa_data[[col_name, hue_col]].copy()
    subset = subset.rename(columns={hue_col: type_info})

    return plot_amount(subset, col_name, type_info, is_empty, is_norm, is_sort)


def sa_plot_age(col_name, is_norm=True):
    """Call ``plot_age`` for one column of the module-level ``sa_data`` frame.

    Returns:
        ``False`` if ``col_name`` is not a column of ``sa_data``, otherwise
        the result of ``plot_age``.
    """
    if col_name in sa_data.columns:
        return plot_age(sa_data, col_name, is_norm)

    return False

    
def sa_agender(saga_name, type_ge=2):
    """Call ``agender`` on the module-level ``sa_data`` frame.

    Unlike the sibling ``sa_*`` helpers, the column name is not validated
    here; ``saga_name`` and ``type_ge`` are passed through unchanged.
    """
    return agender(sa_data, saga_name, type_ge)


def sa_edugender(col_name, type_ge=2):
    """Call ``edugender`` for one column of the module-level ``sa_data`` frame.

    Returns:
        ``False`` if ``col_name`` is not a column of ``sa_data``, otherwise
        the result of ``edugender``.
    """
    if col_name in sa_data.columns:
        return edugender(sa_data, col_name, type_ge)

    return False

In [None]:
# Create one lower-case variable per single-answer column id
# (the names printed below, `sa4` and `sa20`, are two of them).
globals().update({col_id.lower(): col_id for col_id in sa_cols})

print(sa4, "...", sa20)

### SA 4: ... formal education ...

> This question has 6 different answer types, 3 of which are the most popular.
> 
> We will cut this list down to 4 by including the remaining answers in "Other".
> 
> The graph shows that the higher the education, the greater the percentage of women.
> 
> If we look at the countries, India has a decreasing percentage of those with higher education, while the USA, Germany and Britain have an increasing percentage.

In [None]:
sa_amount(sa4)

In [None]:
sa_trim_to(4, sa4)

In [None]:
sa_amount(sa4)

In [None]:
sa_agender(sa4)

In [None]:
sa_plot_amount(sa4, 'gender', is_sort=False)

In [None]:
sa_plot_amount(sa4, 'country', is_sort=False)

In [None]:
sa_plot_age(sa4)

### SA 5: ... current role ...

> This question can also be shortened by the number of answers by moving the part to "Other".
> 
> The most popular answer is "Student", followed by "Data Scientist / ML Engineer".
> 
> If we look at the data by year, we can see that the largest increase in 2021, compared to previous years, was in the answer - "Not employed".

In [None]:
sa_amount(sa5)

In [None]:
sa_trim_to(8, sa5)

In [None]:
sa_agender(sa5)

In [None]:
sa_edugender(sa5)

In [None]:
sa_plot_amount(sa5, 'year')

In [None]:
sa_plot_age(sa5)

### SA 6: ... writing code ...

> The most popular answer to this question is "1-3 years"; its share practically does not change between the ages of 18 and 29, for either men or women, and decreases after that.
> 
> For the answer option "3-5 years" the number increases to the age group 25-29 years - the same as the previous answer - and then also decreases.
> 
> If you look at the graph, you can see that the percentage of women is the highest for the answer "I never wrote code" and gradually decreases with increasing to the minimum value for the answer "20+ years".

In [None]:
sa_amount(sa6)

In [None]:
sa_agender(sa6)

In [None]:
sa_edugender(sa6)

In [None]:
sa_plot_amount(sa6, 'gender')

In [None]:
sa_plot_age(sa6)

### SA 7: ... programming language ...

> The absolute leader in the answer to this question is Python. The number of empty responses has been declining over the past two years.
> 
> The list of answers can be shortened for clarity of further analysis.
> 
> The popularity of the "Python" answer grows from ages 18 to 39, then gradually declines. If we look at the countries, we can see that the distribution differs for "C/C++", where India has the largest share.

In [None]:
sa_amount(sa7, is_empty=False)

In [None]:
sa_amount(sa7)

In [None]:
sa_trim_to(5, sa7)

In [None]:
sa_agender(sa7)

In [None]:
sa_edugender(sa7)

In [None]:
sa_plot_amount(sa7, 'country')

In [None]:
sa_plot_age(sa7)

### SA 8: ... computing platform ...

> This question was absent in 2018-2019, the number of empty answers in 2020-2021 is minimal.
> 
> The most popular answer is "A personal computer".
> 
> If we analyze how the answers are distributed by gender, we can see that the percentage of women decreases from the option "Other" to the option "A deep learning workstation".

In [None]:
sa_amount(sa8, is_empty=False)

In [None]:
sa_amount(sa8)

In [None]:
sa_trim_to(4, sa8)

In [None]:
sa_agender(sa8)

In [None]:
sa_edugender(sa8)

In [None]:
sa_plot_amount(sa8, 'gender')

In [None]:
sa_plot_age(sa8)

### SA 9: ... used a TPU ...

> This question has answers only for 2020-2021, if we look at those who have used TPU one or more times - their number is growing, but not much.

In [None]:
sa_amount(sa9, is_empty=False)

In [None]:
sa_amount(sa9)

In [None]:
sa_agender(sa9)

In [None]:
sa_edugender(sa9)

In [None]:
sa_plot_amount(sa9, 'year')

In [None]:
sa_plot_age(sa9)

### SA 10: ... used machine learning ...

> Most of those who answered this question indicated less than 2 years of using machine learning methods. In second place were those who either did not answer or indicated the answer "I do not use ML".
> 
> At the same time, in 2019 there is no such possibility, but if we look at the number of empty ones, we can see that this was an alternative to the answer "I do not use ML".
> 
> If we analyze how the answers are distributed by gender, we can see that the percentage of women decreases from option "I do not use ML" to option "2-3" and after that it does not decrease much.

In [None]:
sa_amount(sa10, is_empty=False)

In [None]:
sa_agender(sa10)

In [None]:
sa_edugender(sa10)

In [None]:
sa_plot_amount(sa10, 'gender', is_empty=False)

In [None]:
sa_plot_age(sa10)

### SA 11: ... industry ...

> This question was in 2018 and 2021, it has the largest list of answer options, so we will shorten it.
> 
> The two most popular options are "Computers/Technology" and "Academics/Education".
> 
> The "Manufacturing/Fabrication" answer had an upward change.

In [None]:
sa_amount(sa11)

In [None]:
sa_trim_to(8, sa11)

In [None]:
sa_agender(sa11)

In [None]:
sa_edugender(sa11)

In [None]:
sa_plot_amount(sa11, 'year')

In [None]:
sa_plot_age(sa11)

### SA 12: ... size of the company ...

> The most popular answer to this question is "0-49 employees". In second place is the "10,000+ employees" option.
> 
> If we look at the countries, we can see that in the USA and India this response has the highest percentage.

In [None]:
sa_amount(sa12)

In [None]:
sa_agender(sa12)

In [None]:
sa_edugender(sa12)

In [None]:
sa_plot_amount(sa12, 'country')

In [None]:
sa_plot_age(sa12)

### SA 13: ... individuals are responsible for data science workloads ...

In [None]:
sa_amount(sa13, is_empty=False)

In [None]:
sa_agender(sa13)

In [None]:
sa_edugender(sa13)

In [None]:
sa_plot_age(sa13)

### SA 14: ... employer incorporate machine learning methods ...

> This question has the largest increase in empty responses. If we look at the countries, we can see that India has the highest percentage of such responses.

In [None]:
sa_amount(sa14, is_empty=False)

In [None]:
sa_agender(sa14)

In [None]:
sa_edugender(sa14)

In [None]:
sa_plot_amount(sa14, 'year', is_empty=False)

In [None]:
sa_plot_amount(sa14, 'country', is_empty=False)

In [None]:
sa_plot_age(sa14)

### SA 15: ... yearly compensation ...

> The number of respondents to this question depends on the answer option; the higher the amount, the fewer respondents.
> 
> The graph shows that the larger the amount, the more respondents from the USA.

In [None]:
sa_amount(sa15)

In [None]:
sa_agender(sa15)

In [None]:
sa_edugender(sa15)

In [None]:
sa_plot_amount(sa15, 'country')

In [None]:
sa_plot_age(sa15)

### SA 16: ... spent on machine learning ...

In [None]:
sa_amount(sa16)

In [None]:
sa_agender(sa16)

In [None]:
sa_edugender(sa16)

In [None]:
sa_plot_amount(sa16, 'country')

In [None]:
sa_plot_age(sa16)

### SA 17: ... cloud platforms ...

In [None]:
sa_amount(sa17, is_empty=False)

In [None]:
sa_amount(sa17)

In [None]:
sa_trim_to(6, sa17)

In [None]:
sa_agender(sa17)

In [None]:
sa_edugender(sa17)

In [None]:
sa_plot_amount(sa17, 'country')

In [None]:
sa_plot_age(sa17)

### SA 18: ... big data products ...

In [None]:
sa_amount(sa18)

In [None]:
sa_trim_to(8, sa18)

In [None]:
sa_agender(sa18)

In [None]:
sa_edugender(sa18)

In [None]:
sa_plot_amount(sa18, 'country')

In [None]:
sa_plot_age(sa18)

### SA 19: ... business intelligence tools ...

In [None]:
sa_amount(sa19)

In [None]:
sa_trim_to(4, sa19)

In [None]:
sa_agender(sa19)

In [None]:
sa_edugender(sa19)

In [None]:
sa_plot_amount(sa19, 'country')

In [None]:
sa_plot_amount(sa19, 'gender')

In [None]:
sa_plot_age(sa19)

### SA 20: ... primary tool ...

In [None]:
sa_amount(sa20)

In [None]:
sa_agender(sa20)

In [None]:
sa_edugender(sa20)

In [None]:
sa_plot_amount(sa20, 'year', is_empty=False)

In [None]:
sa_plot_age(sa20)

# 4. About Answers (group)

> In this section, we analyze group question-answers (there is a column for each answer option).
> 
> We will also analyze in terms of age, gender, country of the respondent and the year of the survey.
> 
> For this task, we will also create a separate dataframe, which includes group questions and data about respondents.

In [None]:
def get_ga_info(df):
    """Summarise how densely a block of answer columns is filled.

    Args:
        df: Frame holding the answer columns of one question group.

    Returns:
        A one-row DataFrame with the columns
        ``Cols`` (number of columns in ``df``),
        ``Empty`` (mean share of missing cells, formatted like ``"50.0%"``)
        and ``min``, ``25%``, ``50%``, ``75%``, ``max`` (distribution of the
        number of non-missing cells per row, truncated to int).
    """
    empty_share = round(df.isna().mean().mean() * 100, 2)

    summary = pd.DataFrame({'Cols': [df.shape[1]],
                            'Empty': ["{}%".format(empty_share)]})

    # How many of the columns each row (respondent) has filled in.
    answers_per_row = df.notna().sum(axis=1)

    # describe() of the unnamed Series becomes a one-row frame with index 0
    # after the transpose, which lines up with the index of `summary`.
    row_stats = pd.DataFrame(answers_per_row.describe()).T \
                  .loc[:, 'min':'max'].astype(int)

    return summary.join(row_stats)


def ga_info(df, groups_list, is_style=True):
    """Build an overview table with one row per question group.

    For every name in ``groups_list`` the columns whose name contains
    ``"<group>_"`` are selected, summarised with ``get_ga_info`` and labelled
    with the group's question text (the ``data_description`` entry of the
    first column, cut at "(Select all that apply)").

    Args:
        df: Frame containing the group columns.
        groups_list: Iterable of group names.
        is_style: If ``True``, return a pandas Styler with a background
            gradient over the ``min`` … ``max`` columns.

    Returns:
        A DataFrame indexed by group name, or a Styler when ``is_style`` is
        ``True`` and there is at least one group. An empty ``groups_list``
        yields an empty, unstyled DataFrame.
    """
    group_select = '(Select all that apply)'
    group_rows = []

    for group_name in groups_list:
        group_data = df.filter(like=(group_name + "_"))

        # Group title
        first_col = group_data.columns.to_list()[0]
        group_title = data_description.get(first_col).split(group_select)[0]

        # Group info
        group_result = get_ga_info(group_data)
        group_result['Question'] = group_title
        group_result.index = [group_name]

        group_rows.append(group_result)

    # DataFrame.append was removed in pandas 2.0; collecting the rows and
    # concatenating once also avoids re-copying the table on every iteration.
    result = pd.concat(group_rows) if group_rows else pd.DataFrame()

    if is_style == True and not result.empty:
        color_cols = result.loc[:, 'min':'max'].columns.to_list()

        # axis=None: one colour scale shared by all highlighted columns.
        # (Styler.text_gradient with low=0.75, high=1.0 is a text-only
        # alternative to the background gradient.)
        result = result.style.background_gradient(subset=color_cols,
                                                  cmap=option_cm,
                                                  low=0.3, high=1.0,
                                                  axis=None)

    return result

In [None]:
ga_data = res_data.join(data.loc[res_data.index, multiple_cols])

print(ga_data.shape)

> Let's check the statistics, where we are interested in the average number of responses, as well as how many voids there were that will need to be analyzed separately.
> 
> It can be seen that the majority (75%) of the respondents chose 2 or more answer options in only a few questions.

In [None]:
ga_info(ga_data, multiple_groups)

## Check and Visualisation

> Let's start analyzing the answers, for this we will create the required number of variables for each available question.
> 
> In those groups where there are many empty columns (with answers), we will merge these answers into "Other".

In [None]:
def ga_amount(group_name, is_norm=False, is_sort=True, is_all=True, is_empty=True, is_style=True):
    """Print a group question and tabulate its answers.

    The columns of the module-level ``ga_data`` frame whose name contains
    ``"<group_name>_"`` are melted into a single answer column named
    ``group_name`` (the year column is kept as identifier) and passed to
    ``table_amount`` together with all flags.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups``, otherwise
        whatever ``table_amount`` returns.
    """
    if group_name not in multiple_groups:
        return False

    answer_cols = ga_data.filter(like=(group_name + "_")).columns.to_list()

    # The question text is the description of the first column, cut at the
    # "(Select all that apply)" marker.
    group_title = data_description.get(answer_cols[0]) \
                                  .split('(Select all that apply)')[0]

    long_data = pd.melt(ga_data,
                        id_vars=[res_cols_dict.get('year')],
                        value_vars=answer_cols,
                        value_name=group_name)

    print("\n>>> ({}) {}\n".format(group_name, group_title))

    return table_amount(long_data, group_name, is_norm, is_sort,
                        is_all, is_empty, is_style)


def ga_corr(group_name, upd='rows', method=None, plot_width=12):
    """Call ``answers_corr`` on the columns of one question group.

    The group's columns are those of the module-level ``ga_data`` frame whose
    name contains ``"<group_name>_"``; ``upd``, ``method`` and ``plot_width``
    are forwarded unchanged.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups``, otherwise
        the result of ``answers_corr``.
    """
    if group_name in multiple_groups:
        group_columns = ga_data.filter(like=(group_name + "_"))

        return answers_corr(group_columns, upd, method, plot_width)

    return False


def ga_describe(group_name):
    """Return ``DataFrame.describe()`` for the columns of one question group.

    The group's columns are those of the module-level ``ga_data`` frame whose
    name contains ``"<group_name>_"``.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups``, otherwise
        the ``describe()`` table.
    """
    if group_name in multiple_groups:
        return ga_data.filter(like=(group_name + "_")).describe()

    return False


def ga_trim_to(n_labels, group_name):
    """Fold the least-answered columns of a question group into its "Other" column.

    The group's columns (those of the module-level ``ga_data`` frame whose
    name contains ``"<group_name>_"``) are ranked by their number of answers.
    The ``n_labels - 1`` most answered non-"Other" columns are kept; in every
    remaining column each given answer is turned into "Other", merged into the
    group's "Other" column and the column is dropped. If the group has no
    "Other" column, ``"<group_name>_XXX"`` is created for that purpose.
    ``ga_data`` is modified.

    Args:
        n_labels: Target number of columns for the group, "Other" included.
        group_name: Name of the group; must be in ``multiple_groups``.

    Returns:
        ``False`` if the group is unknown or ``n_labels`` is smaller than 1,
        ``None`` if the group already has at most ``n_labels`` columns,
        ``True`` after the group was trimmed.
    """
    df = ga_data

    # n_labels < 1 would turn the slice below into a negative slice and trim
    # the wrong columns, so it is rejected like an unknown group.
    if group_name not in multiple_groups or n_labels < 1:
        return False

    group_data = df.filter(like=(group_name + "_"))

    if n_labels >= len(group_data.columns):
        return None

    # One row per column: its most frequent value ('top') and its number of
    # answers ('count'), most answered column first.
    group_stats = group_data.describe().loc[['top', 'count'], :].T \
                            .sort_values(by='count', ascending=False)

    other_name = "Other"

    # na=False: a column without any answer has a missing 'top'; it must
    # count as "not Other" instead of putting NaN into the boolean mask.
    is_other = group_stats['top'].str.contains(other_name, na=False).values

    cols_list = group_stats.index[~is_other].to_list()
    other_cols = group_stats.index[is_other].to_list()

    trim_list = cols_list[n_labels - 1:]

    if other_cols:
        other_col = other_cols[0]
    else:
        other_col = group_name + '_XXX'
        # object dtype, because the column is going to hold the "Other" label
        df[other_col] = pd.Series(np.nan, index=df.index, dtype=object)

    for x_col in trim_list:
        # Every given answer of a trimmed column counts as "Other".
        collapsed = df[x_col].mask(df[x_col].notna(), other_name)

        # Assign back instead of fillna(inplace=True) on `df[other_col]`: an
        # in-place call on a selected column acts on a temporary object and
        # is not propagated to the frame under pandas Copy-on-Write.
        df[other_col] = df[other_col].fillna(collapsed)

    df.drop(columns=trim_list, inplace=True)

    return True


def ga_plot_amount(group_name, type_info, is_empty=True, is_norm=True, is_sort=True):
    """Plot the answers of a question group split by a respondent attribute.

    ``type_info`` is a key of ``res_cols_dict``. The group's columns of the
    module-level ``ga_data`` frame are melted into one answer column named
    ``group_name``; the mapped attribute column is kept as identifier and
    renamed to ``type_info`` before the data is handed to ``plot_amount``.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups`` or
        ``type_info`` has no mapping, otherwise the result of ``plot_amount``.
    """
    hue_col = res_cols_dict.get(type_info)

    if not hue_col or group_name not in multiple_groups:
        return False

    answer_cols = ga_data.filter(like=(group_name + "_")).columns.to_list()

    long_data = pd.melt(ga_data, id_vars=[hue_col],
                        value_vars=answer_cols, value_name=group_name)
    long_data = long_data.rename(columns={hue_col: type_info})

    return plot_amount(long_data, group_name, type_info, is_empty, is_norm, is_sort)


def ga_plot_age(group_name, is_norm=True):
    """Call ``plot_age`` for the melted answers of one question group.

    The group's columns of the module-level ``ga_data`` frame are melted into
    one answer column named ``group_name``, with the column mapped to "age"
    in ``res_cols_dict`` kept as identifier.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups``, otherwise
        the result of ``plot_age``.
    """
    if group_name not in multiple_groups:
        return False

    answer_cols = ga_data.filter(like=(group_name + "_")).columns.to_list()
    age_col = res_cols_dict.get("age")

    long_data = pd.melt(ga_data, id_vars=[age_col],
                        value_vars=answer_cols, value_name=group_name)

    return plot_age(long_data, group_name, is_norm)


def ga_agender(saga_name, type_ge=2):
    """Call ``agender`` on the module-level ``ga_data`` frame.

    The group name is not validated here; ``saga_name`` and ``type_ge`` are
    passed through unchanged.
    """
    return agender(ga_data, saga_name, type_ge)


def ga_edugender(group_name, type_ge=2):
    """Call ``edugender`` for the melted answers of one question group.

    The gender column (looked up in ``res_cols_dict``) and the "SA4" column
    are taken from ``sa_data`` and placed next to the group's columns from
    ``ga_data``; the group's columns are then melted into one answer column
    named ``group_name``.

    Returns:
        ``False`` if ``group_name`` is not in ``multiple_groups``, otherwise
        the result of ``edugender``.
    """
    if group_name not in multiple_groups:
        return False

    answer_cols = ga_data.filter(like=(group_name + "_")).columns.to_list()

    edu_col = "SA4"  # the formal-education question (see the "SA 4" section)
    gender_col = res_cols_dict.get('gender')

    combined = pd.concat([sa_data[[gender_col, edu_col]],
                          ga_data[answer_cols]], axis=1)

    long_data = pd.melt(combined, id_vars=[edu_col, gender_col],
                        value_vars=answer_cols, value_name=group_name)

    return edugender(long_data, group_name, type_ge)

In [None]:
# Create one lower-case variable per question-group name
# (the names printed below, `ga0` and `ga2`, are two of them).
globals().update({group_id.lower(): group_id for group_id in multiple_groups})

print(ga0, ga2, "...")
print(*multiple_groups)

### GA 0: ... programming languages ...

In [None]:
ga_amount(ga0)

In [None]:
ga_trim_to(9, ga0)

In [None]:
ga_amount(ga0)

In [None]:
ga_agender(ga0)

In [None]:
ga_edugender(ga0)

In [None]:
ga_plot_amount(ga0, 'gender')

In [None]:
ga_plot_amount(ga0, 'country')

In [None]:
ga_plot_age(ga0)

### GA 1: ... IDE's ...

In [None]:
ga_amount(ga1)

In [None]:
ga_trim_to(10, ga1)

In [None]:
ga_agender(ga1)

In [None]:
ga_edugender(ga1)

In [None]:
ga_plot_amount(ga1, 'gender')

In [None]:
ga_plot_amount(ga1, 'year')

In [None]:
ga_plot_age(ga1)

### GA 2: ... hosted notebook ...

In [None]:
ga_amount(ga2)

In [None]:
ga_trim_to(6, ga2)

In [None]:
ga_agender(ga2)

In [None]:
ga_edugender(ga2)

In [None]:
ga_plot_amount(ga2, 'country')

In [None]:
ga_plot_amount(ga2, 'year')

In [None]:
ga_plot_age(ga2)

### GA 3: ... specialized hardware ...

In [None]:
ga_amount(ga3)

In [None]:
ga_agender(ga3)

In [None]:
ga_edugender(ga3)

In [None]:
ga_plot_amount(ga3, 'gender')

In [None]:
ga_plot_amount(ga3, 'country')

In [None]:
ga_plot_age(ga3)

### GA 4: ... visualization libraries ...

In [None]:
ga_amount(ga4)

In [None]:
ga_trim_to(7, ga4)

In [None]:
ga_agender(ga4)

In [None]:
ga_edugender(ga4)

In [None]:
ga_plot_amount(ga4, 'year')

In [None]:
ga_plot_age(ga4)

### GA 5: ... machine learning frameworks ...

In [None]:
ga_amount(ga5)

In [None]:
ga_trim_to(8, ga5)

In [None]:
ga_agender(ga5)

In [None]:
ga_edugender(ga5)

In [None]:
ga_plot_amount(ga5, 'gender')

In [None]:
ga_plot_age(ga5)

### GA 6: ... ML algorithms ...

In [None]:
ga_amount(ga6)

In [None]:
ga_trim_to(10, ga6)

In [None]:
ga_agender(ga6)

In [None]:
ga_edugender(ga6)

In [None]:
ga_plot_amount(ga6, 'country')

In [None]:
ga_plot_age(ga6)

### GA 7: ... computer vision methods ...

In [None]:
ga_amount(ga7)

In [None]:
ga_agender(ga7)

In [None]:
ga_edugender(ga7)

In [None]:
ga_plot_amount(ga7, 'country')

In [None]:
ga_plot_age(ga7)

### GA 8: ... natural language processing ...

In [None]:
ga_amount(ga8)

In [None]:
ga_agender(ga8)

In [None]:
ga_edugender(ga8)

In [None]:
ga_plot_amount(ga8, 'country')

In [None]:
ga_plot_age(ga8)

### GA 9: ... role at work ...

In [None]:
ga_amount(ga9)

In [None]:
ga_agender(ga9)

In [None]:
ga_edugender(ga9)

In [None]:
ga_plot_amount(ga9, 'country')

In [None]:
ga_plot_age(ga9)

### GA 10: ... cloud computing platforms ... use ...

In [None]:
ga_amount(ga10)

In [None]:
ga_trim_to(5, ga10)

In [None]:
ga_agender(ga10)

In [None]:
ga_edugender(ga10)

In [None]:
ga_plot_amount(ga10, 'country')

In [None]:
ga_plot_age(ga10)

### GA 11: ... cloud computing platforms ... more familiar ...

In [None]:
ga_amount(ga11)

In [None]:
ga_trim_to(7, ga11)

In [None]:
ga_agender(ga11)

In [None]:
ga_edugender(ga11)

In [None]:
ga_plot_amount(ga11, 'gender')

In [None]:
ga_plot_age(ga11)

### GA 14: ... big data products ... use ...

In [None]:
ga_amount(ga14)

In [None]:
ga_agender(ga14)

In [None]:
ga_edugender(ga14)

In [None]:
# ga_plot_amount(ga14, 'country')

In [None]:
ga_plot_age(ga14)

### GA 15: ... big data products ... familiar ...

In [None]:
ga_amount(ga15)

In [None]:
ga_agender(ga15)

In [None]:
ga_edugender(ga15)

In [None]:
# ga_plot_amount(ga15, 'country')

In [None]:
ga_plot_age(ga15)

### GA 16: ... business intelligence tools ... use ...

In [None]:
ga_amount(ga16)

In [None]:
ga_trim_to(5, ga16)

In [None]:
ga_agender(ga16)

In [None]:
ga_edugender(ga16)

In [None]:
ga_plot_amount(ga16, 'country')

In [None]:
ga_plot_age(ga16)

### GA 17: ... business intelligence tools ... familiar ...

In [None]:
ga_amount(ga17)

In [None]:
ga_trim_to(5, ga17)

In [None]:
ga_agender(ga17)

In [None]:
ga_edugender(ga17)

In [None]:
ga_plot_amount(ga17, 'country')

In [None]:
ga_plot_age(ga17)

### GA 18: ... automated machine learning tools ... use ...

In [None]:
ga_amount(ga18)

In [None]:
ga_agender(ga18)

In [None]:
ga_edugender(ga18)

In [None]:
ga_plot_amount(ga18, 'country')

In [None]:
ga_plot_age(ga18)

### GA 19: ... automated machine learning tools ... familiar ...

In [None]:
ga_amount(ga19)

In [None]:
ga_agender(ga19)

In [None]:
ga_edugender(ga19)

In [None]:
ga_plot_amount(ga19, 'country')

In [None]:
ga_plot_age(ga19)

### GA 20: ... automated machine learning tools ... use ...

In [None]:
ga_amount(ga20)

In [None]:
ga_agender(ga20)

In [None]:
ga_edugender(ga20)

In [None]:
ga_plot_amount(ga20, 'country')

In [None]:
ga_plot_age(ga20)

### GA 21: ... automated machine learning tools ... familiar ...

In [None]:
ga_amount(ga21)

In [None]:
ga_agender(ga21)

In [None]:
ga_edugender(ga21)

In [None]:
ga_plot_amount(ga21, 'country')

In [None]:
ga_plot_age(ga21)

### GA 24: ... publicly share ...

In [None]:
ga_amount(ga24)

In [None]:
ga_trim_to(6, ga24)

In [None]:
ga_agender(ga24)

In [None]:
ga_edugender(ga24)

In [None]:
ga_plot_amount(ga24, 'country')

In [None]:
ga_plot_age(ga24)

### GA 25: ... begun or completed data science courses ...

In [None]:
ga_amount(ga25)

In [None]:
ga_agender(ga25)

In [None]:
ga_edugender(ga25)

In [None]:
ga_plot_amount(ga25, 'gender')

### GA 26: ... media sources ...

In [None]:
ga_amount(ga26)

In [None]:
ga_agender(ga26)

In [None]:
ga_edugender(ga26)

In [None]:
ga_plot_amount(ga26, 'year')

In [None]:
ga_plot_age(ga26)

# 5. Clustering

> The three resulting dataframes are merged, and we then try to cluster the respondents.
> 
> When merging the data, we choose what to exclude - the individual characteristics of the respondents and/or the questions whose share of empty answers exceeds a threshold.

In [None]:
def get_empty_sa(more_than=None):
    """List the single-answer columns that are mostly empty.

    Only the columns of ``sa_data`` that are not also columns of ``res_data``
    are considered.

    Args:
        more_than: Threshold in percent. ``0`` short-circuits to an empty
            list; ``None`` (or any other falsy value) returns the
            ``sa_info`` overview instead of a list.

    Returns:
        The names of the columns whose share of missing values is strictly
        greater than ``more_than`` percent, or one of the special results
        described above.
    """
    answer_cols = [name for name in sa_data.columns
                   if name not in res_data.columns]

    if more_than == 0:
        return []

    if not more_than:
        return sa_info(sa_data, answer_cols)

    sparse_cols = []

    for name in answer_cols:
        empty_pct = sa_data[name].isna().mean() * 100

        # A falsy column name (e.g. "") is never reported.
        if empty_pct > more_than and name:
            sparse_cols.append(name)

    return sparse_cols


def get_empty_ga(more_than=None):
    """List the question groups that are mostly empty.

    A group's emptiness is the mean share of missing values over its columns
    (the columns of ``ga_data`` whose name contains ``"<group>_"``).

    Args:
        more_than: Threshold in percent. ``0`` short-circuits to an empty
            list; ``None`` (or any other falsy value) returns the
            ``ga_info`` overview instead of a list.

    Returns:
        The names of the groups from ``multiple_groups`` whose emptiness is
        strictly greater than ``more_than`` percent, or one of the special
        results described above.
    """
    if more_than == 0:
        return []

    if not more_than:
        return ga_info(ga_data, multiple_groups)

    sparse_groups = []

    for name in multiple_groups:
        group_columns = ga_data.filter(like=(name + "_"))
        empty_pct = group_columns.isna().mean().mean() * 100

        # A falsy group name (e.g. "") is never reported.
        if empty_pct > more_than and name:
            sparse_groups.append(name)

    return sparse_groups


def transform_res(method, skip_cols=None, is_drop_first=False):
    """Run ``transform_cols`` on the respondent columns of ``res_data``.

    Args:
        method: Passed through to ``transform_cols``.
        skip_cols: Optional keys of ``res_cols_dict``; the columns they map
            to are left out.
        is_drop_first: Passed through to ``transform_cols``.

    Returns:
        The DataFrame returned by ``transform_cols``; if it returns anything
        else, an empty DataFrame that only carries the index of ``res_data``.
    """
    keep_cols = res_data.columns.to_list()

    if skip_cols:
        excluded = [res_cols_dict.get(key) for key in skip_cols]
        keep_cols = [name for name in keep_cols if name not in excluded]

    encoded = transform_cols(res_data[keep_cols], method, is_drop_first)

    if isinstance(encoded, pd.DataFrame):
        return encoded

    # Index-only frame, so that the result can still be used in a join.
    return pd.DataFrame(index=res_data.index)


def transform_sa(method, skip_cols=None):
    """Run ``transform_cols`` on the single-answer columns of ``sa_data``.

    The columns that ``sa_data`` shares with ``res_data`` are left out, as
    are the column names listed in ``skip_cols`` (if any).

    Returns:
        Whatever ``transform_cols`` returns for the selected columns.
    """
    excluded = skip_cols if skip_cols else []

    keep_cols = [name for name in sa_data.columns
                 if name not in res_data.columns and name not in excluded]

    return transform_cols(sa_data[keep_cols], method)


def transform_ga(method, skip_groups=None):
    """Run ``transform_cols`` on every question group and join the results.

    Each group of ``multiple_groups`` that is not listed in ``skip_groups``
    is transformed separately (its columns are those of ``ga_data`` whose
    name contains ``"<group>_"``); the results are concatenated column-wise.

    Args:
        method: Passed through to ``transform_cols``.
        skip_groups: Optional collection of group names to leave out.
            ``None`` (the default) skips nothing.

    Returns:
        The concatenated DataFrame, or an empty DataFrame that only carries
        the index of ``ga_data`` when no group is left to transform.
    """
    # Tolerate the default None instead of failing on len(None).
    skipped = list(skip_groups) if skip_groups else []

    encoded_groups = [
        transform_cols(ga_data.filter(like=(group_name + "_")), method)
        for group_name in multiple_groups
        if group_name not in skipped
    ]

    # Decide by what is actually left rather than by comparing list lengths:
    # pd.concat raises on an empty list.
    if not encoded_groups:
        return pd.DataFrame(index=ga_data.index)

    return pd.concat(encoded_groups, axis=1)

## 5.1 Extract & Predict

In [None]:
# Check empty
# get_empty_sa()
# get_empty_ga()

In [None]:
skip_res = ['country', 'gender', 'age']  # 'country', 'gender', 'age'
skip_sa = get_empty_sa(more_than=50)  # 50% empty
skip_ga = get_empty_ga(more_than=80)  # 80% empty

print(skip_sa)
print(skip_ga)

In [None]:
%%time
X = transform_res('onehot', skip_res, is_drop_first=False) \
    .join(transform_sa('cols', skip_sa)) \
    .join(transform_ga('rows', skip_ga))

In [None]:
X.shape

In [None]:
X.info(verbose=False, memory_usage='deep')

In [None]:
%%time
n_clusters = 6  # randint(2, 22)

kmeans = KMeans(n_clusters=n_clusters, random_state=option_rs)
X_clusters = kmeans.fit_predict(X)

## 5.2 PCA

In [None]:
%%time
pca = PCA(n_components=3, random_state=option_rs)
X_pca = pca.fit_transform(X)

In [None]:
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    X_pca, x=0, y=1, z=2, color=X_clusters,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    width=800, height=600
)

fig.show()

In [None]:
# === KernelPCA ===
# Insufficient memory to process all data (to allocate more memory ...)
# The experiment therefore runs on a leading slice of X only and is switched
# off by default.
# To time it, put `%%time` on the very first line of this cell: a cell magic
# is not recognised anywhere else in a cell.

test_kernel_pca = False

if test_kernel_pca == True:

    x_data_trim_to = 0.3139  # 2021 year
    x_data_length = int(X.shape[0] * x_data_trim_to)

    X_short = X.iloc[:x_data_length]
    print(X_short.shape)

    # Use a separate estimator, so that the `kmeans` model fitted on the full
    # X is not silently refitted on the subset.
    kmeans_short = KMeans(n_clusters=n_clusters, random_state=option_rs)
    X_short_clusters = kmeans_short.fit_predict(X_short)

    kernels = ['linear', 'rbf', 'poly', 'sigmoid', 'cosine']
    check_kernel = kernels[1]

    kpca = KernelPCA(n_components=3, kernel=check_kernel, random_state=option_rs)
    X_short_kpca = kpca.fit_transform(X_short)

    fig = px.scatter_3d(
        X_short_kpca, x=0, y=1, z=2, color=X_short_clusters,
        labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
        width=800, height=600
    )

    fig.show()

## 5.3 Analysis

In [None]:
def get_saga_info(df, saga_info):
    """Select one characteristic together with the "Clusters" column.

    ``saga_info`` is resolved in this order:

    1. a key of ``res_cols_dict``: the mapped column is copied and renamed to
       the key; rows are sorted for 'age', values are cast to str for 'year';
    2. a name containing "SA": the column of that name, if it is listed in
       ``single_cols``;
    3. a name containing "GA": the group's columns (name contains
       ``"<group>_"``) melted into one column, if the name is listed in
       ``multiple_groups``.

    Returns:
        A two-column DataFrame (the characteristic and "Clusters"), ``False``
        for an unknown "SA"/"GA" name, or ``None`` if no rule applies.
    """
    cluster_col = "Clusters"

    # 1. Respondent attribute
    if saga_info in res_cols_dict:
        source_col = res_cols_dict.get(saga_info)

        subset = df[[source_col, cluster_col]].copy()
        subset = subset.rename(columns={source_col: saga_info})

        if saga_info == 'age':
            subset = subset.sort_values(by=saga_info)

        if saga_info == 'year':
            subset[saga_info] = subset[saga_info].astype(str)

        return subset

    # 2. Single-answer question
    if 'SA' in saga_info:
        if saga_info in single_cols:
            return df[[saga_info, cluster_col]]

        return False

    # 3. Group question: one row per (respondent, answer column)
    if 'GA' in saga_info:
        if saga_info in multiple_groups:
            answer_cols = df.filter(like=(saga_info + "_")).columns.to_list()

            melted = pd.melt(df, id_vars=[cluster_col],
                             value_vars=answer_cols, value_name=saga_info)

            return melted[[saga_info, cluster_col]]

        return False

    return None


def clusters_amount(saga_info, is_norm=True, is_sort=True, is_all=True, is_empty=True):
    """Cross-tabulate one characteristic against the cluster labels.

    Works on the module-level ``clusters_data`` frame; ``saga_info`` is
    resolved by ``get_saga_info``.

    Args:
        saga_info: Respondent attribute key, single-answer column or group name.
        is_norm: If ``True``, each row is normalised and shown in percent
            (rounded to 2 decimals); otherwise raw counts are shown.
        is_sort: If ``True`` and an 'All' column exists, sort by it, descending.
        is_all: Passed to ``pd.crosstab`` as ``margins``.
        is_empty: If ``False``, missing answers are replaced by '-Empty-'
            before counting, so they get their own row.

    Returns:
        The crosstab (answers as rows, clusters as columns), or ``False`` if
        ``get_saga_info`` cannot resolve ``saga_info``.
    """
    df = clusters_data
    col_clusters = "Clusters"

    result = get_saga_info(df, saga_info)

    # get_saga_info signals an unknown name with False/None; stop here
    # instead of failing further down with an unrelated error.
    if not isinstance(result, pd.DataFrame):
        return False

    if is_empty == False:
        result = result.fillna('-Empty-')

    if is_norm == True:
        how_normalize = 'index'
    else:
        how_normalize = False

    result = pd.crosstab(result[saga_info], result[col_clusters], margins=is_all,
                         rownames=['Answer'], normalize=how_normalize)

    if how_normalize:
        result = result.mul(100).round(2)

    if is_sort == True and 'All' in result.columns:
        result = result.sort_values('All', ascending=False)

    return result
        
        
def clusters_plot(saga_info, is_empty=True, is_norm=False, is_sort=True):
    """Call ``plot_amount`` for the cluster labels split by one characteristic.

    Works on the module-level ``clusters_data`` frame; ``saga_info`` is
    resolved by ``get_saga_info``. Both columns are converted to str before
    plotting.

    Returns:
        The result of ``plot_amount``, or ``False`` if ``get_saga_info``
        cannot resolve ``saga_info``.
    """
    df = clusters_data
    col_clusters = "Clusters"

    data = get_saga_info(df, saga_info)

    # get_saga_info signals an unknown name with False/None; stop here
    # instead of failing on the column selection below.
    if not isinstance(data, pd.DataFrame):
        return False

    # NOTE(review): astype(str) also turns missing values into the string
    # 'nan' - confirm that plot_amount's `is_empty` handling expects that.
    plot_data = data[[saga_info, col_clusters]].copy().astype(str)

    return plot_amount(plot_data, col_clusters, saga_info, is_empty, is_norm, is_sort)
    

def clusters_agender(type_ge=2):
    """Call ``agender`` for the 'Clusters' column of ``clusters_data``.

    ``type_ge`` is passed through unchanged.
    """
    return agender(clusters_data, 'Clusters', type_ge)

In [None]:
clusters = pd.Series(data=X_clusters, index=res_data.index, name='Clusters')
clusters_data = res_data \
                .join(sa_data.drop(res_data.columns, axis=1)) \
                .join(ga_data.drop(res_data.columns, axis=1)) \
                .join(clusters)

print(clusters_data.shape)

In [None]:
clusters_amount('age', is_norm=False)

In [None]:
clusters_agender()

In [None]:
clusters_plot('age')

In [None]:
clusters_amount('gender')

In [None]:
clusters_plot('gender')

In [None]:
# ...