In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.graph_objects as go

In [None]:
# Apply seaborn's 'white' style to the plots drawn later in the notebook.
sns.set_style('white')

# Use fully qualified option names: the short forms ('max_columns', 'max_rows') are
# resolved by pattern matching and become ambiguous on newer pandas, where
# 'styler.render.max_columns' / 'styler.render.max_rows' also match.
pd.set_option('display.max_columns', 400)  # pandas default: 20
pd.set_option('display.max_rows', 400)  # pandas default: 60
pd.set_option('display.max_colwidth', 400)  # pandas default: 50

# Directory that holds the survey CSV read below (relative to the working directory).
DATA_PATH = Path('../input/kaggle-survey-2020/')

## Load data

In [None]:
# Load the responses. `skiprows=[1]` drops the line directly under the header;
# that line is read separately further down to recover the question text.
raw_df = pd.read_csv(
    DATA_PATH.joinpath('kaggle_survey_2020_responses.csv'),
    skiprows=[1],
)
raw_df.shape

In [None]:
# Read only the first row under the header; the question text is extracted from it.
questions = pd.read_csv(DATA_PATH/'kaggle_survey_2020_responses.csv', nrows=1)

# Raw strings are used for the patterns so that `\d` is a regex escape rather than an
# (invalid) Python string escape. ':' and '?' need no escaping inside a character class.
questions_df = pd.DataFrame({
    # Question id taken from the column name, e.g. 'Q7' or 'Q26_A' (parts share one id).
    'question_id': questions.columns.str.extract(r'(Q\d+(_[AB])*)')[0].unique(),
    # Question text up to and including its last ':' or '?'.
    'question_text': questions.loc[0].str.extract(r'(.*[:?]).*')[0].unique()
}).iloc[1:]  # drop the first entry (presumably the no-match value from the timing column — verify)

# Sort by the id without its leading 'Q', zero-padded so that 'Q2' sorts before 'Q10'.
questions_df = questions_df.sort_values('question_id', key=lambda x: x.str[1:].str.zfill(2))

questions_df.shape

## Preprocess questions data

In [None]:
def preprocess_multiple_choices(s):
    """Collapse one respondent's multiple-choice cells into a sorted tuple.

    Parameters
    ----------
    s : iterable
        The cells of a multiple-choice question for a single respondent
        (e.g. one row of ``raw_df.filter(regex='Q7')``); unselected options
        are expected to be missing values.

    Returns
    -------
    tuple of str
        The selected options with surrounding whitespace removed,
        de-duplicated and sorted. Empty when nothing was selected.
    """
    # `pd.notna` drops every missing value, not only the exact `np.nan` object,
    # so a stray NaN can no longer reach `sorted` and be compared with strings.
    # Stripping happens before de-duplication and sorting so the returned tuple
    # is really sorted and free of duplicates that differ only by whitespace.
    cleaned = {str(choice).strip() for choice in s if pd.notna(choice)}
    return tuple(sorted(cleaned))

def num_range_sort_order(s):
    """Return the first number in a range label, for use as a sort key.

    Thousands separators are ignored, so ``'1,000-9,999 employees'`` gives
    ``1000`` and ``'> $500,000'`` gives ``500000``.

    Parameters
    ----------
    s : str
        A label containing at least one number.

    Returns
    -------
    int
        The first number found in ``s``.

    Raises
    ------
    ValueError
        If ``s`` contains no digits.
    """
    # The match must start with a digit; otherwise a comma that belongs to the
    # surrounding text (e.g. 'Roughly, 250-999') would be picked up as the number.
    match = re.search(r'\d[\d,]*', s)
    if match is None:
        raise ValueError(f'no number found in {s!r}')
    return int(match.group().replace(',', ''))

In [None]:
# Start from an empty frame; the statements below add one renamed column per question.
df = pd.DataFrame()

### Duration (in seconds)

# Taken over unchanged from the raw responses.
df['duration'] = raw_df['Time from Start to Finish (seconds)'].copy()

### Question 1 ("What is your age (# years)?")

df['age'] = raw_df['Q1'].astype('category')
age_categories = df['age'].cat.categories.tolist()
# Keep the categories pandas inferred, but mark them as ordered. The result is
# assigned back because `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['age'] = df['age'].cat.set_categories(age_categories, ordered=True)

### Question 2 ("What is your gender?")

df['gender'] = raw_df['Q2'].astype('category')
genders = ['Man', 'Woman', 'Nonbinary', 'Prefer not to say', 'Prefer to self-describe']
# Fix the category order. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['gender'] = df['gender'].cat.set_categories(genders)

### Question 3 ("In which country do you currently reside?")

# Single-choice answer stored as a categorical (categories inferred from the data).
df['country'] = raw_df['Q3'].astype('category')

### Question 4 ("What is the highest level of formal education that you have attained or plan to attain within the next 2 years?")

df['highest_education_level'] = raw_df['Q4'].astype('category')

education_levels = [
    'No formal education past high school',
    'Some college/university study without earning a bachelor’s degree',
    'Bachelor’s degree',
    'Master’s degree',
    'Doctoral degree',
    'Professional degree',
    'I prefer not to answer',
]
# Fix the category order. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['highest_education_level'] = df['highest_education_level'].cat.set_categories(education_levels)

### Question 5 ("Select the title most similar to your current role (or most recent title if retired)")

df['job_title'] = raw_df['Q5'].astype('category')

job_titles = [
    'Business Analyst',
    'Data Analyst', 'Data Engineer', 'Data Scientist', 'DBA/Database Engineer',
    'Machine Learning Engineer', 'Product/Project Manager',
    'Research Scientist', 'Software Engineer', 'Statistician', 'Student',
    'Currently not employed', 'Other'
]
# Fix the category order. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['job_title'] = df['job_title'].cat.set_categories(job_titles)

### Question 6 ("For how many years have you been writing code and/or programming?")

df['programming_experience'] = raw_df['Q6'].astype('category')

programming_experience_years = [
    'I have never written code',
    '< 1 years', '1-2 years', '3-5 years',
    '5-10 years', '10-20 years', '20+ years'
]
# Fix the category order. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['programming_experience'] = df['programming_experience'].cat.set_categories(programming_experience_years)

### Question 7 ("What programming languages do you use on a regular basis?")

# Multiple-choice question: every 'Q7*' column is one option; each respondent's
# selections are collapsed into a single tuple.
df['programming_languages'] = (raw_df.filter(regex='Q7')
                                     .apply(preprocess_multiple_choices, axis='columns'))

### Question 8 ("What programming language would you recommend an aspiring data scientist to learn first?")

# Single-choice question stored as a categorical.
df['programming_language_to_learn'] = raw_df['Q8'].astype('category')

### Question 9 ("Which of the following integrated development environments (IDE's) do you use on a regular basis?")

df['ide'] = (raw_df.filter(regex='Q9')
                   .apply(preprocess_multiple_choices, axis='columns'))

### Question 10 ("Which of the following hosted notebook products do you use on a regular basis?")

df['hosted_notebooks'] = (raw_df.filter(regex='Q10')
                                .apply(preprocess_multiple_choices, axis='columns'))

### Question 11 ("What type of computing platform do you use most often for your data science projects?")

df['computing_platform'] = raw_df['Q11'].astype('category')

### Question 12 ("Which types of specialized hardware do you use on a regular basis?")

df['specialized_hardware'] = (raw_df.filter(regex='Q12')
                                    .apply(preprocess_multiple_choices, axis='columns'))

### Question 13 ("Approximately how many times have you used a TPU (tensor processing unit)?")

df['tpu_using_count'] = raw_df['Q13'].astype('category')

tpu_using_counts = [
    'Never', 'Once',
    '2-5 times', '6-25 times', 'More than 25 times'
]
# Ordered from least to most usage. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['tpu_using_count'] = df['tpu_using_count'].cat.set_categories(tpu_using_counts, ordered=True)

### Question 14 ("What data visualization libraries or tools do you use on a regular basis?")

# Multiple-choice question: collapse the 'Q14*' option columns into one tuple per respondent.
df['visualization_libraries'] = (raw_df.filter(regex='Q14')
                                       .apply(preprocess_multiple_choices, axis='columns'))

### Question 15 ("For how many years have you used machine learning methods?")

df['ml_experience'] = raw_df['Q15'].astype('category')

ml_experience_years = [
    'I do not use machine learning methods', 'Under 1 year', '1-2 years',
    '2-3 years', '3-4 years', '4-5 years', '5-10 years',
    '10-20 years', '20 or more years'
]
# Ordered from least to most experience. The result is assigned back because
# `set_categories(..., inplace=True)` was removed in pandas 2.0.
df['ml_experience'] = df['ml_experience'].cat.set_categories(ml_experience_years, ordered=True)

### Question 16 ("Which of the following machine learning frameworks do you use on a regular basis?")

# Questions 16-19 are multiple-choice: the option columns of each question are
# collapsed into one tuple of selected answers per respondent.
df['ml_frameworks'] = (raw_df.filter(regex='Q16')
                             .apply(preprocess_multiple_choices, axis='columns'))

### Question 17 ("Which of the following ML algorithms do you use on a regular basis?")

df['ml_algorithms'] = (raw_df.filter(regex='Q17')
                             .apply(preprocess_multiple_choices, axis='columns'))

### Question 18 ("Which categories of computer vision methods do you use on a regular basis?")

df['cv_methods'] = (raw_df.filter(regex='Q18')
                          .apply(preprocess_multiple_choices, axis='columns'))

### Question 19 ("Which of the following natural language processing (NLP) methods do you use on a regular basis?")

df['nlp_methods'] = (raw_df.filter(regex='Q19')
                           .apply(preprocess_multiple_choices, axis='columns'))

### Question 20 ("What is the size of the company where you are employed?")

df['company_size'] = raw_df['Q20'].astype('category')

# Order the size ranges by the first number in each label. The result is assigned
# back because `set_categories(..., inplace=True)` was removed in pandas 2.0.
company_size = sorted(df['company_size'].cat.categories, key=num_range_sort_order)
df['company_size'] = df['company_size'].cat.set_categories(company_size, ordered=True)

### Question 21 ("Approximately how many individuals are responsible for data science workloads at your place of business?")

df['company_ds_count'] = raw_df['Q21'].astype('category')

# Order the head-count ranges by the first number in each label. The result is assigned
# back because `set_categories(..., inplace=True)` was removed in pandas 2.0.
company_ds_count = sorted(df['company_ds_count'].cat.categories, key=num_range_sort_order)
df['company_ds_count'] = df['company_ds_count'].cat.set_categories(company_ds_count, ordered=True)

### Question 22 ("Does your current employer incorporate machine learning methods into their business?")

# Single-choice question stored as a categorical.
df['is_ml_used'] = raw_df['Q22'].astype('category')

### Question 23 ("Select any activities that make up an important part of your role at work")

# Multiple-choice question: one tuple of selected options per respondent.
df['analytics_activities'] = (raw_df.filter(regex='Q23')
                                    .apply(preprocess_multiple_choices, axis='columns'))

### Question 24 ("What is your current yearly compensation (approximate $USD)?")

df['yearly_compensation'] = raw_df['Q24'].astype('category')

# Order the compensation ranges by the first number in each label. The result is assigned
# back because `set_categories(..., inplace=True)` was removed in pandas 2.0.
yearly_compensation = sorted(df['yearly_compensation'].cat.categories, key=num_range_sort_order)
df['yearly_compensation'] = df['yearly_compensation'].cat.set_categories(yearly_compensation, ordered=True)

### Question 25 ("Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?")

df['ml_money_spent_5_years'] = raw_df['Q25'].astype('category')

# Order the spending ranges by the first number in each label. The result is assigned
# back because `set_categories(..., inplace=True)` was removed in pandas 2.0.
ml_money_spent_5_years = sorted(df['ml_money_spent_5_years'].cat.categories, key=num_range_sort_order)
df['ml_money_spent_5_years'] = df['ml_money_spent_5_years'].cat.set_categories(ml_money_spent_5_years, ordered=True)

### Question 26-A ("Which of the following cloud computing platforms do you use on a regular basis?")

# Questions 26-29 each have an "A" and a "B" variant, both multiple-choice. Every variant's
# option columns are collapsed into one tuple of selected answers per respondent.
df['cloud_platforms_a'] = (raw_df.filter(regex='Q26_A')
                                 .apply(preprocess_multiple_choices, axis='columns'))

### Question 26-B ("Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years?")

df['cloud_platforms_b'] = (raw_df.filter(regex='Q26_B')
                                 .apply(preprocess_multiple_choices, axis='columns'))

### Question 27-A ("Do you use any of the following cloud computing products on a regular basis?")

df['cloud_products_a'] = (raw_df.filter(regex='Q27_A')
                                .apply(preprocess_multiple_choices, axis='columns'))

### Question 27-B ("In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products?")

df['cloud_products_b'] = (raw_df.filter(regex='Q27_B')
                                .apply(preprocess_multiple_choices, axis='columns'))

### Question 28-A ("Do you use any of the following machine learning products on a regular basis?")

df['ml_products_a'] = (raw_df.filter(regex='Q28_A')
                             .apply(preprocess_multiple_choices, axis='columns'))

### Question 28-B ("In the next 2 years, do you hope to become more familiar with any of these specific machine learning products?")

df['ml_products_b'] = (raw_df.filter(regex='Q28_B')
                             .apply(preprocess_multiple_choices, axis='columns'))

### Question 29-A ("Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis?")

df['big_data_products_a'] = (raw_df.filter(regex='Q29_A')
                                   .apply(preprocess_multiple_choices, axis='columns'))

### Question 29-B ("Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years?")

df['big_data_products_b'] = (raw_df.filter(regex='Q29_B')
                                   .apply(preprocess_multiple_choices, axis='columns'))

### Question 30 ("Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?")

# Single-choice questions (30, 32) become categoricals; the multiple-choice A/B variants
# (31, 33, 34, 35) become one tuple of selected options per respondent.
df['big_data_product'] = raw_df['Q30'].astype('category')

### Question 31-A ("Which of the following business intelligence tools do you use on a regular basis?")

df['bi_tools_a'] = (raw_df.filter(regex='Q31_A')
                          .apply(preprocess_multiple_choices, axis='columns'))

### Question 31-B ("Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years?")

df['bi_tools_b'] = (raw_df.filter(regex='Q31_B')
                          .apply(preprocess_multiple_choices, axis='columns'))

### Question 32 ("Which of the following business intelligence tools do you use most often?")

df['bi_tool'] = raw_df['Q32'].astype('category')

### Question 33-A ("Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?")

df['automl_tools_categories_a'] = (raw_df.filter(regex='Q33_A')
                                         .apply(preprocess_multiple_choices, axis='columns'))

### Question 33-B ("Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?")

df['automl_tools_categories_b'] = (raw_df.filter(regex='Q33_B')
                                         .apply(preprocess_multiple_choices, axis='columns'))

### Question 34-A ("Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?")

df['automl_tools_a'] = (raw_df.filter(regex='Q34_A')
                              .apply(preprocess_multiple_choices, axis='columns'))

### Question 34-B ("Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?")

df['automl_tools_b'] = (raw_df.filter(regex='Q34_B')
                              .apply(preprocess_multiple_choices, axis='columns'))

### Question 35-A ("Do you use any tools to help manage machine learning experiments?")

df['ml_experiments_tools_a'] = (raw_df.filter(regex='Q35_A')
                                      .apply(preprocess_multiple_choices, axis='columns'))

### Question 35-B ("In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments?")

df['ml_experiments_tools_b'] = (raw_df.filter(regex='Q35_B')
                                      .apply(preprocess_multiple_choices, axis='columns'))

### Question 36 ("Where do you publicly share or deploy your data analysis or machine learning applications?")

# Multiple-choice questions (36, 37, 39) become one tuple of selected options per
# respondent; the single-choice question 38 becomes a categorical.
df['ml_sharing_tools'] = (raw_df.filter(regex='Q36')
                                .apply(preprocess_multiple_choices, axis='columns'))

### Question 37 ("On which platforms have you begun or completed data science courses?")

df['course_platforms'] = (raw_df.filter(regex='Q37')
                                .apply(preprocess_multiple_choices, axis='columns'))

### Question 38 ("What is the primary tool that you use at work or school to analyze data?")

df['primary_analytics_tool'] = raw_df['Q38'].astype('category')

### Question 39 ("Who/what are your favorite media sources that report on data science topics?")

df['media_sources'] = (raw_df.filter(regex='Q39')
                             .apply(preprocess_multiple_choices, axis='columns'))

## Review data

In [None]:
# Show the first respondent transposed, so each preprocessed column appears as a row.
df.head(1).T

In [None]:
# Display the question id / question text lookup table.
questions_df

## Explore survey flow

In [None]:
def sliding_window(arr, window_size):
    """Return every contiguous slice of ``arr`` that is ``window_size`` long.

    The slices are returned in order as a tuple; the tuple is empty when
    ``arr`` is shorter than ``window_size``.
    """
    last_start = len(arr) - window_size
    windows = []
    for start in range(last_start + 1):
        windows.append(arr[start:start + window_size])
    return tuple(windows)

def calculate_flow_stats(df, flow_windows, flow_transition_name, 
                         top_n=5, line_width=500, length_limit=500):
    """Summarise the source-question answers of respondents who made a transition.

    Parameters
    ----------
    df : pandas.DataFrame
        Answers, one column per question.
    flow_windows : pandas.Series
        Per respondent, the collection of transitions found in their flow.
    flow_transition_name : tuple
        The transition to look up; its first element names the source column in ``df``.
    top_n : int
        Number of most frequent answers to keep.
    line_width : int
        Width at which answer labels are wrapped.
    length_limit : int
        Maximum length of the returned text.

    Returns
    -------
    str
        Answer/proportion rows joined with ``<br>``, or a fixed placeholder
        message when that text is longer than ``length_limit``.
    """
    source_question = flow_transition_name[0]

    # Respondents whose flow contains the requested transition.
    has_transition = flow_windows.apply(lambda windows: flow_transition_name in windows)

    answer_shares = df.loc[has_transition, source_question].value_counts(normalize=True, dropna=False)
    answer_shares = answer_shares.iloc[:top_n]

    # Turn the answers into display labels: text, trimmed, wrapped, with HTML line breaks.
    answer_shares.index = (answer_shares.index
                           .map(str)
                           .str.strip()
                           .str.wrap(line_width, expand_tabs=True)
                           .str.replace('\n', '<br>'))

    shares_table = answer_shares.reset_index()
    shares_table.columns = [source_question, 'prop']

    stats_text = shares_table.to_string(index=False, header=False).replace('\n', '<br>')
    if len(stats_text) > length_limit:
        return 'Stats text is too long'
    return stats_text

In [None]:
# Drop the timing column and treat an empty multiple-choice tuple as "not answered".
# Columns are cast to object and mapped one by one; this reproduces what the
# element-wise `DataFrame.applymap` did without relying on that deprecated method.
survey_df = (df.drop('duration', axis=1)
               .astype(object)
               .apply(lambda col: col.map(lambda x: np.nan if x == () else x)))

# Same shape as `survey_df`: a cell holds its own column name when the question was
# answered and '' otherwise. The frame is built directly from the string array instead
# of writing strings into an int8 frame, which newer pandas rejects as an
# incompatible-dtype assignment.
survey_mask_df = pd.DataFrame(
    np.where(survey_df.notnull().to_numpy(), survey_df.columns, ''),
    index=survey_df.index,
    columns=survey_df.columns,
)

# Position <-> column-name lookups; the positions are used as Sankey node indices.
idx_to_question_map = dict(zip(range(len(survey_df.columns)), survey_df.columns))
question_to_idx_map = dict(zip(survey_df.columns, range(len(survey_df.columns))))

# Per respondent: the answered questions, in column order. (The former
# `.replace(idx_to_question_map)` was dropped: the elements are tuples of column
# names, so no element ever equalled one of the integer keys.)
flow = pd.Series(
    [tuple(name for name in row if len(name) > 0) for row in survey_mask_df.to_numpy()],
    index=survey_mask_df.index,
)

# Consecutive (source, target) pairs per respondent, computed once and reused.
flow_windows = flow.apply(sliding_window, window_size=2)
flow_transitions = flow_windows.explode()

# One row per distinct transition together with how many respondents made it.
flow_df = flow_transitions.value_counts().reset_index()
flow_df.columns = ['transition', 'count']
flow_df['source'] = [question_to_idx_map[source] for source, _ in flow_df['transition']]
flow_df['target'] = [question_to_idx_map[target] for _, target in flow_df['transition']]
flow_df['link_stats'] = flow_df['transition'].apply(lambda x: calculate_flow_stats(survey_df, flow_windows, flow_transition_name=x))

In [None]:
line_width = 30

# Node label: question id followed by the column name. The two sequences are paired
# by position, so they must have the same length and order.
labels = list(questions_df['question_id'] + ': ' + survey_mask_df.columns)

# Full question text for the node hover box, wrapped and with HTML line breaks.
node_questions = (questions_df['question_text']
                  .str.wrap(line_width)
                  .str.replace('\n', '<br>'))

link = {
    'source': flow_df['source'],
    'target': flow_df['target'],
    'value': flow_df['count'],
    'customdata': flow_df['link_stats'],
    'hovertemplate': '<b>Source:</b> %{source.label}<br><br>%{customdata}<br><br><b>Target:</b> %{target.label}',
}
node = {
    'label': labels,
    'customdata': node_questions,
    'hovertemplate': '%{customdata}',
}

data = go.Sankey(link=link, node=node, orientation='v', valueformat='d')

fig = go.Figure(data, layout=dict(height=3000, width=800))
fig.update_layout(
    title=dict(text='Survey flow', x=0.5),
    font={'size': 12},
    hoverlabel=dict(align='right', namelength=-1),
)
fig.show()