# Import and Data¶

In [None]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.display import display, HTML

There are 4 different files with data for this competition.

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

Kaggle’s State of Machine Learning and Data Science 2019 survey consists of **34** different questions that fully describe the area.

I conditionally divided the questions into **3 types**: 
- Simple as Q1, Q3, Q4, etc.
- Simple with Selected Choice as Q2, Q5, etc.
- Multiple Choice (Select all that apply) as Q9, Q12, Q13, etc.

You can find a full list of questions in `/kaggle/input/kaggle-survey-2019/questions_only.csv` 

In [None]:
# Load the question file and render row 0 (skipping its first column) as a numbered HTML list.
question_df = pd.read_csv('/kaggle/input/kaggle-survey-2019/questions_only.csv')
questions = ''.join(f'<li>{text}</li>' for text in question_df.iloc[0, 1:])
display(HTML(f'<h3 style="color:green">Question List:</h3><ol>{questions}</ol>'))

Almost all answers are located in `/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv`

In [None]:
# Path of the CSV file that holds the survey responses loaded into `df` below.
multiple_choice_responses_file = '/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv'

In [None]:
# Line index 1 of the file (the first line after the header) is skipped on load.
# NOTE(review): presumably that line holds the question texts rather than answers - confirm.
df = pd.read_csv(multiple_choice_responses_file, skiprows=[1])
df.head()

There are 19717 rows and **246** columns. But are all columns correct?

After additional checking, I realized that all columns with the pattern `TEXT` in their names contain data that is not suitable for analysis. That is **28** columns.

There are **15** columns that describe simple questions and **202** columns for multiple choice questions. Lists of columns below.

In [None]:
# (number of rows, number of columns) of the loaded responses.
df.shape

In [None]:
# Collect every column whose name contains 'TEXT', then report how many there are and which.
columns_text = [name for name in df.columns if 'TEXT' in name]
print(len(columns_text))
print(columns_text)

In [None]:
# Multiple-choice answer columns are those whose name ends in 'Part_<n>' (one or two digits).
# The pattern is a raw string so that '\d' reaches the regex engine instead of being
# parsed as an invalid string escape (which raises a DeprecationWarning/SyntaxWarning).
columns_multiple = [col for col in df.columns if re.search(r'Part_\d{1,2}$', col)]

In [None]:
# Each row is [full column name, *name pieces split on '_']; piece 1 is the question id.
multiple_columns_list = [[name, *name.split('_')] for name in columns_multiple]
# Series indexed by question id whose values are the lists of that question's column names.
ds_multiple = (
    pd.DataFrame(multiple_columns_list)
    .groupby([1])[0]
    .apply(list)
)

In [None]:
# Sort the multiple-choice question ids by their numeric part (so Q9 comes before Q12),
# then render them as a numbered HTML list.
question_numbers_list = sorted(int(label.split('Q')[1]) for label in ds_multiple.index)
question_list = [f'Q{number}' for number in question_numbers_list]
questions = ''.join(f'<li>{label}</li>' for label in question_list)
display(HTML(f'<h3 style="color:green">Multiple Choice Question</h3><ol>{questions}</ol>'))

In [None]:
# Simple-question columns are those whose name ends in 'Q<n>' (one or two digits).
# The pattern is a raw string so that '\d' reaches the regex engine instead of being
# parsed as an invalid string escape (which raises a DeprecationWarning/SyntaxWarning).
columns_simple = [col for col in df.columns if re.search(r'Q\d{1,2}$', col)]
questions = ''.join(f'<li>{col}</li>' for col in columns_simple)
display(HTML(f'<h3 style="color:green">Simple Question</h3><ol>{questions}</ol>'))

# Part1. Respondents Distribution for Simple Questions¶

In [None]:
# Take an independent copy of the simple-question columns: df_simple is modified later
# (the Q3 country labels are replaced), and assigning into a plain slice of df
# triggers pandas' SettingWithCopyWarning.
df_simple = df[columns_simple].copy()

In [None]:
def countplot_top(data, col, xlabel, ylabel, fs, top, title=''):
    """Draw a horizontal count plot of the most frequent values of ``data[col]``.

    A new 10x10 figure is created. Bars are ordered by descending frequency,
    each bar is annotated with its count, and the y tick labels are either
    left-padded to 50 characters or broken after every 50th character.

    Parameters
    ----------
    data : object indexable by ``col`` whose result supports ``value_counts()``
        (a pandas DataFrame in this notebook).
    col : column of ``data`` to count.
    xlabel, ylabel : axis label texts.
    fs : font size used for the title and both axis labels.
    top : maximum number of categories (most frequent first) to show.
    title : optional plot title.

    Returns
    -------
    None. The plot is drawn on the current matplotlib figure.
    """
    # Count once and reuse the result for both the bar order and the bar annotations.
    top_counts = data[col].value_counts().iloc[:top]

    plt.figure(figsize=(10, 10))

    ax = sns.countplot(y=col, data=data, order=top_counts.index, color='green')
    plt.title(title, fontsize=fs)
    plt.xlabel(xlabel, fontsize=fs)
    plt.ylabel(ylabel, fontsize=fs)
    plt.grid(axis='x', linestyle='-.')

    sns.despine()

    # Write each count just right of its bar, vertically centred on the bar.
    for patch, value in zip(ax.patches, top_counts):
        ax.text(patch.get_width() + 100, patch.get_y() + (patch.get_bbox().y1 - patch.get_y()) / 2,
                value,
                ha="left", va='center',
                fontsize=18)

    new_ytickslabel = []
    for tick_label in ax.get_yticklabels():
        text = tick_label.get_text()
        if len(text) > 50:
            # Long label: insert a line break after every 50th character.
            label = ''.join(ch + '\n' * (n % 50 == 49) for n, ch in enumerate(text))
        else:
            # Short label: left-pad with spaces to a width of 50 characters.
            label = ' ' * (50 - len(text)) + text
        new_ytickslabel.append(label)

    # Pin the tick positions before relabelling; matplotlib documents that
    # set_yticklabels should only be used after the ticks have been fixed.
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels(new_ytickslabel)
    ax.tick_params(axis='both', which='major', labelsize=18)


In [None]:
# Shared settings passed to every countplot_top call below.
xlabel='# of Respondents'  # x-axis label
ylabel=''  # y-axis label left empty
fntsz=20  # passed as `fs` (title / axis label font size)
top=15  # maximum number of categories shown per plot

In [None]:
# Q1: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q1'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q2: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q2'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Shorter display names for two long country labels in Q3.
country_dict = {'United States of America':'USA', 
                'United Kingdom of Great Britain and Northern Ireland':'UK'}
# Build a new frame instead of assigning into a column of df_simple, which may be a
# slice of df; the in-place assignment triggers pandas' SettingWithCopyWarning.
df_simple = df_simple.assign(Q3=df_simple['Q3'].replace(country_dict))

In [None]:
# Q3: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q3'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q4: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q4'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q5: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q5'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q6: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q6'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q7: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q7'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q8: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q8'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q10: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q10'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q11: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q11'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q14: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q14'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q15: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q15'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q19: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q19'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q22: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q22'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

In [None]:
# Q23: print the first row of question_df for this column, then plot its most frequent answers.
column = 'Q23'
print(question_df.loc[0, column])
countplot_top(
    data=df_simple,
    col=column,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
    top=top,
)

# Part2. Respondents Distribution for Multiple Choice Question¶


In [None]:
def get_multiple_question_data(question, ds_multiple, data):
    """Count how often each answer of a multiple-choice question was selected.

    Parameters
    ----------
    question : key into ``ds_multiple`` (e.g. ``'Q9'``).
    ds_multiple : mapping/Series from question id to the list of column names
        that hold that question's answers.
    data : pandas DataFrame containing those columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by answer label, with a single column ``0`` holding the number of
        non-null occurrences, sorted in descending order. If none of the columns
        contains a value, an empty frame with column ``0`` is returned.
    """
    answer_counts = {}
    for col in ds_multiple[question]:
        for answer, count in data[col].value_counts().items():
            # Sum rather than overwrite, so a label that appears in more than one
            # part column keeps all of its occurrences.
            answer_counts[answer] = answer_counts.get(answer, 0) + int(count)

    if not answer_counts:
        # from_dict({}) yields a frame without column 0, so sort_values(0) would raise KeyError.
        return pd.DataFrame({0: pd.Series(dtype='int64')})

    return pd.DataFrame.from_dict(answer_counts, orient='index').sort_values(0, ascending=False)

In [None]:
def barplot_top(data, xlabel, ylabel, fs, title=''):
    """Draw a horizontal bar plot of the counts stored in column ``0`` of ``data``.

    A new 10x10 figure is created with one bar per index entry of ``data``.
    Each bar is annotated with its value, and the y tick labels are either
    left-padded to 50 characters or broken after every 50th character.

    Parameters
    ----------
    data : frame-like object whose column ``0`` holds the bar lengths and whose
        index holds the bar labels.
    xlabel, ylabel : axis label texts.
    fs : font size used for both axis labels (the title uses a fixed size of 22).
    title : optional plot title; a newline is always appended to it.

    Returns
    -------
    None. The plot is drawn on the current matplotlib figure.
    """
    plt.figure(figsize=(10, 10))

    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    ax = sns.barplot(x=data[0], y=data.index, color='green')
    plt.title('{}\n'.format(title), fontsize=22, color='blue')
    plt.xlabel(xlabel, fontsize=fs)
    plt.ylabel(ylabel, fontsize=fs)
    plt.grid(axis='x', linestyle='-.')
    sns.despine()

    # Write each value just right of its bar, vertically centred on the bar.
    for patch, value in zip(ax.patches, data[0]):
        ax.text(patch.get_width() + 100, patch.get_y() + (patch.get_bbox().y1 - patch.get_y()) / 2,
                value,
                ha="left", va='center',
                fontsize=18)

    new_ytickslabel = []
    for tick_label in ax.get_yticklabels():
        text = tick_label.get_text()
        if len(text) > 50:
            # Long label: insert a line break after every 50th character.
            label = ''.join(ch + '\n' * (n % 50 == 49) for n, ch in enumerate(text))
        else:
            # Short label: left-pad with spaces to a width of 50 characters.
            label = ' ' * (50 - len(text)) + text
        new_ytickslabel.append(label)

    # Pin the tick positions before relabelling; matplotlib documents that
    # set_yticklabels should only be used after the ticks have been fixed.
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels(new_ytickslabel)
    ax.tick_params(axis='both', which='major', labelsize=18)


In [None]:
# Shared settings passed to every barplot_top call below.
xlabel='# of Respondents'  # x-axis label
ylabel=''  # y-axis label left empty
fntsz=20  # passed as `fs` (axis label font size)

In [None]:
# question_numbers_list = sorted([int(i.split('Q')[1]) for i in list(ds_multiple.index)])
# for question in question_numbers_list:
#     question = 'Q{}'.format(question)
#     data=get_multiple_question_data(question, ds_multiple, df)
#     barplot_top(data=data, xlabel=xlabel, ylabel=ylabel, fs=fntsz, title=question_df.T.loc[question][0])

In [None]:
# Q9: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q9'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q12: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q12'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q13: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q13'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q16: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q16'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q17: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q17'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q18: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q18'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q20: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q20'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q21: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q21'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q24: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q24'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q25: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q25'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q26: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q26'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q27: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q27'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q28: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q28'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q29: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q29'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q30: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q30'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q31: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q31'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q32: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q32'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q33: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q33'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

In [None]:
# Q34: print the question text without its last 42 characters, then plot the answer counts.
question = 'Q34'
print(question_df.loc[0, question][:-42])
data = get_multiple_question_data(question, ds_multiple, df)
barplot_top(
    data=data,
    xlabel=xlabel,
    ylabel=ylabel,
    fs=fntsz,
)

# Conclusion
To be continued...