In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

plt.rcParams['axes.unicode_minus'] = False
sns.set(font_scale = 1)  
plt.style.use(['fivethirtyeight'])
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

print("Let's start!")

In [None]:
raw = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')

display(raw.shape, raw.head(3))

<br>

* The first column, `Time from Start to Finish (seconds)`, shows how long, in seconds, the respondent took to complete the whole survey.


* From Q1 to Q6: We can easily sort and count, eventually use the statistics for each question to visualize them.


* The problem starts at Q7: from there on, each question consists of several sub-parts (one column each), usually more than five.


* Therefore we need to come up with a way to use this data for the visualization.

<br>

## Question and Answer Split

In [None]:
question = raw.iloc[0]

question

In [None]:
# type(question)
# question.values

In [None]:
# raw.drop?

In [None]:
answer = raw.drop([0])   # we can drop the specific row using the index number

answer.head(3)

In [None]:
answer.info()

<br>

* Every column in the answer data has the `object` dtype.


* We will use the headers as the key to connect the question object and the answer object.


* Let's dive into the survey analysis from now on!

<br>

<br>

## Q1. What is your age(# years)

<br>

In [None]:
question['Q1']

In [None]:
answer['Q1'].value_counts(normalize = True) * 100

* Let me adjust the above code to show the table sorted by age range in ascending order.

In [None]:
Q1 = answer['Q1'].value_counts().sort_index()

Q1

In [None]:
plt.figure(figsize = (8, 4))
ax = sns.countplot(data = answer.sort_values('Q1'),
                   x = 'Q1',
                   palette = 'Blues_r')

ax.bar_label(ax.containers[0])
ax.set_ylim(0, 4500)
ax.set_title(question['Q1'], pad = 20)

plt.show()

<br>

## Q2. What is your gender? - selected choice

<br>

In [None]:
question['Q2']

In [None]:
Q2 = answer['Q2'].value_counts()

Q2

In [None]:
plt.figure(figsize = (6, 4))
ax = sns.countplot(data = answer,
                   x = 'Q2',
                   palette = 'Blues_r')

ax.bar_label(ax.containers[0])
ax.set_ylim(0, 18000)
ax.set_title(question['Q2'])

plt.xticks(rotation = 45)
plt.show()

In [None]:
plt.figure(figsize = (6, 4))
ax = sns.countplot(data = answer,
                   y = 'Q2',
                   palette = 'Blues_r')

ax.bar_label(ax.containers[0])
# ax.set_ylim(0, 17000)
ax.set_title(question['Q2'], pad = 20)

plt.show()

<br>

## Q3. In which country do you currently reside?

<br>

In [None]:
answer.Q3.nunique()

In [None]:
plt.figure(figsize = (10, 15))

ax = sns.countplot(data = answer,
                  y = 'Q3',
                  palette = 'Blues_r',
                  order = answer.Q3.value_counts().index)

ax.bar_label(ax.containers[0])
ax.set_title(question['Q3'], pad = 20)

plt.show()

<br>

## Q4. What is the highest level of formal education that you have attained or plan to attain within the next 2 years ?

<br>

In [None]:
question.Q4

In [None]:
plt.figure(figsize = (6, 4))

ax = sns.countplot(data = answer,
                  y = 'Q4',
                  palette = 'Blues_r',
                  order = answer.Q4.value_counts().index)

ax.bar_label(ax.containers[0])
ax.set_title(question['Q4'], pad = 20)

plt.show()

<br>


We can see that questions no.1 ~ no.6 all share the same countplot format, because each of these questions allows only one answer per respondent.


So why not define a function to create the visualizations for Q1 ~ Q6?


<br>

## Handling the single choice columns by customized function

In [None]:
# define the function

def show_countplot_by_qno(qno, fsize = (6, 4), order = None):
    """
    Draw a horizontal count plot for a single-choice survey question.

    Reads the notebook-level `answer` (responses) and `question`
    (question texts) objects and shows the figure with plt.show().

    qno: question number of the survey, i.e. a column of `answer`, ex) "Q2"
    fsize: the argument of plt.figure(figsize = ), ex) (5, 5)
    order: optional sequence of answer labels (list, pandas Index, ...) giving
           the order of the bars. When it is None or empty, the default is
           answer[qno].value_counts().index.
    """
    # `not order` raises ValueError ("truth value ... is ambiguous") for a
    # pandas Index or NumPy array, so check for None / emptiness explicitly.
    # Objects without a length are passed through unchanged, as before.
    if order is None or (hasattr(order, '__len__') and len(order) == 0):
        order = answer[qno].value_counts().index

    plt.figure(figsize = fsize)
    ax = sns.countplot(data = answer,
                       y = qno,
                       palette = 'Blues_r',
                       order = order)

    # Annotate every bar with its count and use the question text as the title.
    ax.bar_label(ax.containers[0])
    ax.set_title(question[qno], pad = 20)

    plt.show()

<br>

This seems like a better and more efficient way to create the visualization for each question.

Thanks to [corazzon](https://www.kaggle.com/corazzon)

<br>

In [None]:
show_countplot_by_qno('Q4')

<br>


## Q5. Select the title most similar to your current role (or most recent title if retired)


<br>

In [None]:
question.Q5

In [None]:
# used the function defined above

show_countplot_by_qno('Q5')

<br>


## Q6. For how many years have you been writing code and/or programming?


<br>

In [None]:
question.Q6

In [None]:
# Explicit ascending order of programming experience.
# Sorting the labels alphabetically (value_counts().sort_index()) does not give
# this order: '10-20 years' would come before '3-5 years' and '< 1 years'
# would land near the end.
# NOTE(review): labels are assumed to match answer.Q6.unique() - confirm.
order_q6 = ['I have never written code',
            '< 1 years',
            '1-3 years',
            '3-5 years',
            '5-10 years',
            '10-20 years',
            '20+ years']

In [None]:
# used the function defined above
# I'd like to change the order as per the ascending years of the programming experience

show_countplot_by_qno('Q6',
                     order = order_q6)

<br>


## Handling the multiple choice columns


<br>

<br>

Now we face another problem.

From Q7 onward, each question is split into sub-parts (one column per option), and each cell holds either the option text (str) or NaN.

How can we handle this for the visualization?

In this kernel I introduce a simple but powerful pandas method, `filter`.


<br>

<br>


## Q7. What programming languages do you use on a regular basis?


<br>

<br>

Now we face another problem.

From Q7 onward, each question is split into sub-parts (one column per option), and each cell holds either the option text (str) or NaN.

How can we handle this for the visualization?

In this kernel I introduce a simple but powerful pandas method, `filter`.


<br>

In [None]:
# Take the first Q7 sub-column's question text and keep the part before the
# first "-". Use .iloc for the positional lookup: `series[0]` on a
# string-labelled Series relies on a deprecated positional fallback.
question7 = question.filter(regex = 'Q7').iloc[0].split("-")[0]
question7

In [None]:
# regex : the abbreviation of regular expression

answer7 = answer.filter(regex = 'Q7')

answer7.head(3)

<br>

If, for example, we count the number of "Python" entries in the first column Q7_Part_1, excluding NaN, we get the number of respondents who use Python.


Simple way to do so? Yeah, just look at the below.

<br>

In [None]:
answer7.notnull().sum()

<br>

Hmm, I think there might be another way to get these counts instead of using the .notnull().sum() method.

What about .describe() ?

<br>

In [None]:
answer7_desc = answer7.describe()
answer7_desc

In [None]:
# .transpose() and .T are interchangeable; the long form is used here so the
# whole reshaping reads as a single chain.
# Keep only the 'top' and 'count' rows of the describe() table, turn each Q7
# sub-column into a row, label the rows by 'top', and sort by 'count' (largest first).
answer7 = (
    answer7_desc
    .loc[['top', 'count']]
    .transpose()
    .set_index('top')
    .sort_values(by = 'count', ascending = False)
)

answer7

In [None]:
ax = sns.barplot(data = answer7,
                 x='count',
                 y = answer7.index,
                 palette = 'Blues_r')

ax.bar_label(ax.containers[0])
ax.set_title(question7, pad = 20)

plt.show()

<br>

We can handle "Select all that apply" questions like Q7 by creating the function below.

<br>

In [None]:
# "get_title" will be used in generating the below function.

def get_title(qno):
    """
    Return the question text for `qno`, to be used as a plot title.

    qno: question number of the survey, ex) "Q8" or "Q27_A"

    If `qno` is itself an entry of `question`, its text is returned as is.
    Otherwise the first entry whose name is `qno` followed by "_" is used, and
    only the text before the first "-" is returned.

    Raises KeyError when no entry of `question` belongs to `qno`.
    """
    if qno in question.index:
        return question[qno]

    # Anchor the pattern so that a short prefix such as "Q3" cannot pick up
    # the sub-columns of Q30, Q31, ... by accident.
    matches = question.filter(regex = '^(?:' + qno + ')(?:_|$)')
    if matches.empty:
        raise KeyError("No question found for {!r}".format(qno))

    # Positional access via .iloc: `series[0]` on a string-labelled Series
    # relies on a deprecated positional fallback.
    title = matches.iloc[0].split('-')[0]
    return title

In [None]:
# define the function

def show_barplot_by_qno(qno, fsize = (6, 4)):
    """
    Draw a horizontal bar plot for a multiple-choice ("select all that apply")
    survey question, one bar per option, sorted by the number of responses.

    Reads the notebook-level `answer` object, takes the title from
    get_title(qno) and shows the figure with plt.show().

    qno: question number of the survey, ex) "Q7" or "Q27_A"
    fsize: the argument of plt.figure(figsize = ), ex) (5, 5)

    Raises ValueError when no column of `answer` belongs to `qno`.
    """
    # Anchor the pattern so that e.g. "Q3" selects only Q3 / Q3_* and not
    # Q30_*, Q31_*, ...
    df = answer.filter(regex = '^(?:' + qno + ')(?:_|$)')
    if df.shape[1] == 0:
        raise ValueError("No answer columns found for question {!r}".format(qno))

    # For text columns describe() reports, per column, the most frequent value
    # ('top') and the number of non-null entries ('count').
    df = df.describe()
    df = df.loc[['top', 'count']].T
    # An option nobody selected has no label ('top' is NaN); drop it rather
    # than carrying a NaN index entry into the plot.
    df = df.dropna(subset = ['top']).set_index('top')
    # 'count' has object dtype after the transpose; make it numeric again.
    df = df.astype({'count': int})
    df = df.sort_values(by = 'count', ascending = False)

    plt.figure(figsize = fsize)
    # NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of
    # `errorbar = None`; kept unchanged for the seaborn version this notebook
    # was written against - confirm before upgrading.
    ax = sns.barplot(data = df,
                     x = 'count',
                     y = df.index,
                     palette = 'Blues_r',
                     ci = None)

    ax.bar_label(ax.containers[0])
    ax.set_title(get_title(qno), pad = 20)

    plt.show()

In [None]:
# Let's have an experiment using Q7.

show_barplot_by_qno('Q7')

<br>

Recall that we have two kinds of functions to visualize the data.

- show_countplot_by_qno: for the single choice question(ex. Q1)

- show_barplot_by_qno: for the multiple choice question(ex. Q7)


<br>

<br>


## Q8. What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice


<br>

In [None]:
get_title('Q8')

<br>

After defining `get_title`, it is easy to check what kind of question we are dealing with.

<br>

In [None]:
show_countplot_by_qno('Q8')

<br>


## Q9. Which of the following integrated development environments (IDE's) do you use on a regular basis?


<br>

In [None]:
get_title('Q9')

In [None]:
show_barplot_by_qno('Q9')

<br>


## Q10. Which of the following hosted notebook products do you use on a regular basis?


<br>

In [None]:
get_title('Q10')

In [None]:
show_barplot_by_qno('Q10')

<br>


## Q11. What type of computing platform do you use most often for your data science projects?


<br>

In [None]:
get_title('Q11')

In [None]:
show_countplot_by_qno('Q11')

<br>


## Q12. Which types of specialized hardware do you use on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q12')

In [None]:
show_barplot_by_qno('Q12')

<br>


## Q13. Approximately how many times have you used a TPU (tensor processing unit)?


<br>

In [None]:
get_title('Q13')

In [None]:
Q13_order = ['Never', 'Once', '2-5 times', '6-25 times', 'More than 25 times']

In [None]:
show_countplot_by_qno('Q13', order = Q13_order)

<br>


## Q14. What data visualization libraries or tools do you use on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q14')

In [None]:
show_barplot_by_qno('Q14')

<br>


## Q15. For how many years have you used machine learning methods?


<br>

In [None]:
get_title('Q15')

In [None]:
# Ascending order of machine-learning experience.
# The last bracket was '10 or more years', which overlaps '10-20 years' and
# looks like a typo for '20 or more years'; a label that is not in the data
# gets an empty bar while the real bracket is left out of the plot.
# NOTE(review): labels are assumed to match answer.Q15.unique() - confirm.
Q15_order = ['I do not use machine learning methods',
             'Under 1 year',
             '1-2 years',
             '2-3 years',
             '3-4 years',
             '4-5 years',
             '5-10 years',
             '10-20 years',
             '20 or more years'
            ]

In [None]:
show_countplot_by_qno('Q15', order = Q15_order)

<br>


## Q16. Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q16')

In [None]:
show_barplot_by_qno('Q16')

<br>


## Q17. Which of the following ML algorithms do you use on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q17')

In [None]:
show_barplot_by_qno('Q17')

<br>


## Q18. Which categories of computer vision methods do you use on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q18')

In [None]:
show_barplot_by_qno('Q18')

<br>


## Q19. Which of the following natural language processing (NLP) methods do you use on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q19')

In [None]:
show_barplot_by_qno('Q19')

<br>


## Q20. In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice

Note: Q20 has changed. It used to be "What is the size of the company where you are employed?"; in the 2021 survey that question is Q21 (see below).


<br>

In [None]:
get_title('Q20')

In [None]:
show_countplot_by_qno('Q20')

<br>


## Q21. What is the size of the company where you are employed?


<br>

In [None]:
get_title('Q21')

In [None]:
Q21_order = ['0-49 employees', '50-249 employees', '250-999 employees',
            '1000-9,999 employees', '10,000 or more employees']

In [None]:
show_countplot_by_qno('Q21', order=Q21_order)

<br>


## Q22. Approximately how many individuals are responsible for data science workloads at your place of business?


<br>

In [None]:
get_title('Q22')

In [None]:
Q22_order = ['0', '1-2', '3-4', '5-9', '10-14', '15-19', '20+']

In [None]:
show_countplot_by_qno('Q22', order = Q22_order)

<br>


## Q23. Does your current employer incorporate machine learning methods into their business?


<br>

In [None]:
get_title('Q23')

In [None]:
show_countplot_by_qno('Q23')

<br>


## Q24. Select any activities that make up an important part of your role at work: (Select all that apply)


<br>

In [None]:
get_title('Q24')

In [None]:
show_barplot_by_qno('Q24')

<br>


## Q25. What is your current yearly compensation (approximate $USD)?


<br>

In [None]:
get_title('Q25')

In [None]:
answer.Q25.value_counts().sort_index().index

In [None]:
# Ascending order of yearly compensation brackets.
# The top brackets previously read '300,000-500,000' and '> $500,000'; they
# are replaced here with the 2021 survey's top-bracket labels. A label that is
# not in the data gets an empty bar while the real bracket is left out of the plot.
# NOTE(review): compare against the index printed by the previous cell
# (answer.Q25.value_counts().sort_index().index) and adjust if any label differs.
Q25_order = ['$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
             '4,000-4,999', '5,000-7,499', '7,500-9,999',
             '10,000-14,999', '15,000-19,999', '20,000-24,999',
             '25,000-29,999', '30,000-39,999', '40,000-49,999',
             '50,000-59,999', '60,000-69,999', '70,000-79,999',
             '80,000-89,999', '90,000-99,999', '100,000-124,999',
             '125,000-149,999', '150,000-199,999',
             '200,000-249,999', '250,000-299,999', '300,000-499,999',
             '$500,000-999,999', '>$1,000,000']

In [None]:
show_countplot_by_qno('Q25', fsize = (6, 10), order = Q25_order)

<br>


## Q26. Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?


<br>

In [None]:
get_title('Q26')

In [None]:
show_countplot_by_qno('Q26', fsize=(15, 6))

<br>


## Q27. Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q27')

In [None]:
show_barplot_by_qno('Q27_A')

In [None]:
show_barplot_by_qno('Q27_B')

<br>


## Q28. Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? - Selected Choice


<br>

In [None]:
get_title('Q28')

In [None]:
show_countplot_by_qno('Q28', fsize = (8, 6))

<br>


## Q29. Do you use any of the following cloud computing products on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q29')

In [None]:
show_barplot_by_qno('Q29_A')

In [None]:
show_barplot_by_qno('Q29_B')

<br>


## Q30. Do you use any of the following machine learning products on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q30')

In [None]:
show_barplot_by_qno('Q30_A')

<br>


## Q31. Do you use any of the following managed machine learning products on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q31')

In [None]:
show_barplot_by_qno('Q31_A')

In [None]:
show_barplot_by_qno('Q31_B', fsize=(8,6))

<br>


## Q32. Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q32')

In [None]:
show_barplot_by_qno('Q32_A', fsize=(10, 8))

In [None]:
show_barplot_by_qno('Q32_B', fsize=(10, 8))

<br>


## Q33. Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice


<br>

In [None]:
get_title('Q33')

In [None]:
show_countplot_by_qno('Q33', fsize=(10, 8))

<br>


## Q34. Which of the following business intelligence tools do you use on a regular basis? (Select all that apply)


<br>

In [None]:
get_title('Q34')

In [None]:
show_barplot_by_qno('Q34_A')

In [None]:
show_barplot_by_qno('Q34_B', fsize=(8,6))

<br>


## Q35. Which of the following business intelligence tools do you use most often? - Selected Choice


<br>

In [None]:
get_title('Q35')

In [None]:
show_countplot_by_qno('Q35')

<br>


## Q36. Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q36')

In [None]:
show_barplot_by_qno('Q36_A')

In [None]:
show_barplot_by_qno('Q36_B', fsize=(8, 6))

<br>


## Q37. Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?  (Select all that apply)


<br>

In [None]:
get_title('Q37')

In [None]:
show_barplot_by_qno('Q37_A')

In [None]:
show_barplot_by_qno('Q37_B', fsize=(8, 6))

<br>


## Q38. Do you use any tools to help manage machine learning experiments? (Select all that apply)


<br>

In [None]:
get_title('Q38')

In [None]:
show_barplot_by_qno('Q38_A')

In [None]:
show_barplot_by_qno('Q38_B', fsize=(8,6))

<br>


## Q39. Where do you publicly share or deploy your data analysis or machine learning applications? (Select all that apply)


<br>

In [None]:
get_title('Q39')

In [None]:
show_barplot_by_qno('Q39')

<br>


## Q40. On which platforms have you begun or completed data science courses? (Select all that apply)


<br>

In [None]:
get_title('Q40')

In [None]:
show_barplot_by_qno('Q40')

<br>


## Q41. What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice


<br>

In [None]:
get_title('Q41')

In [None]:
show_countplot_by_qno('Q41')

<br>


## Q42. Who/what are your favorite media sources that report on data science topics? (Select all that apply)


<br>

In [None]:
get_title('Q42')

In [None]:
show_barplot_by_qno('Q42', fsize=(8, 6))