## Load Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Load the raw survey export. `feedback_data` keeps every row (its first row is
# read separately further down), while `feedback_data_filter` skips that first
# row and is the frame used for all aggregations.
feedback_data = pd.read_csv(
    "../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
)
feedback_data_filter = feedback_data.iloc[1:]

In [None]:
# Preview the first rows of the unfiltered frame, including row 0, which the
# following cell converts into a column -> text lookup.
feedback_data.head()

In [None]:
# Lookup from column id to the text stored in the first row of the raw frame
# (presumably the full question wording -- confirm against the CSV).
queryid_map = feedback_data.iloc[0].to_dict()

## Small statistics on given dataset

In [None]:
# Summary statistics for the analysis frame (raw data minus its first row).
feedback_data_filter.describe()

## Age group and Programming language

In [None]:
# Display label for each Q7 column. Insertion order matters: the labels are
# later assigned positionally as column names.
prog_lang_map = dict([
    ('Q7_Part_1', 'Python'),
    ('Q7_Part_2', 'R'),
    ('Q7_Part_3', 'SQL'),
    ('Q7_Part_4', 'C'),
    ('Q7_Part_5', 'C++'),
    ('Q7_Part_6', 'Java'),
    ('Q7_Part_7', 'Javascript'),
    ('Q7_Part_8', 'Julia'),
    ('Q7_Part_9', 'Swift'),
    ('Q7_Part_10', 'Bash'),
    ('Q7_Part_11', 'MATLAB'),
    ('Q7_Part_12', 'None'),
    ('Q7_OTHER', 'other'),
])

In [None]:
# Count the non-null answers in every Q7 (language) column within each Q1
# group. The columns are selected with a *list*: indexing a GroupBy with a bare
# tuple of names (`gb['a', 'b']`) was deprecated and is rejected by recent
# pandas releases.
age_python = feedback_data_filter.groupby(['Q1'])[[
    'Q7_Part_1', 'Q7_Part_2', 'Q7_Part_3', 'Q7_Part_4', 'Q7_Part_5',
    'Q7_Part_6', 'Q7_Part_7', 'Q7_Part_8', 'Q7_Part_9', 'Q7_Part_10',
    'Q7_Part_11', 'Q7_Part_12', 'Q7_OTHER',
]].count()

In [None]:
# Swap the Q7 column ids for their display labels; this relies on
# `prog_lang_map` listing the columns in the same order as the frame.
age_python.columns = list(prog_lang_map.values())

In [None]:
# Show the per-age-group answer counts for each language.
age_python

In [None]:
# Age-group labels, used as the angular axis of the radar chart below.
categories = age_python.index.tolist()

In [None]:
# Radar chart with one filled trace per language: age groups run around the
# circle and the answer count is the radius. The traces are generated in a
# loop instead of eleven copy-pasted calls; the 'None' and 'other' columns are
# intentionally left out, exactly as before.
radar_languages = [
    'Python', 'R', 'SQL', 'C', 'C++', 'Java',
    'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB',
]

fig = go.Figure()

for language in radar_languages:
    fig.add_trace(go.Scatterpolar(
        r=list(age_python[language]),
        theta=categories,
        fill='toself',
        name=language,
    ))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=True,
)

fig.show()

The radar chart shows that Python is far more widely adopted among young people than any other language. SQL is next in the list and is used mostly by middle-aged people. C++ is used mostly by adults.

In [None]:
# Grouped bar chart of the same counts: one bar colour per language, limited to
# the first eleven columns of `age_python`.
color_ = [
    'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
    'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan', 'teal',
]
filtered_prog_lang = age_python.iloc[:, :11]
filtered_prog_lang.plot.bar(color=color_, width=1.0, figsize=(20, 7))

The same information from the previous analysis is shown here as a grouped bar chart. In every age group, Python is the most used language. SQL, R, and C++ are also used by people in the different age groups.

## Participation by Gender

In [None]:
# Pie chart of the number of responses per Q2 value.
feedback_country_gender = feedback_data_filter[['Q3', 'Q2']]
gender_groups = feedback_country_gender.groupby('Q2')
feedback_series = gender_groups['Q2'].count()
colors = ['yellowgreen', 'red', 'gold', 'lightskyblue', 'lightcoral']
feedback_series.plot.pie(
    label="",
    title="Total Participation by Gender wise",
    figsize=(10, 5),
    autopct='%1.1f%%',
    radius=1.2,
    colors=colors,
    startangle=90,
)
plt.tight_layout()
plt.show(block=True)

The graph shows participation in the survey by gender. Participation by men is high compared with women and the other categories.

## Participation from different Countries

In [None]:
# Number of responses per Q3 value, largest first.
responses_per_country = feedback_data_filter.groupby('Q3')['Q3'].agg(['count'])
count_df = responses_per_country.sort_values(by='count', ascending=False)

In [None]:
# Keep only the rows with more than 500 responses, then expose the group key
# as a regular column and give both columns display names.
has_enough_responses = count_df['count'] > 500
count_df = count_df.loc[has_enough_responses].reset_index()
count_df.columns = ['Country', 'Count']

In [None]:
# Bar chart of the response count per country.
# `colors` is a list, not a set: a set has no defined order, so the colour
# assignment would be arbitrary and could change from run to run.
colors = ['yellowgreen', 'orchid', 'orange', 'salmon', 'teal', 'm']
# Plot the 'Count' Series rather than the DataFrame: for a DataFrame bar plot a
# colour list is applied per *column*, so every bar would get one colour; for a
# Series the list is applied per bar (and cycled if it is shorter).
count_df.set_index('Country')['Count'].plot(kind='bar', color=colors, figsize=(10, 5))
plt.xlabel('Country')
plt.ylabel('Count')

India has the highest number of participants of any country. The chart shows only the countries from which more than 500 people participated in the survey.

## Maximum participation from different Genders, their origin

In [None]:
# Response count for every (Q3, Q2) pair as a flat frame with columns
# Q3, Q2 and 'count'. `size().reset_index(name=...)` replaces the earlier
# `as_index=False` + `agg(['count'])` + `reset_index()` chain, which was
# redundant and whose output layout depends on the pandas version. Rows with a
# missing key are dropped by groupby, so the group size equals the non-null
# count of Q2.
test = (
    feedback_country_gender
    .groupby(['Q3', 'Q2'])
    .size()
    .reset_index(name='count')
)
# For each Q2 value keep the single row (i.e. the Q3 value) with the highest count.
result = test.loc[test.groupby('Q2')['count'].idxmax()]

In [None]:
# Bar per Q2 value showing its highest count, coloured by the country (Q3)
# that achieved it.
colors = {'India':'seagreen', 'United States of America':'firebrick'}
# Derive each bar's colour from its own row instead of a hard-coded positional
# list, so the colours stay correct if the row order or the winning country
# changes. Countries missing from `colors` fall back to gray (and are not
# listed in the legend).
bar_colors = [colors.get(country, 'gray') for country in result['Q3']]
# Plot the Series: a colour list on a DataFrame bar plot is applied per column,
# not per bar.
result.set_index('Q2')['count'].plot(kind='bar', color=bar_colors)
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels)
plt.xlabel('Gender')
plt.ylabel('Highest Count')

The graph combines the previous two stories by showing, for each gender, the country with the highest participation. India is the country with the most participants among both men and women. The USA is the top country for participants who identified as Nonbinary.

## Participation from different professions

In [None]:
# Responses per Q5 value, sorted ascending so the tallest bar ends up on the
# right-hand side.
responses_per_career = feedback_data_filter.groupby('Q5')['Q5'].count()
current_career = responses_per_career.sort_values()
current_career.plot.bar(color=['tab:green'], figsize=(10, 5))
plt.xlabel('Career')
plt.ylabel('Count')

As expected, participation from students is huge compared to other professions. Data scientists and software engineers are the next largest groups of professionals who participated in the survey. Please note that the category 'Other', which covers everything apart from the listed professions, also has a high participation count. Participation by DBA/Database Engineers is low compared to other professionals, yet SQL usage in the population is high. This indicates that students and people in other professions use SQL frequently in their tasks.

## Distribution of programming languages used by professionals

In [None]:
# Language usage per profession, leaving out respondents whose Q5 answer is
# 'Student', 'Currently not employed' or 'Other'.
filtered_prof = feedback_data_filter[~feedback_data_filter['Q5'].isin(['Student', 'Currently not employed', 'Other'])]
# Select the Q7 columns with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by recent pandas releases.
grouped_profession_lang = filtered_prof.groupby('Q5')[[
    'Q7_Part_1', 'Q7_Part_2', 'Q7_Part_3', 'Q7_Part_4', 'Q7_Part_5',
    'Q7_Part_6', 'Q7_Part_7', 'Q7_Part_8', 'Q7_Part_9', 'Q7_Part_10',
    'Q7_Part_11',
]].count()
# Put languages on the rows so each subplot (one per profession) lists them.
grouped_profession_lang = grouped_profession_lang.transpose()
grouped_profession_lang = grouped_profession_lang.rename(index=prog_lang_map)
grouped_profession_lang.plot(kind='barh', subplots=True, layout=(5, 2), figsize=(15, 10))

A detailed graph of the programming languages most frequently used by each profession. Statisticians mostly use R, and DBA/Database Engineers use SQL as their primary language, but in all other cases Python is the most heavily used language.

# Notebooks usage 

## different Notebooks population in total data

In [None]:
# Map every Q10 column id to its answer label, taken as the text after the
# second '-' of the column's descriptive text. `maxsplit=2` keeps the whole
# remainder, so a label that itself contains a hyphen is not cut short.
notebooks = {
    key: value.split('-', 2)[2].strip()
    for (key, value) in queryid_map.items()
    if 'Q10_' in key
}

In [None]:
# Show the Q10 column id -> label mapping.
notebooks

In [None]:
# Non-null answer count of each Q10 column, keyed by its display label.
notebook_counts = {
    label: feedback_data_filter[column].count()
    for column, label in notebooks.items()
}


In [None]:
# Bar chart of the answer count per notebook label, with vertical tick labels.
bar_positions = range(len(notebook_counts))
plt.bar(bar_positions, list(notebook_counts.values()), align='center', color='green')
plt.xticks(bar_positions, list(notebook_counts.keys()), rotation='vertical')
plt.xlabel('Notebook')
plt.ylabel('Count')

Colab notebooks are the most used notebooks among all survey respondents. Kaggle notebooks come next and are also used heavily. Interestingly, many people don't use notebooks at all.

In [None]:
# Notebook usage per profession, leaving out respondents whose Q5 answer is
# 'Student', 'Currently not employed' or 'Other'.
filtered_prof = feedback_data_filter[~feedback_data_filter['Q5'].isin(['Student', 'Currently not employed', 'Other'])]
# Select the Q10 columns with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and is rejected by recent pandas releases.
grouped_profession_notebk = filtered_prof.groupby('Q5')[[
    'Q10_Part_1', 'Q10_Part_2', 'Q10_Part_3', 'Q10_Part_4', 'Q10_Part_5',
    'Q10_Part_6', 'Q10_Part_7', 'Q10_Part_8', 'Q10_Part_9', 'Q10_Part_10',
    'Q10_Part_11', 'Q10_Part_12', 'Q10_Part_13', 'Q10_OTHER',
]].count()
# Put notebook products on the rows so each subplot (one per profession) lists them.
grouped_profession_notebk = grouped_profession_notebk.transpose()
grouped_profession_notebk = grouped_profession_notebk.rename(index=notebooks)
grouped_profession_notebk.plot(kind='barh', layout=(7, 2), subplots=True, figsize=(30, 30))
plt.tight_layout(pad=3.0)

We can go a little deeper and see which professionals don't use notebooks at all. In every one of these professions there is a group of people who don't use any kind of notebook. Business analysts, DBA/Database engineers, statisticians, and product/project managers are the professions where the number of people who don't use notebooks is high compared to the number using any particular notebook.

Also, a comparable number of data scientists, software engineers, data analysts, machine learning engineers, and research scientists use notebooks other than the ones shown in the graph.

## Choice of Data science courses Vs Profession

In [None]:
# Map every Q37 column id to its answer label, taken as the text after the
# second '-' of the column's descriptive text. `maxsplit=2` keeps the whole
# remainder; a plain `split('-')[2]` would truncate any label that itself
# contains a hyphen (for instance a label such as 'Cloud-certification ...').
course_map = {
    i: j.split('-', 2)[2].strip()
    for (i, j) in queryid_map.items()
    if 'Q37' in i
}

In [None]:
# Show the Q37 column id -> label mapping.
course_map

In [None]:
# Non-null answer count of every Q37 column within each Q5 group. The columns
# are selected with a list: indexing a GroupBy with a bare tuple of names was
# deprecated and is rejected by recent pandas releases.
course_career = feedback_data_filter.groupby('Q5')[[
    'Q37_Part_1', 'Q37_Part_2', 'Q37_Part_3', 'Q37_Part_4', 'Q37_Part_5',
    'Q37_Part_6', 'Q37_Part_7', 'Q37_Part_8', 'Q37_Part_9', 'Q37_Part_10',
    'Q37_Part_11', 'Q37_OTHER',
]].count()
# A list, not a set: a set has no defined order, so which line gets which
# colour would be arbitrary and could change from run to run.
colors = ['yellowgreen', 'orchid', 'orange', 'salmon', 'navy', 'm', 'red',
          'tab:blue', 'tab:brown', 'tab:cyan', 'chocolate', 'plum', 'yellow']
# Courses on the rows (x axis of the line plot), one column per Q5 group.
course_career = course_career.transpose().rename_axis(None)
course_career = course_career.rename(index=course_map)

In [None]:
# Line plot with one line per column of `course_career`, coloured from `colors`.
course_career.plot.line(color=colors, figsize=(15, 10))

The graph above shows the data science courses chosen by people from different professions. Coursera is the data science platform used by both professionals and students. The next most common choice among professionals such as data scientists is Kaggle Learn Courses.

## Distribution of tools used by the population

In [None]:
# Frequency of each distinct Q38 answer (missing answers are not counted).
feedback_data_filter['Q38'].value_counts()

In [None]:
# Pie chart of the share of each Q38 answer.
tool_usage = feedback_data_filter['Q38'].value_counts()
tool_usage.plot.pie(autopct='%1.1f%%', radius=1.2, figsize=(10, 5))
plt.title('Tools Usage', pad=20)
plt.tight_layout()
plt.show()

The graph shows the primary tools used by the total population. Most people use local development environments such as RStudio and JupyterLab. People also use basic statistical software such as Excel and Google Sheets for data analysis. All other tools are comparable in usage count.

## Compensation Vs career

In [None]:
# Number of responses for every (Q3, Q5, Q24) combination.
career_country_remuneration = (
    feedback_data_filter
    .groupby(['Q3', 'Q5', 'Q24'])['Q24']
    .agg(['count'])
)
# Displayed only: the flattened copy is not assigned, so the frame above keeps
# its three-level index for the next cell.
career_country_remuneration.reset_index()

In [None]:
# For each (Q3, Q5) pair keep the Q24 row with the highest count, then flatten
# the index into ordinary columns for plotting.
top_row_labels = career_country_remuneration.groupby(['Q3', 'Q5'])['count'].idxmax()
result = career_country_remuneration.loc[top_row_labels].reset_index()

In [None]:
# One bar-chart facet per Q3 value: Q24 on the x axis, count on the y axis,
# bars coloured by Q5. `sns.catplot` replaces `sns.factorplot`, which was
# deprecated and later removed from seaborn; all arguments carry over unchanged.
career_grouped_plt = sns.catplot(
    x='Q24', y='count', hue='Q5',
    col='Q3', data=result, kind='bar', col_wrap=2,
    sharey=False, height=4, aspect=4,
)
career_grouped_plt.set_xticklabels(rotation=30)