In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec 
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot
import warnings
# Install a catch-all "ignore" filter: from here on, warnings raised by any
# library used in this notebook are not displayed.
warnings.filterwarnings(action='ignore')

In [None]:
# Read the survey responses CSV into the notebook-wide frame `df`,
# then preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
)
df.head(n=5)

In [None]:
# Discard the rows labelled 0 and 1.
# NOTE(review): presumably these are non-response rows (e.g. question text);
# confirm that row 1 is not a genuine respondent before relying on totals.
df = df.drop(index=[0, 1])

In [None]:
# Number of missing cells per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Replace every missing cell with the placeholder string "Missing".
#
# This is done on the frame itself and the result is re-bound to `df`.
# Calling `df[col].fillna(..., inplace=True)` column by column operates on a
# temporary Series; with pandas Copy-on-Write enabled that leaves `df`
# untouched, and every later `!= "Missing"` check would then treat NaN as a
# selected answer.
df = df.fillna("Missing")

In [None]:
def univariate_plots(feat,name):
    """Render a pie chart of how often each value occurs in ``df[feat]``.

    Parameters
    ----------
    feat
        Column label looked up in the notebook-level ``df``.
    name
        Text placed in front of " Distribution" to build the chart title.

    Notes
    -----
    Reads the notebook globals ``df`` and ``orange_black`` at call time.
    Nothing is returned; the figure is displayed with ``fig.show()``.
    """
    # Occurrences per distinct value, most frequent first.
    frequencies = df[feat].value_counts().sort_values(ascending = False)

    # Slice sizes come from the counts, slice labels from the distinct values.
    pie_options = dict(
        values=frequencies.values,
        names=frequencies.index,
        color_discrete_sequence=orange_black,
        hole=.1,
        title= name + " Distribution",
    )
    chart = px.pie(frequencies, **pie_options)

    # Show only the percentage on each slice and pull slices slightly apart.
    chart.update_traces(textinfo='percent', pull=0.01)
    chart.show()

In [None]:
# Seven-colour palette shared by the pie charts and several bar charts.
orange_black = [
    '#fdc029',
    '#df861d',
    '#FF6347',
    '#aa3d01',
    '#a30e15',
    '#800000',
    '#171820',
]

# Apply the "fivethirtyeight" matplotlib style sheet to subsequent figures.
plt.style.use('fivethirtyeight')

Age Distribution

In [None]:
# The raw Q1 column, ordered by row label.
agedata = df.loc[:, 'Q1'].sort_index(ascending=True)
agedata

In [None]:
# NOTE(review): the `sb` alias is not used in the visible cells; it is kept
# only in case a later cell relies on it.
import seaborn as sb

# Draw the age brackets in sorted label order. Without an explicit `order`
# seaborn places the bars in the order the values first occur in the column,
# which scrambles the brackets along the axis.
age_order = df['Q1'].value_counts().sort_index().index
sns.countplot(x="Q1", data=df, order=age_order)

plt.title('Distribution of Age',fontsize=20)
plt.xticks(rotation =90, fontsize=20)
plt.xlabel('Age',fontsize=20)

In [None]:
# Pie chart of the Q1 answers, titled "Age Wise Distribution".
univariate_plots(feat="Q1", name="Age Wise")

Gender

In [None]:
# Respondents per Q2 answer, listed alphabetically by answer label.
gender = df['Q2'].value_counts()
gender = gender.sort_index()
gender

In [None]:
import seaborn as sb
import seaborn as sns

x = df['Q2']

# Count plot of Q2 with the bars laid out in the order of `gender`'s index
# (the alphabetically sorted answer labels computed in the previous cell).
sns.countplot(
    x='Q2',
    data=df,
    order=gender.index,
)

plt.title('Gender', fontsize=20)
plt.xticks(rotation=90, fontsize=20)
plt.xlabel('Gender', fontsize=20)

In [None]:
# Pie chart of the Q2 answers, titled "Gender Distribution".
univariate_plots(feat="Q2", name="Gender")

Country

In [None]:
# Respondents per Q3 answer, listed alphabetically by answer label.
country = df['Q3'].value_counts()
country = country.sort_index()
country

Education

In [None]:
# Respondents per Q4 answer, listed alphabetically by answer label.
education = df['Q4'].value_counts()
education = education.sort_index()
education

In [None]:
# Pie chart of the Q4 answers, titled "Education Distribution".
univariate_plots(feat="Q4", name="Education")

In [None]:
# Draw the education levels from most to least common answer. Without an
# explicit `order` seaborn places the bars in the order the values first occur
# in the column, which is arbitrary for this question.
education_order = df['Q4'].value_counts().index
sns.countplot(x="Q4", data=df, order=education_order)

plt.title('Distribution of Education',fontsize=20)
plt.xticks(rotation=90, fontsize=20)
plt.xlabel('Education',fontsize=20)

In [None]:
# Pie chart of the Q4 answers, titled "Education Status Distribution".
univariate_plots(feat="Q4", name="Education Status")

Current Role

In [None]:
# Respondents per Q5 answer, listed alphabetically by answer label.
role = df['Q5'].value_counts()
role = role.sort_index()
role


Programming Language Experience

In [None]:

# Respondents per Q6 answer, listed alphabetically by answer label.
experience = df['Q6'].value_counts()
experience = experience.sort_index()
experience

Programming Language

In [None]:
# Columns holding the Q7 answers (one column per answer option).
language = df[[col for col in df.columns if 'Q7' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column, and the
# most frequent value of a rarely chosen option would be "Missing" itself.
language_counts = {}
for col in language.columns:
    answers = language[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    language_counts[picked.value_counts().index[0]] = len(picked)

language_all = pd.Series(language_counts, dtype='int')
language_all = language_all.sort_values(ascending=True)
language_all

Recommended Programming Language

In [None]:
# Respondents per Q8 answer, listed alphabetically by answer label.
recommended = df['Q8'].value_counts()
recommended = recommended.sort_index()
recommended

In [None]:
# Axis labels, in the same order as the Q7 columns listed below.
index  = ["Python","R","SQL","C","C++","JAVA","Java Script","Julia","Swift","Bash","MATLAB","None","Other"]

# Q7_Part_1 ... Q7_Part_12 followed by the free-form "other" column.
q7_columns = ['Q7_Part_{}'.format(part) for part in range(1, 13)] + ['Q7_OTHER']

# Share of all rows (in percent) whose cell is not the "Missing" placeholder.
values = [
    (len(df[df[column] != "Missing"]) / df.shape[0]) * 100
    for column in q7_columns
]

users = pd.DataFrame({"index": index, "value": values})

plt.figure(figsize=(12, 8))
sns.barplot(
    x=index,
    y=values,
    palette=orange_black,
)
plt.xticks(rotation=30)
plt.title('Programming Languages used on a Regular Basis')

Favourite IDE

In [None]:
# Columns holding the Q9 answers (one column per answer option).
df_ides = df[[col for col in df.columns if 'Q9' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column.
ide_counts = {}
for col in df_ides.columns:
    answers = df_ides[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    ide_counts[picked.value_counts().index[0]] = len(picked)

df_ides_all = pd.Series(ide_counts, dtype='int')
df_ides_all

In [None]:
# Axis labels, in the same order as the Q9 columns listed below.
index  = ["Jupyter","RStudio","Visual Studio","VSCode","PyCharm","Spyder","Notepad++","Sublime Text","Vim/Emacs","MATLAB","None","Other"]

# Q9_Part_1 ... Q9_Part_11 followed by the free-form "other" column.
q9_columns = ['Q9_Part_{}'.format(part) for part in range(1, 12)] + ['Q9_OTHER']

# Share of all rows (in percent) whose cell is not the "Missing" placeholder.
values = [
    (len(df[df[column] != "Missing"]) / df.shape[0]) * 100
    for column in q9_columns
]

users = pd.DataFrame({"index": index, "value": values})

plt.figure(figsize=(12, 8))
sns.barplot(
    x=index,
    y=values,
    palette=orange_black,
)
plt.xticks(rotation=20)
plt.title('Integrated Development Environment')


Hosted Notebooks

In [None]:
# Columns holding the Q10 answers (one column per answer option).
df_notebooks = df[[col for col in df.columns if 'Q10' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column.
notebook_counts = {}
for col in df_notebooks.columns:
    answers = df_notebooks[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    notebook_counts[picked.value_counts().index[0]] = len(picked)

df_notebooks_all = pd.Series(notebook_counts, dtype='int')
df_notebooks_all

In [None]:
# Axis labels, in the same order as the Q10 columns listed below.
index  = ["Kaggle Notebooks","Colab Notebooks","Azure Notebooks","PaperSpace/Gradient","Binder/JupyterHub","Code Ocean",
          "IBM Watson Studio","Amazon Sagemaker Studio","Amazon EMR Notebooks","Google Cloud AI Platform Notebook",
          "Google Cloud Datalab Notebooks","Databricks Collaborative Notebooks","None","Other"]

# Q10_Part_1 ... Q10_Part_13 followed by the free-form "other" column.
q10_columns = ['Q10_Part_{}'.format(part) for part in range(1, 14)] + ['Q10_OTHER']

# Share of all rows (in percent) whose cell is not the "Missing" placeholder.
values = [
    (len(df[df[column] != "Missing"]) / df.shape[0]) * 100
    for column in q10_columns
]

users = pd.DataFrame({"index": index, "value": values})

# Horizontal bars: percentages on the x axis, product names on the y axis.
plt.figure(figsize=(12, 8))
sns.barplot(
    x=values,
    y=index,
    palette="rocket",
)
plt.xticks(rotation=30)
plt.title('Hosted Notebook Products used')

Computing Platform

In [None]:
# Respondents per Q11 answer, listed alphabetically by answer label.
platform = df['Q11'].value_counts()
platform = platform.sort_index()
platform

In [None]:
# Pie chart of the Q11 answers, titled "Computing Platform Wise Distribution".
univariate_plots(feat="Q11", name="Computing Platform Wise")

Hardware

In [None]:
# Columns holding the Q12 answers (one column per answer option).
df_hardware = df[[col for col in df.columns if 'Q12' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column.
hardware_counts = {}
for col in df_hardware.columns:
    answers = df_hardware[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    hardware_counts[picked.value_counts().index[0]] = len(picked)

df_hardware_all = pd.Series(hardware_counts, dtype='int')
df_hardware_all

In [None]:
# Axis labels for the four bars, in plotting order.
index  = ['GPUs','None','TPUs','Others']

# (column, cell value that marks a selection), in the same order as `index`.
q12_selections = [
    ('Q12_Part_1', "GPUs"),
    ('Q12_Part_3', "None"),
    ('Q12_Part_2', "TPUs"),
    ('Q12_OTHER', "Other"),
]

# Share of all rows (in percent) whose cell equals the selection value.
values = [
    (len(df[df[column] == selected]) / df.shape[0]) * 100
    for column, selected in q12_selections
]

users = pd.DataFrame({"Users%": values})

plt.figure(figsize=(12, 8))

sns.barplot(
    x=index,
    y="Users%",
    data=users,
    palette="deep",
)
plt.title('Specialized Hardware Used')

TPU Use Rate

In [None]:
# Respondents per Q13 answer, listed alphabetically by answer label.
tpu = df['Q13'].value_counts()
tpu = tpu.sort_index()
tpu

In [None]:
# Pie chart of the Q13 answers, titled "TPU Use Rate Wise Distribution".
univariate_plots(feat="Q13", name="TPU Use Rate Wise")

Visualization Libraries

In [None]:
# Columns holding the Q14 answers (one column per answer option).
df_library = df[[col for col in df.columns if 'Q14' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column.
library_counts = {}
for col in df_library.columns:
    answers = df_library[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    library_counts[picked.value_counts().index[0]] = len(picked)

df_library_all = pd.Series(library_counts, dtype='int')
df_library_all

In [None]:
# Axis labels, in the same order as the Q14 columns listed below.
# ("Leaflet/Folium" was previously misspelled on the chart.)
index  = ["Matplotlib","Seaborn","Plotly","GGplot","Shiny","D3 js","Altair","Bokeh","Geoplotlib","Leaflet/Folium","None","Other"]

# Q14_Part_1 ... Q14_Part_11 followed by the free-form "other" column.
q14_columns = ['Q14_Part_{}'.format(part) for part in range(1, 12)] + ['Q14_OTHER']

# Share of all rows (in percent) whose cell is not the "Missing" placeholder.
values = [
    (len(df[df[column] != "Missing"]) / df.shape[0]) * 100
    for column in q14_columns
]

users = pd.DataFrame({"index":index,"value":values})

# Horizontal bars: percentages on the x axis, library names on the y axis.
plt.figure(figsize = (12,8))
sns.barplot(x=values,
            y=index,
            palette="pastel")
plt.xticks(rotation=30)
plt.title('Data Visualization Libraries or Tools')

Years of Using Machine Learning

In [None]:
# Column(s) holding the Q15 answers.
df_mlyears = df[[col for col in df.columns if 'Q15' in col]]

# Q15 is read as a single column elsewhere in this notebook (df["Q15"]), so it
# is summarised as respondents per answer rather than with the per-option
# tally used for the multi-column questions.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook and are left out of the summary.
q15_answers = df_mlyears['Q15']
q15_answered = q15_answers[q15_answers.notna() & (q15_answers != "Missing")]
df_mlyears_all = q15_answered.value_counts()

# Show the summary rather than the raw column.
df_mlyears_all

In [None]:
# Pie chart of the Q15 answers, titled "Year using ML Methods Distribution".
univariate_plots(feat="Q15", name="Year using ML Methods")


Most Used Machine Learning Framework

In [None]:
# Columns holding the Q16 answers (one column per answer option).
df_mlframe = df[[col for col in df.columns if 'Q16' in col]]

# Tally how many respondents picked each option.
# Unanswered cells were replaced with the string "Missing" earlier in the
# notebook, so they have to be excluded explicitly: Series.count() only skips
# NaN and would otherwise report the full row count for every column.
mlframe_counts = {}
for col in df_mlframe.columns:
    answers = df_mlframe[col]
    picked = answers[answers.notna() & (answers != "Missing")]
    if picked.empty:
        continue  # nobody selected this option
    # The remaining values carry the option's label; key the count by it.
    mlframe_counts[picked.value_counts().index[0]] = len(picked)

df_mlframe_all = pd.Series(mlframe_counts, dtype='int')

# Show the per-option summary rather than the raw answer columns.
df_mlframe_all

In [None]:
# Axis labels, in the same order as the Q16 columns listed below.
# ("H2O 3" is spelled with the letter O; it was previously the digit zero.)
index  = ["Scikit-Learn","TensorFlow","Keras","PyTorch","Fast.ai","MXNet","XGBoost","LGBM","CatBoost","Prophet","H2O 3",
          "Caret","Tidymodels","JAX","None","Other"]

# Q16_Part_1 ... Q16_Part_15 followed by the free-form "other" column.
q16_columns = ['Q16_Part_{}'.format(part) for part in range(1, 16)] + ['Q16_OTHER']

# Share of all rows (in percent) whose cell is not the "Missing" placeholder.
values = [
    (len(df[df[column] != "Missing"]) / df.shape[0]) * 100
    for column in q16_columns
]

users = pd.DataFrame({"index":index,"value":values})

plt.figure(figsize = (12,8))
sns.barplot(x=index,
            y=values,
            palette="Set2")
plt.xticks(rotation=30)
plt.title('Machine Learning Frameworks')

Size of the company where you are employed

In [None]:
# Pie chart of the Q20 answers, titled "Size Of the Company Distribution".
univariate_plots(feat="Q20", name="Size Of the Company")