# 2021 Kaggle Machine Learning & Data Science Survey

# Preprocessing

## Import Important Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import plotly.express as px
from plotly.offline import init_notebook_mode
import plotly.graph_objects as go
init_notebook_mode(connected=True)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases no longer ship the old names, so use the legacy name only
# when this Matplotlib still provides it.
matplotlib.style.use(
    'seaborn-dark'
    if 'seaborn-dark' in matplotlib.style.available
    else 'seaborn-v0_8-dark'
)

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
pd.set_option("display.max_columns" , 400)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Read Dataset

In [None]:
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv" , low_memory=False)

In [None]:
# Row 0 of the raw frame is treated as the question text for each column:
# keep everything before the first "?" and re-append " ?" as a short title.
question_row = df.iloc[0]
ques = {
    column: text.split('?')[0] + " ?"
    for column, text in question_row.items()
}

# Drop that row so only responses remain, and renumber the index from 0.
df = df.iloc[1:, :].reset_index(drop=True)

## Helper Functions

In [None]:
def facetgrid_bar(df,column, col_order, num_cols, x_axis, y_axis, hue, order,title, x_axis_title,y_axis_title):
    """Draw one bar chart per value of ``column`` on a wrapped facet grid.

    Args:
        df: Long-form DataFrame containing the columns named below.
        column: Column whose values define the facets.
        col_order: Facet values to draw, in display order.
        num_cols: Number of facets per row before the grid wraps.
        x_axis: Column plotted on the x axis of every facet.
        y_axis: Column giving the bar heights.
        hue: Column used to colour the bars (bars are not dodged).
        order: Order of the x-axis categories.
        title: Figure-level title.
        x_axis_title: x label applied to every facet.
        y_axis_title: y label applied to the first facet only.
    """
    grid = sns.FacetGrid(
        df,
        col=column,
        col_wrap=num_cols,
        col_order=col_order,
        height=10,
        sharex=False,
    )

    bar_options = dict(
        x=x_axis,
        y=y_axis,
        hue=hue,
        order=order,
        dodge=False,
        palette='Blues_d',
    )
    grid.map_dataframe(sns.barplot, data=df, **bar_options)

    grid.set_titles(col_template="{col_name}", row_template="{row_name}", size=16)
    grid.set_xticklabels(rotation=90, size=15)

    figure = grid.fig
    figure.subplots_adjust(top=.5)
    figure.suptitle(title, fontsize=20, weight='bold')

    # Only the first facet carries the y label; every facet gets the x label.
    facet_axes = grid.axes.ravel()
    facet_axes[0].set_ylabel(y_axis_title)
    for facet_ax in facet_axes:
        facet_ax.set_xlabel(x_axis_title)

    figure.tight_layout()

In [None]:
def _multi_select_counts(df, c):
    """Return ``{answer: count}`` for the multi-select question ``c``.

    Only columns named ``<c>_Part_<n>`` or ``<c>_OTHER`` are used, in the
    order they appear in ``df``.  For each one, the most frequent value is
    taken as the answer label and its frequency as the count (this assumes
    each part column holds a single answer label or NaN -- confirm against
    the dataset).  Columns without any non-null value are skipped.
    """
    # Match "<c>_" rather than the bare id so that e.g. "Q1" does not also
    # pick up the columns of "Q10", "Q11", ...
    prefix = c + "_"
    counts = {}
    for name in df.columns:
        if not name.startswith(prefix):
            continue
        suffix = name[len(prefix):]
        if not (suffix.startswith("Part_") or suffix == "OTHER"):
            continue
        vc = df[name].value_counts()
        if vc.empty:
            # Nobody selected this option; there is no label to plot.
            continue
        counts[vc.index[0]] = vc.iloc[0]
    return counts


def columnRange_barplot(df , c  ,ques  ,xlabel=None, ylabel=None  , rotation=45 , lbfontsize=14 , tfontsize=20 , color=sns.color_palette("viridis", 8)):
    """Bar-plot how often each option of a multi-select question was chosen.

    The bars cover exactly the ``<c>_Part_*`` and ``<c>_OTHER`` columns of
    ``df``; columns belonging to the following question are never included.
    Each bar is annotated with its share of all rows in ``df``.

    Args:
        df: Survey responses, one column per answer option.
        c: Question id, e.g. ``"Q7"``.
        ques: Mapping from column name to question text; the entry for
            ``<c>_Part_1`` is used as the plot title.
        xlabel: x-axis label.
        ylabel: y-axis label.
        rotation: Rotation of the x tick labels, in degrees.
        lbfontsize: Font size of the axis labels.
        tfontsize: Font size of the title.
        color: Bar colour(s) passed to ``Axes.bar``.

    Returns:
        dict mapping each answer label to the number of respondents who
        selected it.

    Raises:
        KeyError: if ``ques`` has no entry for ``<c>_Part_1``.
    """
    ans = _multi_select_counts(df, c)

    fig, ax = plt.subplots(figsize=(25, 10))
    ax.bar(list(ans.keys()), list(ans.values()), color=color)
    plt.xticks(rotation=rotation)
    plt.xlabel(xlabel, fontsize=lbfontsize)
    plt.ylabel(ylabel, fontsize=lbfontsize)
    plt.title(ques[c + "_Part_1"], fontsize=tfontsize)

    # Percentages are relative to all respondents, not to the answers given,
    # because a respondent may tick several options.
    total = df.shape[0]
    for p in ax.patches:
        ax.annotate("%.1f%%" % (100*float(p.get_height()/total)), (p.get_x() + p.get_width() / 2., abs(p.get_height())),
                        ha='center', va='bottom', color='black', xytext=(0, 5),rotation = 'horizontal',
                        textcoords='offset points')
    plt.show()

    return ans

In [None]:
def simple_countplot(df , c  ,ques  ,xlabel=None, ylabel=None  , rotation=90 , lbfontsize=14 , tfontsize=20):
    """Count plot of column ``c`` with categories in sorted order.

    Each bar is annotated with its share of all rows in ``df``.

    Args:
        df: Survey responses.
        c: Name of the column to count.
        ques: Mapping from column name to question text (used as title).
        xlabel: x-axis label.
        ylabel: y-axis label.
        rotation: Rotation of the x tick labels, in degrees.
        lbfontsize: Font size of the axis labels.
        tfontsize: Font size of the title.
    """
    plt.figure(figsize=(25,7))
    # Pass the values as ``x=`` explicitly: newer seaborn releases interpret
    # the first positional argument as ``data`` rather than as the x vector.
    ax = sns.countplot(x=df[c].sort_values())
    plt.xticks(rotation=rotation)
    plt.xlabel(xlabel , fontsize=lbfontsize)
    plt.ylabel(ylabel , fontsize=lbfontsize)
    plt.title(ques[c] ,fontsize=tfontsize)
    total = df.shape[0]
    for p in ax.patches:
        ax.annotate("%.1f%%" % (100*float(p.get_height()/total)), (p.get_x() + p.get_width() / 2., abs(p.get_height())),
                        ha='center', va='bottom', color='black', xytext=(0, 5),rotation = 'horizontal',
                        textcoords='offset points')
    plt.show()

In [None]:
def simple_barplot(df , c  ,ques  ,xlabel=None, ylabel=None  , rotation=90 , lbfontsize=14 , tfontsize=20 , color=sns.color_palette("husl" , 8)):
    """Bar chart of the value counts of column ``c``, most frequent first.

    Every bar is labelled with its percentage of all rows in ``df``.

    Args:
        df: Survey responses.
        c: Name of the column to summarise.
        ques: Mapping from column name to question text (used as title).
        xlabel: x-axis label.
        ylabel: y-axis label.
        rotation: Rotation of the x tick labels, in degrees.
        lbfontsize: Font size of the axis labels.
        tfontsize: Font size of the title.
        color: Bar colour(s) handed to the pandas bar plot.
    """
    plt.figure(figsize=(25, 7))
    frequencies = df[c].value_counts()
    ax = frequencies.plot(kind='bar', color=color)

    plt.title(ques[c], fontsize=tfontsize)
    plt.xlabel(xlabel, fontsize=lbfontsize)
    plt.ylabel(ylabel, fontsize=lbfontsize)
    plt.xticks(rotation=rotation)

    for bar in ax.patches:
        bar_height = bar.get_height()
        share = 100 * float(bar_height / df.shape[0])
        anchor = (bar.get_x() + bar.get_width() / 2., abs(bar_height))
        ax.annotate(
            "%.1f%%" % share,
            anchor,
            ha='center',
            va='bottom',
            color='black',
            xytext=(0, 5),
            rotation='horizontal',
            textcoords='offset points',
        )
    plt.show()

In [None]:
def simple_pieChart(df , c  ,ques  ,xlabel=None, ylabel=None  , rotation=90 , lbfontsize=14 , tfontsize=20 , color=sns.color_palette("husl" , 8)):
    """Pie chart of the value counts of column ``c`` with percentage labels.

    Args:
        df: Survey responses.
        c: Name of the column to summarise.
        ques: Mapping from column name to question text (used as title).
        xlabel: x-axis label.
        ylabel: y-axis label.
        rotation: Rotation applied to the x ticks, in degrees.
        lbfontsize: Font size of the axis labels.
        tfontsize: Font size of the title.
        color: Accepted for signature parity with the other helpers; the
            body does not use it.
    """
    plt.figure(figsize=(25, 7))
    slice_sizes = df[c].value_counts()
    slice_sizes.plot(kind='pie', autopct='%1.1f%%')

    plt.xticks(rotation=rotation)
    plt.xlabel(xlabel, fontsize=lbfontsize)
    plt.ylabel(ylabel, fontsize=lbfontsize)
    plt.title(ques[c], fontsize=tfontsize)
    plt.show()

# Analysis

In [None]:
simple_countplot(df , "Q1" ,ques  ,xlabel="Age Groups" )


### Observation
* ***Approximately 56 percent of DS/ML developers are between 18 and 30 years old.*** Data science is a relatively new industry, and it is attracting young people.

In [None]:
simple_pieChart(df , "Q2" , ques , xlabel="Gender")

In [None]:
# Age (Q1), gender (Q2) and country (Q3) per respondent.  Take an explicit
# copy so the Q2 recode below never writes into a view of ``df``.
age_gender = df[['Q1', 'Q2', 'Q3']].sort_values(by=["Q1", 'Q2']).copy()
# Collapse every answer other than Man/Woman into a single bucket.
age_gender['Q2'] = np.where(age_gender['Q2'].isin(['Man' , 'Woman']) , age_gender['Q2'] , 'Nonbinary, self-describe or Undeclared')

plt.figure(figsize=(20,8))
ax = sns.histplot(age_gender,x="Q1", hue="Q2", hue_order=['Man','Woman', 'Nonbinary, self-describe or Undeclared'],
    multiple="stack",
    palette="viridis_r",
    log_scale=False,
    linewidth= .5)
sns.despine(top=True, right=True, left=True, bottom=False)

plt.xticks(rotation=70,fontsize = 12)
ax.set_xlabel('Age Group')
ax.set_ylabel('Number of Participants')
plt.title("Age and Gender Distribution")

total_respondents = df.shape[0]
for p in ax.patches:
    segment_height = p.get_height()
    if not segment_height > 0:
        # Empty (or NaN) segment: a "0.0%" label would only add clutter.
        continue
    # The bars are stacked, so a segment's top edge is its baseline plus its
    # own height -- not the height alone.
    segment_top = p.get_y() + segment_height
    ax.annotate("%.1f%%" % (100*float(segment_height/total_respondents)), (p.get_x() + p.get_width() / 2., segment_top),
         ha='center', va='bottom', color='black', xytext=(0, 6),rotation = 'horizontal',
         textcoords='offset points')

### Observation
* Approximately 80% of survey participants are men.
* Most survey participants are young, whether they are men, women, or nonbinary, self-described or undeclared participants.

In [None]:
# Respondent count per country.  Naming the index and the value column
# explicitly gives the same "Country" / "# of respondents" columns on every
# pandas version (pandas 2.0 changed the default column names produced by
# value_counts().reset_index()).
percent_per_country = (
    df['Q3']
    .value_counts()
    .rename_axis("Country")
    .reset_index(name="# of respondents")
)

In [None]:
# World map coloured by the number of respondents per country.  The colour
# scale is capped at 1000, so every country at or above that count shares
# the top colour.
fig = px.choropleth(percent_per_country,
                    locations = 'Country',
                    color = "# of respondents",
                    locationmode = 'country names',
                    color_continuous_scale = 'viridis',
                    # The map shows raw respondent counts, not percentages.
                    title = "Total number of responses per country in 2021",
                    range_color = [0, 1000])
# Centre the title horizontally.
fig.update(layout=dict(title=dict(x=0.5)))
fig.show()

In [None]:
df['Q3'].value_counts()

In [None]:
# Bar chart of the ten most frequent Q3 answers, each labelled with its share
# of all respondents.
c = 'Q3'
plt.figure(figsize=(25, 7))
leading_answers = df[c].value_counts().head(10)
ax = leading_answers.plot(kind='bar', color=sns.color_palette("rocket", 8))

plt.title(ques[c], fontsize=20)
plt.xlabel("Countries", fontsize=14)
plt.ylabel("Counts", fontsize=14)
plt.xticks(rotation=90)

for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x() + bar.get_width() / 2., abs(bar_height))
    ax.annotate(
        "%.1f%%" % (100 * float(bar_height / df.shape[0])),
        label_xy,
        ha='center',
        va='bottom',
        color='black',
        xytext=(0, 5),
        rotation='horizontal',
        textcoords='offset points',
    )
plt.show()

In [None]:
# Respondent count per country, largest first, leaving out the catch-all
# "Other" answer.  rename_axis + reset_index(name=...) yields the same
# "Country" / "Count" columns on every pandas version.
countries = (
    df.loc[df.Q3 != "Other", "Q3"]
    .value_counts()
    .rename_axis("Country")
    .reset_index(name="Count")
)
country_list = countries['Country'].tolist()
top_ten_countries = country_list[0:10]
top_ten_totals = countries['Count'].iloc[0:10].tolist()

# Restrict Q3 to the known countries (anything else, e.g. "Other", becomes
# NaN).  Built without ``inplace=True``, which pandas 2.0 removed from
# ``set_categories``.
age_gender['Q3'] = pd.Categorical(age_gender['Q3'], categories=country_list)

# observed=False keeps every gender/country pair, including pairs with zero
# respondents, independent of the pandas default.
gender_country = (
    age_gender.groupby(['Q2', 'Q3'], observed=False)
    .size()
    .reset_index(name='Count')
)
gender_country = gender_country.sort_values(by = ['Q2','Count'], ascending=True)


plt.figure(figsize=(20,8))
ax = sns.barplot(data=gender_country, x='Q3', y = 'Count', hue = 'Q2',order = top_ten_countries, hue_order = ['Man', 'Woman', 'Nonbinary, self-describe or Undeclared'],palette = "viridis")

sns.despine(top=True, right=True, left=True, bottom=False)
plt.xticks(rotation=70,fontsize = 12)
ax.set(xlabel=None)
ax.set(yticklabels=[])
ax.axes.get_yaxis().set_visible(False)
ax.set_ylabel('Number of Participants')
l = ax.legend()
l.set_title('Gender')
plt.title('Top 10 Countries - Gender % According to Countries of Residence')

# Label every bar with its count and its share of that country's respondents.
# The country is derived from the bar's x position (categorical slots sit at
# 0, 1, 2, ... and dodged bars stay within +/-0.5 of their slot) rather than
# from the order of ``ax.patches``.
for p in ax.patches:
    if p.get_width() == 0:
        # Zero-width artists are not real bars.
        continue
    bar_height = p.get_height()
    if np.isnan(bar_height):
        continue
    bar_centre = p.get_x() + p.get_width() / 2.
    slot = int(round(bar_centre))
    if not 0 <= slot < len(top_ten_totals):
        continue
    ax.annotate("%d\n%.0f%%" % (bar_height,100*float(bar_height/top_ten_totals[slot])), (bar_centre, abs(bar_height)),
    ha='center', va='bottom', color='black', xytext=(0, 6),rotation = 'horizontal',textcoords='offset points')

### Observation
* India, the United States of America and Japan are the countries with the most survey participants.
* Across countries worldwide, women account for roughly 10-25% of Kaggle 2021 survey participants.

In [None]:
simple_barplot(df , "Q4" , ques , xlabel="Formal Education" ,  color=sns.color_palette("mako", 8))

In [None]:
# Number of respondents per gender (Q2) and formal education level (Q4).
gender_education = df[['Q1' , 'Q2','Q4' ]].copy()
# The plot's hue_order uses the merged gender bucket, so recode Q2 the same
# way before counting; otherwise every respondent who is not Man/Woman would
# silently drop out of the chart.
gender_education['Q2'] = np.where(gender_education['Q2'].isin(['Man' , 'Woman']) , gender_education['Q2'] , 'Nonbinary, self-describe or Undeclared')
gender_education = gender_education.groupby(['Q2','Q4']).size().reset_index(name='Count')

plt.figure(figsize=(20,8))
ax = sns.barplot(data=gender_education, x='Q4', y = 'Count', hue = 'Q2', hue_order = ['Man', 'Woman', 'Nonbinary, self-describe or Undeclared'],palette = "rocket_r")

sns.despine(top=True, right=True, left=True, bottom=False)
plt.xticks(rotation=70,fontsize = 12)
ax.set_xlabel('Formal Education')
ax.set_ylabel('Number of Participants')
plt.title("Gender and Formal Education Distribution")

total_respondents = df.shape[0]
for p in ax.patches:
    bar_height = p.get_height()
    if np.isnan(bar_height):
        # No data for this gender/education pair: nothing to label.
        continue
    ax.annotate("%.1f%%" % (100*float(bar_height/total_respondents)), (p.get_x() + p.get_width() / 2., abs(bar_height)),
         ha='center', va='bottom', color='black', xytext=(0, 6),rotation = 'horizontal',
         textcoords='offset points')

In [None]:
# Share of each formal-education level (Q4) within every country (Q3), drawn
# as one facet per country for the first ten entries of country_list.
education = df.loc[:, ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']]

educationCountry = (
    education.groupby(['Q3', 'Q4'])
    .size()
    .reset_index(name="Count")
)
# Percentage of the country's respondents that each education level makes up.
educationCountry['%'] = 100 * (
    educationCountry['Count']
    / educationCountry.groupby('Q3')['Count'].transform('sum')
)

facetgrid_bar(
    df=educationCountry,
    column='Q3',
    col_order=country_list[0:10],
    num_cols=5,
    x_axis='Q4',
    y_axis='%',
    hue='Q4',
    order=[
        'Bachelor’s degree',
        'Doctoral degree',
        'Master’s degree',
        'No formal education past high school',
        'Professional doctorate',
        'Some college/university study without earning a bachelor’s degree',
        'I prefer not to answer',
    ],
    title='Formal Education by Country',
    x_axis_title='Education',
    y_axis_title='Percentage (%)',
)

### Observation
* Formal education is a major factor in Data Science and Machine Learning. In 2021, most survey participants hold a Master's or Bachelor's degree, while only about 10 percent hold a doctoral degree.
* Among both men and women, Master's and Bachelor's degrees are held by almost the same number of participants.

In [None]:
simple_barplot(df , "Q5" , ques , xlabel="Occupation" , color=sns.color_palette("flare"))

### Observation
***Almost 14 percent of participants hold a Data Scientist position in the data science industry.***

In [None]:
simple_barplot(df , "Q6" , ques , xlabel="Programming Experience" , color=sns.color_palette("viridis", 8))

### Observation 
***Data scientists have a good knowledge of programming and at least 1 year of experience.*** So we can say that DS/ML developers are also good programmers.

In [None]:
prog_lang = columnRange_barplot(df , "Q7" , ques , xlabel="Usual Language usage" , color=sns.color_palette("magma", 8))

In [None]:
simple_barplot(df , "Q8" , ques , xlabel="Programming Language Recommendation for aspirants" , color=sns.color_palette("crest", 8))

### Observation 
***Python is the most used and most recommended language in the data science industry, alongside a good knowledge of SQL and R.***

In [None]:
ans = columnRange_barplot(df , "Q9" , ques , xlabel="Common IDE's" , color=sns.color_palette("viridis", 8))

### Observation 
* Jupyter Notebook, Kaggle Notebooks and VSCode are the most common IDEs for analysis.
* I also think that Google Colaboratory should be included in this question.

In [None]:
ans = columnRange_barplot(df , "Q10" , ques , xlabel="Common Notebook Products" , color=sns.color_palette("viridis", 8))

### Observation 
* 37-38 percent of participants use hosted Kaggle notebooks and Colab notebooks for analysis, so we can conclude that Kaggle and Colab notebooks are the most used hosted notebooks.
* On the other hand, most survey participants used their own system for analysis.

In [None]:
simple_barplot(df , "Q11" , ques , xlabel="Common Computing Platforms" , color=sns.color_palette("rocket", 8))

### Observation 
* Cloud computing platforms and deep learning workstations are not much preferred by the participants.
* Participants mostly use their own PC/laptop for computation work.

In [None]:
common_hardware = columnRange_barplot(df , "Q12" , ques , xlabel="Common Hardwares" , color=sns.color_palette("rocket_r", 8))

### Observation 
* Most participants have never used specialized hardware for computing.
* Those who have mostly prefer NVIDIA GPUs.

In [None]:
simple_pieChart(df , "Q13" , ques , xlabel="Number of times TPU used")

### Observation 
* Most participants have never used a TPU.
* Only 7% of survey participants have used a TPU more than 6 times.

In [None]:
ans = columnRange_barplot(df , "Q14" , ques , xlabel="Common Programming tools and libraries" , color=sns.color_palette("viridis", 8))

### Observation 
* Matplotlib and Seaborn are the most used plotting libraries for data science, followed by Plotly.

In [None]:
simple_pieChart(df , "Q15" , ques , xlabel="Experience in Machine Learning")

### Observation 
* 56 percent of survey participants are newcomers with less than 2 years of experience; this is because many young people participated in this survey.

In [None]:
mlframeworks = columnRange_barplot(df , "Q16" , ques , xlabel="Common ML Frameworks" , color=sns.color_palette("rocket", 8))

In [None]:
mlAlgorithms = columnRange_barplot(df , "Q17" , ques , xlabel="Common ML Algorithms" , color=sns.color_palette("viridis", 8))

### Observation 
* Scikit-learn, TensorFlow and Keras are popular machine learning frameworks.
* Linear or Logistic Regression and Decision Trees are the most used ML algorithms.

In [None]:
ans = columnRange_barplot(df , "Q18" , ques , xlabel="Common Computer Vision Methods" , color=sns.color_palette("rocket_r", 8) , rotation=90)

## Observation 
* Image classification algorithms are quite popular among the participants, followed by image segmentation, object detection and word embedding.

In [None]:
ans = columnRange_barplot(df , "Q19" , ques , xlabel="Common NLP Methods" , color=sns.color_palette("viridis", 8))

## Observation 
* The most commonly used NLP algorithms among participants are Computer Technology, word embeddings and Transformer language models.

In [None]:
simple_barplot(df , "Q21" , ques , xlabel="Employees Count" , color=sns.color_palette("rocket_r", 8) , rotation=45)

## Observations
* 20 percent of survey participants belong to companies that currently have 0-49 employees.
* So we can say that small companies are adopting data science more than others.

In [None]:
simple_barplot(df , "Q25" , ques , xlabel="Current CTC ($)" , color=sns.color_palette("rocket_r", 8) , rotation=90)

## Observation 
* As data science is a new and in-demand field, companies pay their employees good salaries in appreciation.

# Conclusion
First of all, I want to thank Kaggle for launching this competition through which I did this work.
The EDA focused on understanding the Kaggle and Data Science Community. While spread worldwide, we saw a predominance of India and USA participants. We also learned that most participants hold at least a Master's degree, and that formal education does pose a weight on the future role.

The USA is where most doctorates reside, even though Switzerland has a higher percentage of doctorates considering its number of respondents. The numbers on gender and diversity were not exciting and have a similar trend across the diverse set of countries. However, we did see a higher percentage of participation of women and other minority groups in younger ages, providing hope that the tide is lifting everyone.

The relevant participation of third world countries, such as Brazil and Nigeria, also shows how technology can connect us all and make such countries more competitive. Fortunately, Data Science only requires passion and a decent internet connection. The amount of material and support of the DS community is great and makes it a great example of a career without boundaries.