In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the data

In [None]:
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the survey responses into `data` and preview the first ten rows.
# NOTE(review): Kaggle survey exports usually carry the question text as the
# first data row -- confirm against the preview below before counting values.
survey_csv = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
data = pd.read_csv(survey_csv)
data.head(10)

## Participating Countries

In [None]:
# Number of distinct non-null answers to the country question (Q3)
country_total = len(data['Q3'].value_counts())
print("There are {} countries took part in this survey.".format(country_total))

In [None]:
# Vertical bar chart of the number of respondents per country answer (Q3)
country_counts = data['Q3'].value_counts()
country_counts.plot.bar(figsize=(20, 8))



Of the 67 countries that participated, India has the most respondents.

The chart above shows that more than 7000 records come from India, the country we are interested in, so let's create a new dataset that contains only the responses from India.

# Gender Distribution in India

In [None]:
# Keep only the rows whose country answer (Q3) is India; every chart below uses this subset
data_india = data[data['Q3'] == 'India']

# Number of India respondents per gender answer (Q2)
gender_count = data_india['Q2'].value_counts()

# Set the seaborn style before creating the axes: the style only affects axes
# created afterwards, and this is the first chart that uses it.
sns.set_style('dark')
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis, drawn on the axes created above
axes = sns.barplot(x=gender_count, y=gender_count.index, edgecolor="black", ax=axes)
axes.set_title("Gender Distribution of India", fontsize=15)
plt.show()

From the plot above, we can see that there are more men in the industry than women. According to the given data, only 22.27% of the respondents to the survey are women.

# Education in India

In [None]:
# Education answer (Q4): number of India respondents per answer, shown as a bar chart
education_count = data_india["Q4"].value_counts()

fig, axes = plt.subplots(figsize=(9, 6))
sns.set_style("dark")
axes = sns.barplot(
    x=education_count,
    y=education_count.index,
    edgecolor="black",
    ax=axes,
)
axes.set_title("Education Qualification in India", fontsize=15)
plt.show()

We can see that almost 53% of the respondents have a Bachelor's degree and 32% have a Master's degree.

# Age Distribution in India

In [None]:
# Age answer (Q1): number of India respondents per answer, shown as a bar chart
age_count = data_india["Q1"].value_counts()

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(9, 6))
sns.set_style("dark")
axes = sns.barplot(x=age_count, y=age_count.index, edgecolor="black", ax=axes)
axes.set_title("Age Distribution in India", fontsize=15)
plt.show()

Around 77% of the respondents are below the age of 30.

# Current Role/Title in Workplace

In [None]:
# Current role/title answer (Q5): number of India respondents per answer
current_count = data_india["Q5"].value_counts()

fig, axes = plt.subplots(figsize=(9, 6))
sns.set_style("dark")
axes = sns.barplot(
    x=current_count,
    y=current_count.index,
    edgecolor="black",
    ax=axes,
)
axes.set_title("Current Role/Title in the Workplace", fontsize=15)
plt.show()

17% of the respondents work as Data Scientists and Machine Learning Engineers. 10% of the respondents work as Software Engineers.

# Experience in Coding

In [None]:
# Coding experience answer (Q6): number of India respondents per answer
coding_count = data_india['Q6'].value_counts()

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(9, 6))
sns.set_style('dark')
axes = sns.barplot(x=coding_count, y=coding_count.index, edgecolor='black', ax=axes)
axes.set_title('Experience in Coding', fontsize=15)
plt.show()

From the graph above, we can see that almost 65% of the respondents have 1-3 years of coding experience. We can also see that almost 30% of the respondents have less than a year of experience.

# Programming Language used on a Daily Basis

In [None]:
# Programming languages used regularly: the answers are spread over the columns
# Q7_Part_1 ... Q7_Part_12. The value counts of every column are merged into one
# label -> count mapping (a label seen in several columns keeps the last count).
# NOTE(review): only the numbered Part columns are read; if the data also has a
# free-text "other" column for this question it is not counted -- confirm.
programming_counts = {}
for part in range(1, 13):
    programming_counts.update(dict(data_india[f"Q7_Part_{part}"].value_counts()))
programming = pd.DataFrame(list(programming_counts.items()), columns=["Languages", "Count"])

# The figure must exist before plotting: calling plt.figure() after sns.barplot()
# opens a second, empty figure and leaves the chart itself at the default size.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 8))
p = sns.barplot(x="Count", y="Languages", data=programming, edgecolor="black", ax=ax)
p.set_title("Programming Language used on a Daily Basis", fontsize=15)
plt.show()

We can see that more than 6000 respondents use Python on a regular basis.

# Recommended Programming Language

In [None]:
# Recommended language answer (Q8): number of India respondents per answer
recommended_language_count = data_india['Q8'].value_counts()

fig, axes = plt.subplots(figsize=(9, 6))
sns.set_style('dark')
axes = sns.barplot(
    x=recommended_language_count,
    y=recommended_language_count.index,
    edgecolor='black',
    ax=axes,
)
axes.set_title('Most Recommended Language', fontsize=15)
plt.show()

Since most of the professionals use Python on a regular basis, we can conclude that the most recommended language is Python.

# IDE used on a Regular Basis

In [None]:
# IDEs used regularly: merge the value counts of the columns Q9_Part_1 ... Q9_Part_12
# into one label -> count mapping, then turn it into a two-column table.
ide_counts = {}
for part in range(1, 13):
    ide_counts.update(dict(data_india[f"Q9_Part_{part}"].value_counts()))
ide = pd.DataFrame(list(ide_counts.items()), columns=["IDE", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="IDE", data=ide, edgecolor="black", ax=ax)
p.set_title("IDE used on a Regular Basis", fontsize=15)
plt.show()

Jupyter and VSCode are the most widely used IDEs on a regular basis.

# Usage of Notebooks used on a Regular Basis

In [None]:
# Hosted notebooks used regularly: merge the value counts of the columns
# Q10_Part_1 ... Q10_Part_15 into one label -> count mapping.
hosted_notebook_counts = {}
for part in range(1, 16):
    hosted_notebook_counts.update(dict(data_india[f"Q10_Part_{part}"].value_counts()))
hosted_notebooks = pd.DataFrame(
    list(hosted_notebook_counts.items()), columns=["Hosted_Notebook", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Hosted_Notebook", data=hosted_notebooks, edgecolor="black", ax=ax)
p.set_title("Usage of Notebooks used on a Regular Basis", fontsize=15)
plt.show()

Colab and Kaggle notebooks are popularly used.

# Usage of Specialised Hardware on a Regular Basis

In [None]:
# Specialised hardware used regularly: merge the value counts of the columns
# Q12_Part_1 ... Q12_Part_4 into one label -> count mapping.
hardware_counts = {}
for part in range(1, 5):
    hardware_counts.update(dict(data_india[f"Q12_Part_{part}"].value_counts()))
hardware = pd.DataFrame(list(hardware_counts.items()), columns=["Specialised_Hardware", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Specialised_Hardware", data=hardware, edgecolor="black", ax=ax)
p.set_title("Usage of Specialised Hardware on a Regular Basis", fontsize=15)
plt.show()

NVIDIA GPU is used on a regular basis.

# Usage of TPU 

In [None]:
# TPU usage answer (Q13): number of India respondents per answer
tpu_count = data_india['Q13'].value_counts()

fig, axes = plt.subplots(figsize=(9, 6))
sns.set_style('dark')
axes = sns.barplot(
    x=tpu_count,
    y=tpu_count.index,
    edgecolor='black',
    ax=axes,
)
axes.set_title('Usage of TPU', fontsize=15)
plt.show()

 More than half of the respondents (>4500) have never used any TPU.

# Usage of Data Visualizations Libraries/Tools on a Regular Basis

In [None]:
# Visualization libraries/tools used regularly: merge the value counts of the
# columns Q14_Part_1 ... Q14_Part_11 into one label -> count mapping.
visualization_counts = {}
for part in range(1, 12):
    visualization_counts.update(dict(data_india[f"Q14_Part_{part}"].value_counts()))
visualization = pd.DataFrame(
    list(visualization_counts.items()), columns=["Visualization_Tool_Libraries", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(
    x="Count", y="Visualization_Tool_Libraries", data=visualization, edgecolor="black", ax=ax
)
p.set_title("Usage of Data Visualizations Libraries/Tools on a Regular Basis", fontsize=15)
plt.show()

Matplotlib and seaborn are the most popular visualization libraries used.

# Experience in the Usage of ML Methods

In [None]:
# Experience with ML methods (Q15): number of India respondents per answer
ml_method_count = data_india["Q15"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=ml_method_count, y=ml_method_count.index, edgecolor="black", ax=axes)
axes.set_title("Experience in the Usage of ML Methods", fontsize=15)
plt.show()

Half of the respondents have less than 2 years of experience in using ML methods, with the majority of the respondents having less than a year of experience.

# Usage of ML Framework on a Regular Basis

In [None]:
# ML frameworks used regularly: merge the value counts of the columns
# Q16_Part_1 ... Q16_Part_17 into one label -> count mapping.
ml_framework_counts = {}
for part in range(1, 18):
    ml_framework_counts.update(dict(data_india[f"Q16_Part_{part}"].value_counts()))
ml_framework = pd.DataFrame(list(ml_framework_counts.items()), columns=["ML_Framework", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="ML_Framework", data=ml_framework, edgecolor="black", ax=ax)
p.set_title("Usage of ML Framework on a Regular Basis", fontsize=15)
plt.show()

Popular ML frameworks are Scikit-Learn, Tensorflow, Keras, XGBoost and PyTorch.

# Usage of ML Algorithms on a Regular Basis

In [None]:
# ML algorithms used regularly: merge the value counts of the columns
# Q17_Part_1 ... Q17_Part_11 into one label -> count mapping.
ml_algorithm_counts = {}
for part in range(1, 12):
    ml_algorithm_counts.update(dict(data_india[f"Q17_Part_{part}"].value_counts()))
ml_algorithm = pd.DataFrame(list(ml_algorithm_counts.items()), columns=["ML_Algorithm", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="ML_Algorithm", data=ml_algorithm, edgecolor="black", ax=ax)
p.set_title("Usage of ML Algorithms on a Regular Basis", fontsize=15)
plt.show()

Popular ML algorithms are linear/logistic regression, decision trees, gradient boosting machines, CNN.

# Usage of Computer Vision Methods on a Regular Basis

In [None]:
# Computer vision methods used regularly: merge the value counts of the columns
# Q18_Part_1 ... Q18_Part_6 into one label -> count mapping.
cv_method_counts = {}
for part in range(1, 7):
    cv_method_counts.update(dict(data_india[f"Q18_Part_{part}"].value_counts()))
cv_methods = pd.DataFrame(list(cv_method_counts.items()), columns=["CV_Methods", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="CV_Methods", data=cv_methods, edgecolor="black", ax=ax)
p.set_title("Usage of Computer Vision Methods on a Regular Basis", fontsize=15)
plt.show()

# Usage of NLP Methods on a Regular Basis

In [None]:
# NLP methods used regularly: merge the value counts of the columns
# Q19_Part_1 ... Q19_Part_5 into one label -> count mapping.
nlp_method_counts = {}
for part in range(1, 6):
    nlp_method_counts.update(dict(data_india[f"Q19_Part_{part}"].value_counts()))
nlp_methods = pd.DataFrame(list(nlp_method_counts.items()), columns=["NLP_Methods", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="NLP_Methods", data=nlp_methods, edgecolor="black", ax=ax)
p.set_title("Usage of NLP Methods on a Regular Basis", fontsize=15)
plt.show()

# Which industry are you from?

In [None]:
# Industry answer (Q20): number of India respondents per answer
industry_count = data_india["Q20"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=industry_count, y=industry_count.index, edgecolor="black", ax=axes)
axes.set_title("Which industry are you from?", fontsize=15)
plt.show()

# Incorporation of ML in Business

In [None]:
# Incorporation of ML in business (Q23): number of India respondents per answer
incorporate_count = data_india["Q23"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=incorporate_count, y=incorporate_count.index, edgecolor="black", ax=axes)
axes.set_title("Incorporation of ML in Business", fontsize=15)
plt.show()

From the graph above, we can see that more than 1900 respondents either do not know or are not using ML methods at all, of which almost 700 respondents are still exploring ML methods.

In [None]:
# Important parts of the respondents' work: merge the value counts of the columns
# Q24_Part_1 ... Q24_Part_7 into one label -> count mapping.
important_work_counts = {}
for part in range(1, 8):
    important_work_counts.update(dict(data_india[f"Q24_Part_{part}"].value_counts()))
important_work = pd.DataFrame(
    list(important_work_counts.items()), columns=["Important_Part_Work", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Important_Part_Work", data=important_work, edgecolor="black", ax=ax)
p.set_title("What's the important part of your work?", fontsize=15)
plt.show()

Most of the respondents use ML methods to understand the data so that they can make better business decisions.

# Yearly Compensation in USD

In [None]:
# Yearly compensation answer (Q25): number of India respondents per answer
compensation_count = data_india["Q25"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=compensation_count, y=compensation_count.index, edgecolor="black", ax=axes)
axes.set_title("Yearly Compensation", fontsize=15)
plt.show()

# Cloud Computing Platforms used on a Regular Basis

In [None]:
# Cloud computing platforms used regularly: merge the value counts of the columns
# Q27_A_Part_1 ... Q27_A_Part_11 into one label -> count mapping.
cc_platform_counts = {}
for part in range(1, 12):
    cc_platform_counts.update(dict(data_india[f"Q27_A_Part_{part}"].value_counts()))
cc_platforms = pd.DataFrame(
    list(cc_platform_counts.items()), columns=["Cloud_Computing_Platforms", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(
    x="Count", y="Cloud_Computing_Platforms", data=cc_platforms, edgecolor="black", ax=ax
)
p.set_title("Cloud Computing Platforms used on a Regular Basis", fontsize=15)
plt.show()

AWS, GCP and Azure are the most widely used Cloud Computing Platforms on a regular basis.

# Recommended Cloud Platform 

In [None]:
# Recommended cloud platform answer (Q28): number of India respondents per answer
cloud_count = data_india["Q28"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=cloud_count, y=cloud_count.index, edgecolor="black", ax=axes)
axes.set_title("Recommended Cloud Platform (based on developer experience)", fontsize=15)
plt.show()

AWS is the most recommended cloud computing platform.

# Usage of Data Storage Products on a Regular Basis

In [None]:
# Data storage products used regularly: merge the value counts of the columns
# Q30_A_Part_1 ... Q30_A_Part_7 into one label -> count mapping.
storage_product_counts = {}
for part in range(1, 8):
    storage_product_counts.update(dict(data_india[f"Q30_A_Part_{part}"].value_counts()))
storage_products = pd.DataFrame(
    list(storage_product_counts.items()), columns=["Data_Storage_Products", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(
    x="Count", y="Data_Storage_Products", data=storage_products, edgecolor="black", ax=ax
)
p.set_title("Usage of Data Storage Products used on a Regular Basis", fontsize=15)
plt.show()

# Usage of ML Products on a Regular Basis

In [None]:
# ML products used regularly: merge the value counts of the columns
# Q31_A_Part_1 ... Q31_A_Part_9 into one label -> count mapping.
ml_product_counts = {}
for part in range(1, 10):
    ml_product_counts.update(dict(data_india[f"Q31_A_Part_{part}"].value_counts()))
ml_products = pd.DataFrame(list(ml_product_counts.items()), columns=["ML_Products", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="ML_Products", data=ml_products, edgecolor="black", ax=ax)
p.set_title("Usage of ML Products used on a Regular Basis", fontsize=15)
plt.show()

# Usage of Big Data Products on a Regular Basis

In [None]:
# Big data products used regularly: merge the value counts of the columns
# Q32_A_Part_1 ... Q32_A_Part_20 into one label -> count mapping.
big_data_product_counts = {}
for part in range(1, 21):
    big_data_product_counts.update(dict(data_india[f"Q32_A_Part_{part}"].value_counts()))
big_data_products = pd.DataFrame(
    list(big_data_product_counts.items()), columns=["Big_Data_Products", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Big_Data_Products", data=big_data_products, edgecolor="black", ax=ax)
p.set_title("Usage of Big Data Products used on a Regular Basis", fontsize=15)
plt.show()

MySQL, PostgreSQL, MongoDB are used on a regular basis.

# Recommended Big Data Products

In [None]:
# Recommended big data product answer (Q33): number of India respondents per answer
big_data_count = data_india["Q33"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=big_data_count, y=big_data_count.index, edgecolor="black", ax=axes)
axes.set_title("Recommended Big Data Products", fontsize=15)
plt.show()

Most recommended big data product is MySQL.

# Usage of BI tools on a Regular Basis

In [None]:
# BI tools used regularly: merge the value counts of the columns
# Q34_A_Part_1 ... Q34_A_Part_16 into one label -> count mapping.
bi_tool_counts = {}
for part in range(1, 17):
    bi_tool_counts.update(dict(data_india[f"Q34_A_Part_{part}"].value_counts()))
bi_tools = pd.DataFrame(list(bi_tool_counts.items()), columns=["BI_Tools", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="BI_Tools", data=bi_tools, edgecolor="black", ax=ax)
p.set_title("Usage of BI tools used on a Regular Basis", fontsize=15)
plt.show()

Tableau and PowerBI are the most widely used BI tools.

# Usage of Automated/Partial ML Tools on a Regular Basis

In [None]:
# Automated/partial ML tools used regularly: merge the value counts of the columns
# Q36_A_Part_1 ... Q36_A_Part_6 into one label -> count mapping.
automated_ml_tool_counts = {}
for part in range(1, 7):
    automated_ml_tool_counts.update(dict(data_india[f"Q36_A_Part_{part}"].value_counts()))
automated_ml_tools = pd.DataFrame(
    list(automated_ml_tool_counts.items()), columns=["Automated/Partial_ML_Tools", "Count"]
)

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(
    x="Count", y="Automated/Partial_ML_Tools", data=automated_ml_tools, edgecolor="black", ax=ax
)
p.set_title("Usage of Automated/Partial ML Tools used on a Regular Basis", fontsize=15)
plt.show()

# Public Access of your Data Analysis/ML application

In [None]:
# Where respondents share their work publicly: merge the value counts of the
# columns Q39_Part_1 ... Q39_Part_9 into one label -> count mapping.
public_access_counts = {}
for part in range(1, 10):
    public_access_counts.update(dict(data_india[f"Q39_Part_{part}"].value_counts()))
public_access = pd.DataFrame(list(public_access_counts.items()), columns=["Public_Access", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Public_Access", data=public_access, edgecolor="black", ax=ax)
p.set_title("Public Access of your Data Analysis/ML application", fontsize=15)
plt.show()

GitHub and Kaggle are the platforms where respondents share their work related to data analysis and ML application.

# Data Science Course Platform

In [None]:
# Data science course platforms: merge the value counts of the columns
# Q40_Part_1 ... Q40_Part_11 into one label -> count mapping.
course_counts = {}
for part in range(1, 12):
    course_counts.update(dict(data_india[f"Q40_Part_{part}"].value_counts()))
course = pd.DataFrame(list(course_counts.items()), columns=["Course", "Count"])

# Create the sized figure first; plt.figure() after the plot would open a new,
# empty figure instead of resizing the chart.
sns.set_style("dark")
fig, ax = plt.subplots(figsize=(9, 6))
p = sns.barplot(x="Count", y="Course", data=course, edgecolor="black", ax=ax)
p.set_title("Data Science Course", fontsize=15)
plt.show()

# Amount spent on Learning ML/Cloud Computing

In [None]:
# Amount spent on learning ML/cloud computing (Q26): number of India respondents per answer
amount_count = data_india["Q26"].value_counts()

# Style first, so it applies to the axes created on the next line
sns.set_style("dark")
fig, axes = plt.subplots(figsize=(9, 6))
# Counts on the x-axis, answer labels on the y-axis
axes = sns.barplot(x=amount_count, y=amount_count.index, edgecolor="black", ax=axes)
axes.set_title("Amount spent on Learning ML/Cloud Computing", fontsize=15)
plt.show()

## Conclusion

It is safe to say that Data Science and Machine Learning are growing extremely fast in India.
I hope you have enjoyed reading through and please let me know where I can improve. 
Thank you.