<h1>2021 Kaggle Survey Dashboard</h1>
<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/23724/logos/header.png?t=2020-10-31-23-22-58" style="width : 100%;margin : auto;">

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
!pip install pycountry-convert
from pycountry_convert import country_name_to_country_alpha2,country_alpha2_to_country_name

<h1>Data Loading</h1>

In [None]:
# Load the raw survey export. The rest of the notebook treats row 0 as the question-text
# row and rows 1+ as respondents.
# low_memory=False reads each column in one pass, which avoids pandas' mixed-dtype
# warning on columns that hold both text and numbers.
data = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv", low_memory=False)
# Preview the first rows only instead of rendering the whole frame.
data.head()

<h1>Visualisations with Insights</h1>

In [None]:
# Summary statistics for the first column, skipping the question-text row.
# (Presumably the completion time in seconds - confirm against the column header.)
pd.set_option('display.float', str)  # render floats via str() rather than the default format
first_column = data.iloc[1:, 0]
first_column.astype('int32').describe()

<p><b>Inference : </b>15-20 minutes was taken by majority of the people to complete the survey.</p>

<h2>Geospatial Analysis</h2>

In [None]:
# Resolve every country named in Q3 to its ISO alpha-2 code and a latitude/longitude.
# `countries`, `lat` and `lon` are parallel lists consumed by the map cell below.
countries = []
lat = []
lon = []

# Row 0 holds the question text, so it is skipped.
un, counts = np.unique(data.Q3.values[1:],return_counts=True)

# Keep the coordinate table under its own name so the per-country `counts` is not clobbered.
locations = pd.read_csv("../input/locations/world_country_and_usa_states_latitude_and_longitude_values.csv")
# First row per country code, matching the previous `.values[0]` lookup.
coordinates = locations.drop_duplicates("country_code").set_index("country_code")

for country_name in un:
    try:
        code = country_name_to_country_alpha2(country_name)
    except KeyError:
        # Answer is not a country name pycountry_convert recognises: leave it off the map.
        continue
    if code not in coordinates.index:
        # No coordinates for this code; skip it instead of raising IndexError and
        # keep the three lists aligned.
        continue
    countries.append(code)
    lat.append(coordinates.at[code, "latitude"])
    lon.append(coordinates.at[code, "longitude"])

In [None]:
# Draw one clustered circle marker per resolved country on an interactive world map.
import folium
from folium.plugins import MarkerCluster

world_map = folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
radius = 5
for code, latitude, longitude in zip(countries, lat, lon):
    # Entries whose latitude is exactly 0 are not drawn.
    if latitude != 0:
        popup_text = """Country : {}""".format(country_alpha2_to_country_name(code))
        marker = folium.CircleMarker(
            location=[latitude, longitude],
            radius=radius,
            popup=popup_text,
            fill=True,
        )
        marker.add_to(marker_cluster)
world_map

<p><b>Inference : </b>The majority of participants are from <b>North and South America</b>, <b>Europe</b> and <b>Asia</b>.</p>

In [None]:
# Demographic overview: age (Q1), gender (Q2), education (Q4), coding experience (Q6)
# and profession (Q5). Row 0 of every column is the question text, hence the [1:] slices.

# Global styling picked up by every later chart: no axes border, light-grey plot area.
plt.rcParams["axes.linewidth"]  = 0
plt.rcParams.update({'axes.facecolor':'whitesmoke'})

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']

# Shorten the long answer texts so they fit as chart labels. The result is assigned back
# to the column: the previous chained `data.Q4.replace(..., inplace=True)` form is
# deprecated and does not update `data` when pandas Copy-on-Write is active.
data["Q4"] = data["Q4"].replace({
    "Bachelor’s degree": "BS",
    "Doctoral degree": "Phd",
    "Master’s degree": "MS",
    "I prefer not to answer": "No Answer",
    "No formal education past high school": "High School",
    "Some college/university study without earning a bachelor’s degree": "College",
})
data["Q6"] = data["Q6"].replace("I have never written code", "Never")

# --- Figure 1: age group and gender ---
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
unique,counts = np.unique(data.Q1[1:],return_counts=True)
plt.title("Age group")
plt.xlabel("Age group")
plt.ylabel("Count")
plt.bar(unique,counts,zorder=3,color=palette)
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='black', linestyle='solid', linewidth=0.1, alpha=0.5,zorder=0)

plt.subplot(1,2,2)
unique,counts = np.unique(data.Q2[1:],return_counts=True)
plt.title("Gender")
plt.ylabel("Gender")
plt.xlabel("Count")
plt.barh(unique,counts,zorder=3,color=palette[:5])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.tight_layout()

# --- Figure 2: education level and coding experience ---
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
unique,counts = np.unique(data.Q4[1:],return_counts=True)
plt.title("Education")
plt.xlabel("Level")
plt.pie(counts,labels=unique)
plt.legend(loc="lower right",bbox_to_anchor=(0,1))

plt.subplot(1,2,2)
unique,counts = np.unique(data.Q6[1:],return_counts=True)
plt.title("Coding Experience")
plt.ylabel("Years")
plt.xlabel("Count")
plt.barh(unique,counts,zorder=3,color=palette[:7])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.tight_layout()

# --- Figure 3: profession ---
plt.figure(figsize=(15,6))
unique,counts = np.unique(data.Q5[1:],return_counts=True)
plt.title("Profession")
plt.ylabel("Type")
plt.xlabel("Count")
plt.barh(unique,counts,zorder=3,color=palette)
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)


<p style="text-align : justify;"><b>Inference : </b>People in the <b>18-30</b> age group are the most represented on Kaggle, and participation decreases from there onwards. The majority of people working on Kaggle are pursuing either a <b>BS</b>, <b>MS</b> or <b>Phd</b>, which shows that students and researchers are greater in number. Coders with <b>1-5</b> years of experience use Kaggle more than experienced people (<b>5+ years</b>). As observed before, <b>Students</b> are the major users of Kaggle in comparison with industry experts. Among professionals, <b>Data Scientists</b>, <b>Data Analysts</b> and <b>Software Engineers</b> are the major users of Kaggle.</p>

<h2>Question 7 - What programming languages do you use on a regular basis? (Select all that apply)</h2>

In [None]:
# Q7 is multi-select: one column per option, Q7_Part_1 through Q7_OTHER.
languages_used = data.loc[:, "Q7_Part_1":"Q7_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
languages = [languages_used[col].iloc[1:].dropna().to_numpy() for col in languages_used.columns]

In [None]:
# Flatten the per-option answer lists, trim surrounding whitespace, then tally each label.
languages, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(languages)],
    return_counts=True,
)

In [None]:
# Bar chart of how many respondents selected each programming language.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(languages, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
# Grid sits behind the bars (zorder 0 < 3).
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Programming Language used on Regular basis", xlabel="Language", ylabel="Count")
plt.show()

<p><b>Inference : </b>Majority of people on Kaggle use <b>Python</b> and <b>SQL</b> on a regular basis along with <b>R</b> in a good amount. This aligns with the fact from previous visualisation that a large number of people, including students on Kaggle belong to Data Science and Software Engineering Field.</p>

<h2>Question 8 - What programming language would you recommend an aspiring data scientist to learn first?</h2>

In [None]:
# Tally the single-choice Q8 answers (nulls dropped, question-text row skipped) and plot them.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
languages_preferred, counts = np.unique(data["Q8"].dropna().to_numpy()[1:], return_counts=True)

fig, ax = plt.subplots(figsize=(6,8))
ax.set(title="Recommended Languages for Aspiring Data Scientists", ylabel="Language", xlabel="Count")
ax.barh(languages_preferred, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.show()

<p><b>Inference : </b>As heard usually, <b>Python</b>, <b>R</b> and <b>SQL</b> are the most recommended languages for an aspiring Data Scientist.</p>

<h2>Question 9 - Which of the following integrated development environments (IDE's) do you use on a regular basis?</h2>

In [None]:
# Q9 is multi-select: one column per option, Q9_Part_1 through Q9_OTHER.
ide_used = data.loc[:, "Q9_Part_1":"Q9_OTHER"]
ides = []
for _, answers in ide_used.items():
    # Row 0 is the question text; its third "-"-separated piece is added as a label entry.
    # NOTE(review): this also adds one to that label's count, and the index-based
    # clean-up cell further down relies on the resulting label order - keep them in sync.
    header_label = answers[0].split("-")[2]
    ides.append([header_label])
    # Remaining rows: the non-null responses for this option.
    ides.append(answers[1:].dropna().values)

In [None]:
# Flatten the collected entries, trim whitespace and count each distinct IDE label.
ides, counts = np.unique(
    [entry.strip() for entry in itertools.chain.from_iterable(ides)],
    return_counts=True,
)

In [None]:
# Manual clean-up of the sorted label/count arrays from np.unique.
# NOTE(review): positions are hard-coded, so this only holds for the exact label set in
# the data, and the cell must run exactly once after the previous cell (re-running it
# drops another entry).
# Drop the first label and rename the entry that is now first to "Jupyter"
# (presumably both were Jupyter variants - TODO confirm against the data).
ides = ides[1:]
ides[0] = "Jupyter"
# Shorten the last label (presumably the Visual Studio Code entry - verify).
ides[-1] = "VS Code"
# Fold the dropped label's count into its successor, then drop the first count slot.
counts[1] += counts[0]
counts = counts[1:]

In [None]:
# Bar chart of IDE usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(18,5))
ax.bar(ides, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="IDEs used on Regular basis", xlabel="IDE", ylabel="Count")
plt.show()

<p><b>Inference : </b><b>Jupyter</b>, <b>PyCharm</b> and <b>VS Code</b> are the most commonly used IDEs since they support Notebook Environments.</p>

<h2>Question 10 - Which of the following hosted notebook products do you use on a regular basis?</h2>

In [None]:
# Q10 is multi-select: one column per option, Q10_Part_1 through Q10_OTHER.
notebooks_used = data.loc[:, "Q10_Part_1":"Q10_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
notebooks = [notebooks_used[col].iloc[1:].dropna().to_numpy() for col in notebooks_used.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each notebook product.
notebooks, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(notebooks)],
    return_counts=True,
)

In [None]:
# Horizontal bar chart of hosted-notebook product usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(10,7))
ax.barh(notebooks, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Notebook Products used on Regular basis", ylabel="Product", xlabel="Count")
plt.show()

<p><b>Inference : </b>Although a good number of people do not use any hosted notebook IDE on a regular basis, <b>Kaggle Notebooks</b> and <b>Colab Notebooks</b> are the most commonly used among the people who do use hosted IDEs.</p>

<h2>Question 11 - What type of computing platform do you use most often for your data science projects?</h2>

In [None]:
# Tally Q11 answers: drop nulls, skip the leading question-text entry, count each label.
platform, counts = np.unique(data["Q11"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Pie chart of the computing platform respondents use most often.
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(counts, labels=platform)
ax.set(title="Platform used for Data Science Projects", ylabel="Platform", xlabel="Count")
# Anchor the legend's lower-right corner to the axes' top-left corner, outside the pie.
ax.legend(loc="lower right", bbox_to_anchor=(0,1))
plt.show()

<p><b>Inference : </b>Majority of the population uses a PC (Laptop / Desktop), few use <b>Cloud Platforms</b> as well (maybe for higher workloads or in competitions).</p>

<h1>Question 12 - Which types of specialized hardware do you use on a regular basis?</h1>

In [None]:
# Q12 is multi-select: one column per option, Q12_Part_1 through Q12_OTHER.
hardware_used = data.loc[:, "Q12_Part_1":"Q12_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
hardwares = [hardware_used[col].iloc[1:].dropna().to_numpy() for col in hardware_used.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each hardware type.
hardwares, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(hardwares)],
    return_counts=True,
)

In [None]:
# Bar chart of specialised-hardware usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua']
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(hardwares, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Specialized hardware used on Regular basis", xlabel="Hardware", ylabel="Count")
plt.show()

<p><b>Inference : </b>The majority of users do not use any accelerators (CPU only); <b>GPUs</b> and <b>TPUs</b> are popular among the people who do use accelerators.</p>

<h1>Question 13 - Approximately how many times have you used a TPU (tensor processing unit)?</h1>

In [None]:
# Tally Q13 answers: drop nulls, skip the leading question-text entry, count each label.
tpu, counts = np.unique(data["Q13"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Bar chart of how often respondents have used a TPU.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey']
fig, ax = plt.subplots(figsize=(10,7))
ax.bar(tpu, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Usage of TPU", xlabel="Frequency", ylabel="Count")
plt.show()

<p><b>Inference : </b>TPU is used almost never or really less in number.</p>

<h1>Question 14 - What data visualization libraries or tools do you use on a regular basis?</h1>

In [None]:
# Q14 is multi-select: one column per option, Q14_Part_1 through Q14_OTHER.
vis_used = data.loc[:, "Q14_Part_1":"Q14_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
tools = [vis_used[col].iloc[1:].dropna().to_numpy() for col in vis_used.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each visualisation tool.
tools, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(tools)],
    return_counts=True,
)

In [None]:
# Horizontal bar chart of visualisation-tool usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Visualization tools used on Regular basis", ylabel="Tool", xlabel="Count")
plt.show()

<p><b>Inference : </b><b>Seaborn</b>, <b>Matplotlib</b>, <b>Plotly</b> and <b>Ggplot</b> are the most commonly used Visualization Libraries with <b>Matplotlib</b> as the most used followed by <b>Seaborn</b>.</p>

<h1>Question 15 - For how many years have you used machine learning methods?</h1>

In [None]:
# Tally Q15 answers: drop nulls, skip the leading question-text entry, count each label.
years, counts = np.unique(data["Q15"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Horizontal bar chart of years spent using machine-learning methods.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,7))
ax.barh(years, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Years of using Machine Learning Methods", ylabel="Experience", xlabel="Count")
plt.show()

<p><b>Inference : </b>People who have not much experience with ML Methods are more, pointing to the fact that they are either students or fresh in career.</p>

<h1>Question 16 - Which of the following machine learning frameworks do you use on a regular basis?</h1>

In [None]:
# Q16 is multi-select: one column per option, Q16_Part_1 through Q16_OTHER.
frameworks_used = data.loc[:, "Q16_Part_1":"Q16_OTHER"]
frameworks = []
for _, answers in frameworks_used.items():
    # Row 0 is the question text; its third "-"-separated piece is added as a label entry.
    # NOTE(review): a label that itself contains "-" is cut at that hyphen here; the
    # index-based merge cell further down appears to compensate - keep them in sync.
    header_label = answers[0].split("-")[2]
    frameworks.append([header_label])
    # Remaining rows: the non-null responses for this option.
    frameworks.append(answers[1:].dropna().values)

In [None]:
# Flatten the collected entries, trim whitespace and count each distinct framework label.
frameworks, counts = np.unique(
    [entry.strip() for entry in itertools.chain.from_iterable(frameworks)],
    return_counts=True,
)

In [None]:
# Merge entry 14 of the sorted labels into entry 15 and drop it.
# NOTE(review): presumably entry 14 is a hyphen-truncated header label of entry 15 -
# verify against the data. The positions are hard-coded, so this cell must run exactly
# once after the previous cell and breaks if the label set changes.
frameworks = np.delete(frameworks,14)
# counts still has its original length here, so 14/15 are the pre-deletion positions.
counts[15] = counts[15] + counts[14]
counts = np.delete(counts,14)

In [None]:
# Horizontal bar chart of machine-learning framework usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(frameworks, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# The title was the placeholder "1"; use a descriptive one in line with the other charts.
ax.set(title="ML Frameworks used on Regular basis", ylabel="Framework", xlabel="Count")
plt.show()

<p><b>Inference : </b><b>Xgboost</b>, <b>Scikit-learn</b>, <b>Tensorflow</b>, <b>Pytorch</b> and <b>Keras</b> are the most used ML Frameworks.</p>

<h1>Question 17 - Which of the following ML algorithms do you use on a regular basis? </h1>

In [None]:
# Q17 is multi-select: one column per option, Q17_Part_1 through Q17_OTHER.
algo_used = data.loc[:, "Q17_Part_1":"Q17_OTHER"]
algos = []
for _, answers in algo_used.items():
    # Row 0 is the question text; its third "-"-separated piece is added as a label entry.
    # NOTE(review): a label that itself contains "-" is cut at that hyphen here; the
    # merge step in the next cell appears to compensate - keep them in sync.
    header_label = answers[0].split("-")[2]
    algos.append([header_label])
    # Remaining rows: the non-null responses for this option.
    algos.append(answers[1:].dropna().values)

In [None]:
# Flatten the collected entries, trim whitespace and count each distinct algorithm label.
algos = [entry.strip() for entry in itertools.chain.from_iterable(algos)]
algos, counts = np.unique(algos,return_counts=True)

# Drop the second-to-last label (presumably a hyphen-truncated header label of the last
# option - verify against the data) and fold its count into the last entry.
# This used to be `counts[0] += counts[1]`, which added the second category's total to
# the first one while the dropped entry's count was simply discarded.
counts[-1] += counts[-2]
algos = np.delete(algos,len(algos) - 2)
counts = np.delete(counts,len(counts) -2)

In [None]:
# Horizontal bar chart of machine-learning algorithm usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(algos, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="ML Algorithms used on Regular Basis", ylabel="Algorithm", xlabel="Count")
plt.show()

<p><b>Inference : </b>Traditional ML Algorithms like <b>Regression</b>, <b>Decision Trees</b>, <b>Random Forests</b> and <b>Bayesian Models</b> are more used in comparison with <b>Neural Networks</b>.</p>

<h1>Question 18 - Which of the following Computer Vision Methods do you use on a regular basis?</h1>

In [None]:
# Q18 is multi-select: one column per option, Q18_Part_1 through Q18_OTHER.
computer_vision_methods_used = data.loc[:, "Q18_Part_1":"Q18_OTHER"]
methods = []
for _, answers in computer_vision_methods_used.items():
    # Row 0 is the question text; its third "-"-separated piece is added as a label entry.
    # NOTE(review): a label that itself contains "-" is cut at that hyphen here; the
    # index-based merge cell further down appears to compensate - keep them in sync.
    header_label = answers[0].split("-")[2]
    methods.append([header_label])
    # Remaining rows: the non-null responses for this option.
    methods.append(answers[1:].dropna().values)

In [None]:
# Flatten the collected entries, trim whitespace and count each computer-vision method label.
methods, counts = np.unique(
    [entry.strip() for entry in itertools.chain.from_iterable(methods)],
    return_counts=True,
)


In [None]:
# Merge entry 3 of the sorted labels into entry 4 and drop it.
# NOTE(review): presumably entry 3 is a hyphen-truncated header label of entry 4 -
# verify against the data. The positions are hard-coded, so this cell must run exactly
# once after the previous cell and breaks if the label set changes.
methods = np.delete(methods,3)
# counts still has its original length here, so 3/4 are the pre-deletion positions.
counts[4] = counts[4] + counts[3]
counts = np.delete(counts,3)

In [None]:
# Horizontal bar chart of computer-vision method usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(methods, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Computer Vision methods used on Regular Basis", ylabel="Method", xlabel="Count")
plt.show()

<p><b>Inference : </b><b>Image Classification</b> and <b>Object Detection</b> models like VGG, ResNet, YOLO etc are commonly used along with libraries like <b>PIL</b> and <b>CV2</b>. Image Classification stands ahead of everything.</p>

<h1>Question 19 - Which of the following natural language processing (NLP) methods do you use on a regular basis?</h1>

In [None]:
# Q19 is multi-select: one column per option, Q19_Part_1 through Q19_OTHER.
nlp_methods_used = data.loc[:, "Q19_Part_1":"Q19_OTHER"]
methods = []
for _, answers in nlp_methods_used.items():
    # Row 0 is the question text; its third "-"-separated piece is added as a label entry.
    # NOTE(review): a label that itself contains "-" is cut at that hyphen here; the
    # index-based merge cell further down appears to compensate - keep them in sync.
    header_label = answers[0].split("-")[2]
    methods.append([header_label])
    # Remaining rows: the non-null responses for this option.
    methods.append(answers[1:].dropna().values)

In [None]:
# Flatten the collected entries, trim whitespace and count each NLP method label.
methods, counts = np.unique(
    [entry.strip() for entry in itertools.chain.from_iterable(methods)],
    return_counts=True,
)

In [None]:
# Two index-based merges on the sorted labels; the higher index goes first so that the
# lower positions are still valid for the second merge.
# NOTE(review): presumably entries 5 and 1 are hyphen-truncated header labels of the
# entries right after them - verify against the data. The positions are hard-coded, so
# this cell must run exactly once after the previous cell.
# Merge entry 5 into entry 6, then drop it.
methods = np.delete(methods,5)
counts[6] = counts[6] + counts[5]
counts = np.delete(counts,5)

# Merge entry 1 into entry 2, then drop it.
methods = np.delete(methods,1)
counts[2] = counts[1] + counts[2]
counts = np.delete(counts,1)

In [None]:
# Horizontal bar chart of NLP method usage.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(methods, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="NLP methods used on Regular Basis", ylabel="Method", xlabel="Count")
plt.show()

<p><b>Inference : </b>Word Embedding Algorithms like <b>word2vec</b> etc are used more often. For modelling, pre-trained <b>Transformer Models</b> and <b>Encoder-Decoder models</b> are preferred.</p>

<h1>Question 20 - In what industry is your current employer/contract (or your most recent employer if retired)?</h1>

In [None]:
# Tally Q20 answers: drop nulls, skip the leading question-text entry, count each label.
employer_ind, counts = np.unique(data["Q20"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Horizontal bar chart of respondents per employer industry.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(employer_ind, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Employer Industry", ylabel="Industry", xlabel="Count")
plt.show()

<p><b>Inference : </b>As observed before as well, users majorly belong to <b>Technology</b>, <b>Academics</b> or <b>Education</b> field.</p>

<h1>Question 21 - What is the size of the company where you are employed?
</h1>

In [None]:
# Tally Q21 answers: drop nulls, skip the leading question-text entry, count each label.
employer_size = data.Q21.dropna().values[1:]
employer_size, counts = np.unique(employer_size,return_counts=True)

# np.unique sorts labels as text, so a bracket starting with "10" lands before one
# starting with "2". Re-order by the first number in each label (thousands separators
# removed) so the axis runs from the smallest to the largest bracket; labels without a
# number go last.
lower_bound = pd.to_numeric(
    pd.Series(employer_size, dtype="object").str.replace(",", "", regex=False).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)
order = np.argsort(lower_bound.fillna(np.inf).to_numpy(), kind="stable")
employer_size, counts = employer_size[order], counts[order]

In [None]:
# Horizontal bar chart of respondents per employer size bracket.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey']
fig, ax = plt.subplots(figsize=(5,9))
ax.barh(employer_size, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Employer Size", ylabel="Size", xlabel="Count")
plt.show()

<p><b>Inference : </b>Most people work in small-scale industries (with approx. 50 people), compared with the number of people working in Medium and Large scale industries.</p>

<h1>Question 22 - Approximately how many individuals are responsible for data science workloads at your place of business?</h1>

In [None]:
# Tally Q22 answers: drop nulls, skip the leading question-text entry, count each label.
data_science_count = data.Q22.dropna().values[1:]
data_science_count, counts = np.unique(data_science_count,return_counts=True)

# np.unique sorts labels as text, so a bracket starting with "10" lands before one
# starting with "3". Re-order by the first number in each label so the axis runs from
# the smallest to the largest team size; labels without a number go last.
lower_bound = pd.to_numeric(
    pd.Series(data_science_count, dtype="object").str.replace(",", "", regex=False).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)
order = np.argsort(lower_bound.fillna(np.inf).to_numpy(), kind="stable")
data_science_count, counts = data_science_count[order], counts[order]

In [None]:
# Bar chart of data-science team sizes.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue']
fig, ax = plt.subplots(figsize=(9,5))
ax.bar(data_science_count, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Number of Individuals responsible for Data Science Workload", xlabel="Number", ylabel="Count")
plt.show()

In [None]:
# Data-science team size (Q22) broken down by employer size (Q21): one subplot per
# size group, laid out on a 2x2 grid.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue']
size_groups = [
    ("Small Scale Industries", ["0-49 employees"]),
    ("Medium Scale Industries", ["50-249 employees", "250-999 employees"]),
    # NOTE(review): the top bracket is accepted with and without a thousands separator.
    # The original literal "10000 or more employees" probably never matches the survey
    # value (it has a comma, as far as I know - verify against data.Q21.unique()).
    ("Large Scale Industries", ["1000-9,999 employees", "10,000 or more employees", "10000 or more employees"]),
]

plt.figure(figsize=(15,9))
for position, (group_title, size_labels) in enumerate(size_groups, start=1):
    plt.subplot(2,2,position)
    # Team-size answers of respondents whose employer falls in this size group.
    un,counts = np.unique(data[data.Q21.isin(size_labels)].Q22.dropna().values,return_counts=True)
    plt.title(group_title)
    plt.xlabel("Data Science Team Count")
    plt.ylabel("Count")
    plt.bar(un,counts,zorder=3,color=palette)
    plt.tick_params(bottom=False,left=False)
    plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)

plt.tight_layout()

<p><b>Inference : </b>This visualisation returns mixed results - there are industries where a big team (<b>20+</b>) handles data science workloads, while in others a small team (<b>1-2</b>) is deployed for data science workloads. The smaller teams belong to the <b>Startups</b>, while <b>Medium Scale</b> industries have a mix of small and large teams for varying workloads. <b>Large Scale</b> industries deploy larger teams (20+) because of their relatively larger scale.</p>

<h1>Question 23 - Does your current employer incorporate machine learning methods into their business?</h1>

In [None]:
# Tally Q23 answers: drop nulls, skip the leading question-text entry, count each label.
ML_business, counts = np.unique(data["Q23"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Horizontal bar chart of how far employers have adopted machine learning.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua']
fig, ax = plt.subplots(figsize=(9,5))
ax.barh(ML_business, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Use of ML by Current Employer", ylabel="Use", xlabel="Count")
plt.show()

<p><b>Inference : </b>Majority of the industries belong to the category where the ML domain is only getting explored or not even considered for using but at the same time, there are a good number of industries who have recently started or already use ML methods.</p>

<h1>Question 24 - Select any activities that make up an important part of your role at work</h1>

In [None]:
# Q24 is multi-select: one column per option, Q24_Part_1 through Q24_OTHER.
activities_work = data.loc[:, "Q24_Part_1":"Q24_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
activities = [activities_work[col].iloc[1:].dropna().to_numpy() for col in activities_work.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each work activity.
activities, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(activities)],
    return_counts=True,
)

In [None]:
# Horizontal bar chart of the activities that make up respondents' roles.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue']
fig, ax = plt.subplots(figsize=(9,5))
ax.barh(activities, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Major Activity at work", ylabel="Activity", xlabel="Count")
plt.show()

<p><b>Inference : </b>Industries often use analytics to drive business decisions, while a good number have built ML into production or run experiments to find areas for implementing ML or to improve existing models. There are also industries that work on data science workloads related to data storage and analytics infrastructure.</p>

<h1>Question 25 - What is your current yearly compensation (approximate $USD)?</h1>

In [None]:
# Tally Q25 answers: drop nulls, skip the leading question-text entry, count each label.
compensation = data.Q25.dropna().values[1:]
compensation, counts = np.unique(compensation,return_counts=True)

# np.unique sorts labels as text, so a bracket starting with "10" lands before one
# starting with "2". Re-order by the first number in each label (thousands separators
# removed) so the axis runs from the lowest to the highest bracket; labels without a
# number go last.
lower_bound = pd.to_numeric(
    pd.Series(compensation, dtype="object").str.replace(",", "", regex=False).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)
order = np.argsort(lower_bound.fillna(np.inf).to_numpy(), kind="stable")
compensation, counts = compensation[order], counts[order]

In [None]:
# Horizontal bar chart of respondents per yearly-compensation bracket.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(9,10))
ax.barh(compensation, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Current Yearly Compensation", ylabel="Compensation Range", xlabel="Count")
plt.show()

<p><b>Inference : </b><b>0-999 USD</b> is the most chosen Compensation.</p>

<h1>Question 26 - Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?</h1>

In [None]:
# Tally Q26 answers: drop nulls, skip the leading question-text entry, count each label.
money_spent = data.Q26.dropna().values[1:]
money_spent, counts = np.unique(money_spent,return_counts=True)

# np.unique sorts labels as text, which interleaves the spending brackets. Re-order by
# the first number in each label (thousands separators removed) so the axis runs from
# the lowest to the highest bracket; labels without a number go last.
lower_bound = pd.to_numeric(
    pd.Series(money_spent, dtype="object").str.replace(",", "", regex=False).str.extract(r"(\d+)", expand=False),
    errors="coerce",
)
order = np.argsort(lower_bound.fillna(np.inf).to_numpy(), kind="stable")
money_spent, counts = money_spent[order], counts[order]

In [None]:
# Bar chart of respondents per ML / cloud spending bracket.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua']
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(money_spent, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Money spent on ML / Cloud Computing Services at Home or at Work in past 5 years", xlabel="Money Spent", ylabel="Count")
plt.show()

In [None]:
# For every spending bracket (Q26), show the professions (Q5) of the respondents in it.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
unique,counts = np.unique(data.Q26.dropna().values[1:],return_counts=True)

# Two charts per row. The grid used to be len(unique) rows x 2 columns while only
# len(unique) cells were filled, leaving the lower half of the figure empty.
n_cols = 2
n_rows = max(1, -(-len(unique) // n_cols))  # ceiling division, at least one row
# 5 inches per row keeps each subplot as tall as it was in the old (20, 30) figure
# (six rows in the old layout).
plt.figure(figsize=(20, 5 * n_rows))
for i in range(len(unique)):
    plt.subplot(n_rows,n_cols,i+1)
    # Professions of the respondents who picked this spending bracket.
    un,counts = np.unique(data[data.Q26 == unique[i]].Q5.dropna().values,return_counts=True)
    plt.title(unique[i])
    plt.xlabel("Count")
    plt.ylabel("Profession")
    plt.barh(un,counts,zorder=3,color=palette)
    plt.tick_params(bottom=False,left=False)
    plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.tight_layout()

<p><b>Inference : </b>Most people do not spend much on Cloud Services, although a few people have spent money on them. The larger spending mostly comes from work, while the smaller amounts may belong to personal projects / learning etc. The people spending on Cloud Services are mainly <b>Software Engineers</b>, <b>Data Scientists</b> or <b>Data Analysts</b>.</p>

<h1>Question 27 Part A - Which of the following cloud computing platforms do you use on a regular basis?</h1>

In [None]:
# Q27 part A is multi-select: one column per option, Q27_A_Part_1 through Q27_A_OTHER.
platforms_used = data.loc[:, "Q27_A_Part_1":"Q27_A_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
platforms = [platforms_used[col].iloc[1:].dropna().to_numpy() for col in platforms_used.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each cloud platform.
platforms, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(platforms)],
    return_counts=True,
)

In [None]:
# Pie chart of the cloud platforms respondents use regularly.
fig, ax = plt.subplots(figsize=(6,6))
ax.pie(counts, labels=platforms)
ax.set_title("Cloud Computing Platforms used on Regular basis")
# Anchor the legend's lower-right corner to the axes' top-left corner, outside the pie.
ax.legend(loc="lower right", bbox_to_anchor=(0,1))
plt.show()

<p><b>Inference : </b><b>AWS</b> is used on a large number followed by <b>GCP</b> and <b>Azure</b>. They can be either for personal work or for Professional Purpose.</p>

<h1>Question 27 Part B -Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years?</h1>

In [None]:
# Q27 part B is multi-select: one column per option, Q27_B_Part_1 through Q27_B_OTHER.
platforms_familiar = data.loc[:, "Q27_B_Part_1":"Q27_B_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
platforms = [platforms_familiar[col].iloc[1:].dropna().to_numpy() for col in platforms_familiar.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each cloud platform.
platforms, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(platforms)],
    return_counts=True,
)

In [None]:
# Pie chart of the cloud platforms respondents want to become more familiar with.
fig, ax = plt.subplots(figsize=(6,6))
ax.pie(counts, labels=platforms)
# The title's leading spaces are part of the original string and are kept as-is.
ax.set_title("     Cloud Computing Platforms preferred to become familiar")
# Anchor the legend's lower-right corner to the axes' top-left corner, outside the pie.
ax.legend(loc="lower right", bbox_to_anchor=(0,1))
plt.show()

<p><b>Inference : </b>The commonly used <b>AWS</b>, <b>GCP</b> and <b>Azure</b> are the ones people prefer to get familiar.</p>

<h1>Question 28 - Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)?</h1>

In [None]:
# Tally Q28 answers: drop nulls, skip the leading question-text entry, count each label.
best_platform, counts = np.unique(data["Q28"].dropna().iloc[1:].to_numpy(), return_counts=True)

In [None]:
# Horizontal bar chart of the cloud platform rated best for developer experience.
palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,8))
ax.barh(best_platform, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set(title="Cloud Computing Platforms with best Developer Experience", ylabel="Platform", xlabel="Count")
plt.show()

<p><b>Inference : </b><b>AWS</b> has a good number of votes followed by <b>GCP</b> and <b>Azure</b>. Large group of people also find all of the cloud platforms to have a similar user experience.</p>

<h1>Question 29 Part A - Do you use any of the following cloud computing products on a regular basis?</h1>

In [None]:
# Q29 part A is multi-select: one column per option, Q29_A_Part_1 through Q29_A_OTHER.
products_used = data.loc[:, "Q29_A_Part_1":"Q29_A_OTHER"]

# One array of non-null answers per option column. Row 0 is the question text, not a
# respondent, so it is skipped; it was previously added as an extra answer, which
# inflated every option's count by one.
products = [products_used[col].iloc[1:].dropna().to_numpy() for col in products_used.columns]

In [None]:
# Flatten the per-option answer lists, trim whitespace and count each cloud product.
products, counts = np.unique(
    [answer.strip() for answer in itertools.chain.from_iterable(products)],
    return_counts=True,
)

In [None]:
plt.figure(figsize=(20,5))
plt.bar(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Cloud Computing Products used on Regular basis")
plt.xlabel("Product")
plt.ylabel("Count")
plt.show()

<p><b>Inference : </b><b>Cloud VMs</b> are the most used product.</p>

<h1>Question 29 Part B - In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products? </h1>

In [None]:
products_preferred = data[data.columns[data.columns.get_loc("Q29_B_Part_1") : data.columns.get_loc("Q29_B_OTHER") + 1 ]]
products = []
for col in products_preferred.columns:
        products.append([products_preferred[col][0].split("-")[2]])
        products.append(products_preferred[col][1:].dropna().values)

In [None]:
products = list(itertools.chain.from_iterable(products))
products = [i.strip() for i in products]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(20,5))
plt.bar(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Cloud Computing Products preferred to become familiar")
plt.xlabel("Product")
plt.ylabel("Count")
plt.show()

<p><b>Inference : </b>Again, <b>Cloud VMs</b> are the preferred product to get familiar with. <b>GCP</b> is the most chosen platform, followed by <b>AWS</b> and <b>Azure</b>.</p>

<h1>Question 30 Part A - Do you use any of the following data storage products on a regular basis?</h1>

In [None]:
products_used = data[data.columns[data.columns.get_loc("Q30_A_Part_1") : data.columns.get_loc("Q30_A_OTHER") + 1 ]]
products = []
for col in products_used.columns:
        products.append([products_used[col][0].split("-")[2]])
        products.append(products_used[col][1:].dropna().values)

In [None]:
products = list(itertools.chain.from_iterable(products))
products = [i.strip() for i in products]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Storage Products used on Regular basis")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Basic storage products like <b>GCS</b> and <b>S3</b> are more commonly used.</p>

<h1>Question 30 Part B - In the next 2 years, do you hope to become more familiar with any of these specific data storage products?</h1>

In [None]:
products_preferred = data[data.columns[data.columns.get_loc("Q30_B_Part_1") : data.columns.get_loc("Q30_B_OTHER") + 1 ]]
products = []
for col in products_preferred.columns:
        products.append([products_preferred[col][0].split("-")[2]])
        products.append(products_preferred[col][1:].dropna().values)

In [None]:
products = list(itertools.chain.from_iterable(products))
products = [i.strip() for i in products]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Storage Products preferred to become familiar with")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Really few votes for this question - But all products are equally preferred.</p>

<h1>Question 31 Part A - Do you use any of the following managed machine learning products on a regular basis?</h1>

In [None]:
ml_products_used = data[data.columns[data.columns.get_loc("Q31_A_Part_1") : data.columns.get_loc("Q31_A_OTHER") + 1 ]]
ml_products = []
for col in ml_products_used.columns:
        ml_products.append([ml_products_used[col][0].split("-")[2]])
        ml_products.append(ml_products_used[col][1:].dropna().values)

In [None]:
ml_products = list(itertools.chain.from_iterable(ml_products))
ml_products = [i.strip() for i in ml_products]
ml_products, counts = np.unique(ml_products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(ml_products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Managed ML Products used on Regular basis")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Managed ML Products</p>

<h1>Question 31 Part B - In the next 2 years, do you hope to become more familiar with any of these managed machine learning products?</h1>

In [None]:
ml_products_preferred = data[data.columns[data.columns.get_loc("Q31_B_Part_1") : data.columns.get_loc("Q31_B_OTHER") + 1 ]]
ml_products = []
for col in ml_products_preferred.columns:
        ml_products.append([ml_products_preferred[col][0].split("-")[2]])
        ml_products.append(ml_products_preferred[col][1:].dropna().values)

In [None]:
ml_products = list(itertools.chain.from_iterable(ml_products))
ml_products = [i.strip() for i in ml_products]
ml_products, counts = np.unique(ml_products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(ml_products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Managed ML Products preferred to get familiar with")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Among people who want to learn managed ML products, <b>Google Cloud Vertex AI</b>, <b>Azure ML Studio</b> and <b>Amazon SageMaker</b> are the preferred choices.</p>

<h1>Question 32 Part A - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis?</h1>

In [None]:
products_used = data[data.columns[data.columns.get_loc("Q32_A_Part_1") : data.columns.get_loc("Q32_A_OTHER") + 1 ]]
products = []
for col in products_used.columns:
        products.append([products_used[col][0].split("-")[2]])
        products.append(products_used[col][1:].dropna().values)

In [None]:
products = list(itertools.chain.from_iterable(products))
products = [i.strip() for i in products]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Data Products used on Regular basis")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>SQL related databases are used most followed by Big Data Processing systems like <b>BigQuery</b>, <b>BigTable</b> etc.</p>

<h1>Question 32 Part B - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years?</h1>

In [None]:
products_preferred = data[data.columns[data.columns.get_loc("Q32_B_Part_1") : data.columns.get_loc("Q32_B_OTHER") + 1 ]]
products = []
for col in products_preferred.columns:
        products.append([products_preferred[col][0].split("-")[2]])
        products.append(products_preferred[col][1:].dropna().values)

In [None]:
products = list(itertools.chain.from_iterable(products))
products = [i.strip() for i in products]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Data Products used on Regular basis")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>SQL related databases are the most preferred to learn, followed by Big Data Processing Systems like <b>BigQuery</b>, <b>BigTable</b>, <b>Redshift</b> etc. </p>

<h1>Question 33 - Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?</h1>

In [None]:
products = data.Q33.dropna().values[1:]
products, counts = np.unique(products,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(products,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Data Products used most often")
plt.ylabel("Product")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>SQL related databases are used most followed by Big Data Processing systems like <b>BigQuery</b>, <b>BigTable</b> etc.</p>

<h1>Question 34 Part A - Which of the following business intelligence tools do you use on a regular basis?</h1>

In [None]:
bi_used = data[data.columns[data.columns.get_loc("Q34_A_Part_1") : data.columns.get_loc("Q34_A_OTHER") + 1 ]]
bi = []
for col in bi_used.columns:
        bi.append([bi_used[col][0].split("-")[2]])
        bi.append(bi_used[col][1:].dropna().values)

In [None]:
bi = list(itertools.chain.from_iterable(bi))
bi = [i.strip() for i in bi]
bi, counts = np.unique(bi,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(bi,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Intelligence Tools used on Regular basis")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are used on a regular basis, although a good number of users don't use any business intelligence tools.</p>

<h1>Question 34 Part B - Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years?</h1>

In [None]:
tools_preferred = data[data.columns[data.columns.get_loc("Q34_B_Part_1") : data.columns.get_loc("Q34_B_OTHER") + 1 ]]
tools = []
for col in tools_preferred.columns:
        tools.append([tools_preferred[col][0].split("-")[2]])
        tools.append(tools_preferred[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Intelligence Tools preferred to become familiar")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are the tools preferred to become familiar</p>

<h1>Question 35 - Which of the following business intelligence tools do you use most often?</h1>

In [None]:
bi_used = data.Q35.dropna().values[1:]
bi_used,counts = np.unique(bi_used,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(bi_used,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Big Intelligence Tools used most often")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are the tools used most often.</p>

<h1>Question 36 Part A - Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?</h1>

In [None]:
tools_used = data[data.columns[data.columns.get_loc("Q36_A_Part_1") : data.columns.get_loc("Q36_A_OTHER") + 1 ]]
tools = []
for col in tools_used.columns:
        tools.append([tools_used[col][0].split("-")[2]])
        tools.append(tools_used[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
tools = np.delete(tools,4)
counts[5] = counts[4] + counts[5]
counts = np.delete(counts,4)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Auto ML Tools used most often")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>The majority of users do not use any Auto ML tools.</p>

<h1>Question 36 Part B - Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years? </h1>

In [None]:
tools_preferred = data[data.columns[data.columns.get_loc("Q36_B_Part_1") : data.columns.get_loc("Q36_B_OTHER") + 1 ]]
tools = []
for col in tools_preferred.columns:
        tools.append([tools_preferred[col][0].split("-")[2]])
        tools.append(tools_preferred[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
tools = np.delete(tools,4)
counts[5] = counts[4] + counts[5]
counts = np.delete(counts,4)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Auto ML Tools preferred to get familiar")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Users mostly prefer tools related to the automation of <b>ML Pipelines</b>, <b>Model Selection</b>, <b>Hyperparameter Tuning</b>, <b>Feature Engineering</b> and <b>Data Augmentation</b>.</p>

<h1>Question 37 Part A - Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?</h1>

In [None]:
tools_used = data[data.columns[data.columns.get_loc("Q37_A_Part_1") : data.columns.get_loc("Q37_A_OTHER") + 1 ]]
tools = []
for col in tools_used.columns:
        tools.append([tools_used[col][0].split("-")[2]])
        tools.append(tools_used[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Auto ML Tools used on Regular basis")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>The majority of users do not use any Auto ML products on a regular basis, but there are a good number of users of <b>Google Cloud AutoML</b>, <b>Azure Automated ML</b> and <b>AWS Sagemaker</b>.</p>

<h1>Question 37 Part B - Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?</h1>

In [None]:
tools_preferred = data[data.columns[data.columns.get_loc("Q37_B_Part_1") : data.columns.get_loc("Q37_B_OTHER") + 1 ]]
tools = []
for col in tools_preferred.columns:
        tools.append([tools_preferred[col][0].split("-")[2]])
        tools.append(tools_preferred[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Auto ML Tools preferred to become familiar")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>The most commonly heard of tools, <b>Google Cloud AutoML</b>, <b>Azure Automated Machine Learning</b> and <b>AWS Sagemaker</b>, are the ones users prefer to get familiar with.</p>

<h1>Question 38 Part A - Do you use any tools to help manage machine learning experiments?</h1>

In [None]:
tools_used = data[data.columns[data.columns.get_loc("Q38_A_Part_1") : data.columns.get_loc("Q38_A_OTHER") + 1 ]]
tools = []
for col in tools_used.columns:
        tools.append([tools_used[col][0].split("-")[2]])
        tools.append(tools_used[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Tools used to manage Machine Learning Projects")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Majority of the users do not use any tools for managing ML Products, although there are a good number of users opting for <b>Weights & Biases</b>, <b>TensorBoard</b> and <b>MLflow</b>.</p>

<h1>Question 38 Part B - In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments</h1>

In [None]:
tools_preferred = data[data.columns[data.columns.get_loc("Q38_B_Part_1") : data.columns.get_loc("Q38_B_OTHER") + 1 ]]
tools = []
for col in tools_preferred.columns:
        tools.append([tools_preferred[col][0].split("-")[2]])
        tools.append(tools_preferred[col][1:].dropna().values)

In [None]:
tools = list(itertools.chain.from_iterable(tools))
tools = [i.strip() for i in tools]
tools, counts = np.unique(tools,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Tools preferred to become familiar")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Majority of the users do not prefer any tools for managing ML Projects, but there are good number of votes for <b>Weights & Biases</b>, <b>Tensorboard</b> and <b>MLflow</b>.</p>

<h1>Question 39 - Where do you publicly share your data analysis or machine learning applications?</h1>

In [None]:
share_loc = data[data.columns[data.columns.get_loc("Q39_Part_1") : data.columns.get_loc("Q39_OTHER") + 1 ]]
share = []
for col in share_loc.columns:
        share.append([share_loc[col][0].split("-")[2]])
        share.append(share_loc[col][1:].dropna().values)

In [None]:
share = list(itertools.chain.from_iterable(share))
share = [i.strip() for i in share]
share, counts = np.unique(share,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(share,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Platform used for sharing projects publicly")
plt.ylabel("Platform")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Users who share prefer <b>Personal Blog</b>, <b>Kaggle</b>, <b>GitHub</b> or <b>Colab</b> as their platform while a good number of users do not prefer sharing as well.</p>

<h1>Question 40 - On which platforms have you begun or completed data science courses?</h1>

In [None]:
platforms_used = data[data.columns[data.columns.get_loc("Q40_Part_1") : data.columns.get_loc("Q40_OTHER") + 1 ]]
platform = []
for col in platforms_used.columns:
        platform.append([platforms_used[col][0].split("-")[2]])
        platform.append(platforms_used[col][1:].dropna().values)

In [None]:
platform = list(itertools.chain.from_iterable(platform))
platform = [i.strip() for i in platform]
platform, counts = np.unique(platform,return_counts=True)

In [None]:
platform = np.delete(platform,0)
counts[1] = counts[0] + counts[1]
counts = np.delete(counts,0)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(platform,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Platform used for learning Data Science")
plt.ylabel("Platform")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b>Among external sources, <b>Coursera</b>, <b>Kaggle Courses</b> and <b>Udemy</b> stand out. In the case of formal education, <b>University Courses</b> or a <b>Degree</b> is preferred.</p>

<h1>Question 41 - What is the primary tool that you use at work or school to analyze data?</h1>

In [None]:
tools_used = data.Q41.dropna().values[1:]
tools,counts = np.unique(tools_used,return_counts=True)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(tools,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Primary Tool used to Analyze data")
plt.ylabel("Tool")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b><b>Local IDEs</b> or <b>Basic Statistical Software</b> is used the most to analyze data.</p>

<h1>Question 42 - Who/what are your favorite media sources that report on data science topics?</h1>

In [None]:
media_sources = data[data.columns[data.columns.get_loc("Q42_Part_1") : data.columns.get_loc("Q42_OTHER") + 1 ]]
sources = []
for col in media_sources.columns:
        sources.append([media_sources[col][0].split("-")[2]])
        sources.append(media_sources[col][1:].dropna().values)

In [None]:
sources = list(itertools.chain.from_iterable(sources))
sources = [i.strip() for i in sources]
sources, counts = np.unique(sources,return_counts=True)

In [None]:
sources = np.delete(sources,3)
counts[4] = counts[4] + counts[3]
counts = np.delete(counts,3)

In [None]:
plt.figure(figsize=(10,7))
plt.barh(sources,counts,zorder=3,color = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.title("Media Sources used that report Data Science Topics")
plt.ylabel("Source")
plt.xlabel("Count")
plt.show()

<p><b>Inference : </b><b>Blogs</b>, <b>Kaggle</b> and <b>Youtube Channels</b> are most preferred platforms that report Data Science topics.</p>

<h1>Dashboard</h1>

In [None]:
from IPython.display import display
# Global plot styling for the dashboard: silence the "too many open figures" warning,
# hide the axes border and give every axes a light grey background.
plt.rcParams.update({
    'figure.max_open_warning': 0,
    'axes.linewidth': 0,
    'axes.facecolor': 'whitesmoke',
})

# Figure 1: age groups (left) and gender (right). Row 0 of each column is skipped before counting.
fig, (ax_age, ax_gender) = plt.subplots(1, 2, figsize=(15, 5))

unique, counts = np.unique(data.Q1[1:], return_counts=True)
ax_age.set(title="Age group", xlabel="Age group", ylabel="Count")
ax_age.bar(
    unique, counts, zorder=3,
    color=['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'],
)
ax_age.tick_params(bottom=False, left=False)
ax_age.grid(True, color='black', linestyle='solid', linewidth=0.1, alpha=0.5, zorder=0)

unique, counts = np.unique(data.Q2[1:], return_counts=True)
ax_gender.set(title="Gender", xlabel="Count", ylabel="Gender")
ax_gender.barh(
    unique, counts, zorder=3,
    color=['forestgreen','green','springgreen','aquamarine','darkslategrey'],
)
ax_gender.tick_params(bottom=False, left=False)
ax_gender.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.tight_layout()

# Figure 2: education level (pie, left) and coding experience (bars, right).
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
# Shorten the long education labels so they fit around the pie.
# The result is assigned back to the column instead of calling
# data.Q4.replace(..., inplace=True): that chained in-place form warns on recent
# pandas and does not update `data` at all once Copy-on-Write is enabled.
data["Q4"] = data["Q4"].replace({
    "Bachelor’s degree": "BS",
    "Doctoral degree": "Phd",
    "Master’s degree": "MS",
    "I prefer not to answer": "No Answer",
    "No formal education past high school": "High School",
    "Some college/university study without earning a bachelor’s degree": "College",
})
unique,counts = np.unique(data.Q4[1:],return_counts=True)
plt.title("Education")
plt.xlabel("Level")
plt.pie(counts,labels=unique)
plt.legend(loc="lower right",bbox_to_anchor=(0,1))

plt.subplot(1,2,2)
# Same Copy-on-Write-safe pattern for the coding-experience labels.
data["Q6"] = data["Q6"].replace({"I have never written code": "Never"})
unique,counts = np.unique(data.Q6[1:],return_counts=True)
plt.title("Coding Experience")
plt.ylabel("Years")
plt.xlabel("Count")
plt.barh(unique,counts,zorder=3,color=['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua','powderblue'])
plt.tick_params(bottom=False,left=False)
plt.grid(True,color='white', linestyle='solid', linewidth=2.0, alpha=0.5,zorder=0)
plt.tight_layout()

# Figure 3: number of respondents per profession (Q5); row 0 is skipped before counting.
fig, ax = plt.subplots(figsize=(15, 6))
unique, counts = np.unique(data.Q5[1:], return_counts=True)
ax.set(title="Profession", xlabel="Count", ylabel="Type")
ax.barh(
    unique, counts, zorder=3,
    color=['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue'],
)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.tight_layout()

countries = []
lat = []
lon = []
un, counts = np.unique(data.Q3.values[1:],return_counts=True)
for i in range(len(un)):
    try:
        countries.append(country_name_to_country_alpha2(un[i]))
    except:
        pass
    
counts = pd.read_csv("../input/locations/world_country_and_usa_states_latitude_and_longitude_values.csv")
for con in countries:
    lat.append(counts[counts["country_code"] == con].latitude.values[0])
    lon.append(counts[counts["country_code"] == con].longitude.values[0])

import folium
from folium.plugins import MarkerCluster
# Interactive map with one clustered circle marker per located country.
world_map = folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
for idx, latitude in enumerate(lat):
    # Entries whose latitude is exactly 0 are not plotted.
    if latitude != 0:
        country_label = country_alpha2_to_country_name(countries[idx])
        marker = folium.CircleMarker(
            location=[latitude, lon[idx]],
            radius=5,
            popup=f"Country : {country_label}",
            fill=True,
        )
        marker.add_to(marker_cluster)
display(world_map)
#<p style="text-align : justify;"><b>Inference : </b>Most Kaggle users are in the <b>18-30</b> age group, and participation declines with age from there onwards. The majority of people on Kaggle are pursuing a <b>BS</b>, <b>MS</b> or <b>Phd</b>, which suggests that students and researchers are the largest groups. Coders with <b>1-5</b> years of experience use Kaggle more than experienced people (<b>5+ years</b>). As observed before, <b>Students</b> are the major users of Kaggle in comparison with industry experts. Among professionals, <b>Data Scientists</b>, <b>Data Analysts</b> and <b>Software Engineers</b> are the major users of Kaggle.</p>

#<h2>Question 7 - What programming languages do you use on a regular basis? (Select all that apply)</h2>

# Q7 (multi-select): columns Q7_Part_1 .. Q7_OTHER, both ends included.
languages_used = data.loc[:, "Q7_Part_1":"Q7_OTHER"]
picked = []
for column in languages_used.columns:
    answers = languages_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    picked.append(answers.loc[0].split("-")[2].strip())
    picked.extend(text.strip() for text in answers.iloc[1:].dropna().values)
languages, counts = np.unique(picked, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 5))
plt.bar(languages, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Language")
plt.ylabel("Count")
plt.title("Programming Language used on Regular basis")
plt.tight_layout()

#<p><b>Inference : </b>Majority of people on Kaggle use <b>Python</b> and <b>SQL</b> on a regular basis along with <b>R</b> in a good amount. This aligns with the fact from previous visualisation that a large number of people, including students on Kaggle belong to Data Science and Software Engineering Field.</p>

#<h2>Question 8 - What programming language would you recommend an aspiring data scientist to learn first?</h2>

# Q8: tally the recommended first language; the first non-null entry (row 0,
# the question text) is dropped before counting.
q8_answers = data["Q8"].dropna().to_numpy()[1:]
languages_preferred, counts = np.unique(q8_answers, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 8))
plt.barh(languages_preferred, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Language")
plt.title("Recommended Languages for Aspiring Data Scientists")
plt.tight_layout()

#<p><b>Inference : </b>As heard usually, <b>Python</b>, <b>R</b> and <b>SQL</b> are the most recommended languages for an aspiring Data Scientist.</p>

#<h2>Question 9 - Which of the following integrated development environments (IDE's) do you use on a regular basis?</h2>

# Q9 (multi-select): columns Q9_Part_1 .. Q9_OTHER, both ends included.
ide_used = data.loc[:, "Q9_Part_1":"Q9_OTHER"]
chosen = []
for column in ide_used.columns:
    answers = ide_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
ides, counts = np.unique(chosen, return_counts=True)

# Collapse the first two (sorted) entries into a single bar relabelled
# "Jupyter" and shorten the last label to "VS Code".
# NOTE(review): this is positional; presumably the first two entries are the
# two Jupyter variants and the last one is Visual Studio Code. Verify on new data.
merged_first = counts[0] + counts[1]
ides = ides[1:]
counts = counts[1:]
counts[0] = merged_first
ides[0] = "Jupyter"
ides[-1] = "VS Code"

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 5))
plt.bar(ides, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("IDE")
plt.ylabel("Count")
plt.title("IDEs used on Regular basis")
plt.tight_layout()

#<p><b>Inference : </b><b>Jupyter</b>, <b>PyCharm</b> and <b>VS Code</b> are the most commonly used IDEs since they support Notebook Environments.</p>

#<h2>Question 10 - Which of the following hosted notebook products do you use on a regular basis?</h2>

# Q10 (multi-select): columns Q10_Part_1 .. Q10_OTHER, both ends included.
notebooks_used = data.loc[:, "Q10_Part_1":"Q10_OTHER"]
chosen = []
for column in notebooks_used.columns:
    answers = notebooks_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
notebooks, counts = np.unique(chosen, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 7))
plt.barh(notebooks, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Product")
plt.title("Notebook Products used on Regular basis")
plt.tight_layout()

#<p><b>Inference : </b>Although a good number of people do not use any hosted notebook product on a regular basis, <b>Kaggle Notebooks</b> and <b>Colab Notebooks</b> are the most commonly used among those who do.</p>

#<h2>Question 11 - What type of computing platform do you use most often for your data science projects?</h2>

# Q11: share of each computing platform; the first non-null entry (row 0, the
# question text) is dropped before counting.
q11_answers = data["Q11"].dropna().to_numpy()[1:]
platform, counts = np.unique(q11_answers, return_counts=True)

plt.figure(figsize=(15, 8))
plt.title("Platform used for Data Science Projects")
plt.xlabel("Count")
plt.ylabel("Platform")
plt.pie(counts, labels=platform)
# The legend is created after the wedges so it can pick them up.
plt.legend(loc="lower right", bbox_to_anchor=(0, 1))
plt.tight_layout()

#<p><b>Inference : </b>Majority of the population uses a PC (Laptop / Desktop), few use <b>Cloud Platforms</b> as well (maybe for higher workloads or in competitions).</p>

#<h1>Question 12 - Which types of specialized hardware do you use on a regular basis?</h1>

# Q12 (multi-select): columns Q12_Part_1 .. Q12_OTHER, both ends included.
hardware_used = data.loc[:, "Q12_Part_1":"Q12_OTHER"]
chosen = []
for column in hardware_used.columns:
    answers = hardware_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
hardwares, counts = np.unique(chosen, return_counts=True)

palette = ["forestgreen", "green", "springgreen", "aquamarine", "darkslategrey", "aqua"]
plt.figure(figsize=(15, 5))
plt.bar(hardwares, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Hardware")
plt.ylabel("Count")
plt.title("Specialized hardware used on Regular basis")
plt.tight_layout()

#<p><b>Inference : </b>The majority of users do not use any accelerators (CPU only); <b>GPUs</b> and <b>TPUs</b> are popular among the people who do use accelerators.</p>

#<h1>Question 13 - Approximately how many times have you used a TPU (tensor processing unit)?</h1>

# Q13: how often respondents have used a TPU; the first non-null entry (row 0,
# the question text) is dropped before counting.
q13_answers = data["Q13"].dropna().to_numpy()[1:]
tpu, counts = np.unique(q13_answers, return_counts=True)

palette = ["forestgreen", "green", "springgreen", "aquamarine", "darkslategrey"]
plt.figure(figsize=(15, 7))
plt.bar(tpu, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Frequency")
plt.ylabel("Count")
plt.title("Usage of TPU")
plt.tight_layout()

#<p><b>Inference : </b>TPU is used almost never or really less in number.</p>

#<h1>Question 14 - What data visualization libraries or tools do you use on a regular basis?</h1>

# Q14 (multi-select): columns Q14_Part_1 .. Q14_OTHER, both ends included.
vis_used = data.loc[:, "Q14_Part_1":"Q14_OTHER"]
chosen = []
for column in vis_used.columns:
    answers = vis_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
tools, counts = np.unique(chosen, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 7))
plt.barh(tools, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Tool")
plt.title("Visualization tools used on Regular basis")
plt.tight_layout()

#<p><b>Inference : </b><b>Seaborn</b>, <b>Matplotlib</b>, <b>Plotly</b> and <b>Ggplot</b> are the most commonly used Visualization Libraries with <b>Matplotlib</b> as the most used followed by <b>Seaborn</b>.</p>

#<h1>Question 15 - For how many years have you used machine learning methods?</h1>

# Q15: years of machine-learning experience; the first non-null entry (row 0,
# the question text) is dropped before counting.
q15_answers = data["Q15"].dropna().to_numpy()[1:]
years, counts = np.unique(q15_answers, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 7))
plt.barh(years, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Experience")
plt.title("Years of using Machine Learning Methods")
plt.tight_layout()

#<p><b>Inference : </b>People who have not much experience with ML Methods are more, pointing to the fact that they are either students or fresh in career.</p>

#<h1>Question 16 - Which of the following machine learning frameworks do you use on a regular basis?</h1>

# Q16 (multi-select): columns Q16_Part_1 .. Q16_OTHER, both ends included.
frameworks_used = data.loc[:, "Q16_Part_1":"Q16_OTHER"]
frameworks = []
for col in frameworks_used.columns:
    # Row 0 holds the question text; everything after its second "-" is the
    # option label. maxsplit=2 keeps hyphens that belong to the label itself, so
    # a hyphenated option is no longer cut short and counted as a separate
    # category. This replaces the former positional clean-up (delete index 14,
    # merge into index 15), which depended on the exact sort order of this data.
    frameworks.append(frameworks_used[col][0].split("-", 2)[2].strip())
    frameworks.extend(value.strip() for value in frameworks_used[col][1:].dropna().values)

# As elsewhere in this notebook, each option label is tallied once in addition
# to the real responses, so every option gets a bar.
frameworks, counts = np.unique(frameworks, return_counts=True)

plt.figure(figsize=(15, 9))
plt.barh(
    frameworks,
    counts,
    zorder=3,
    color=['forestgreen', 'green', 'springgreen', 'aquamarine', 'darkslategrey',
           'aqua', 'powderblue', 'lightskyblue', 'lightslategray', 'lightsteelblue'],
)
plt.tick_params(bottom=False, left=False)
# White grid drawn underneath the bars.
plt.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.title("ML Frameworks used on Regular Basis")
plt.ylabel("Framework")
plt.xlabel("Count")
plt.tight_layout()

#<p><b>Inference : </b><b>Xgboost</b>, <b>Scikit-learn</b>, <b>Tensorflow</b>, <b>Pytorch</b> and <b>Keras</b> are the most used ML frameworks.</p>

#<h1>Question 17 - Which of the following ML algorithms do you use on a regular basis? </h1>

# Q17 (multi-select): columns Q17_Part_1 .. Q17_OTHER, both ends included.
algo_used = data.loc[:, "Q17_Part_1":"Q17_OTHER"]
algos = []
for col in algo_used.columns:
    # Row 0 holds the question text; everything after its second "-" is the
    # option label. maxsplit=2 keeps hyphens that belong to the label itself, so
    # a hyphenated option is no longer cut short and counted as a separate
    # category.
    algos.append(algo_used[col][0].split("-", 2)[2].strip())
    algos.extend(value.strip() for value in algo_used[col][1:].dropna().values)

# The former clean-up deleted the second-to-last entry but added counts[1] to
# counts[0], inflating the first bar with the second bar's total. With complete
# labels no positional merge is needed.
# As elsewhere in this notebook, each option label is tallied once in addition
# to the real responses, so every option gets a bar.
algos, counts = np.unique(algos, return_counts=True)

plt.figure(figsize=(15, 9))
plt.barh(
    algos,
    counts,
    zorder=3,
    color=['forestgreen', 'green', 'springgreen', 'aquamarine', 'darkslategrey',
           'aqua', 'powderblue', 'lightskyblue', 'lightslategray', 'lightsteelblue'],
)
plt.tick_params(bottom=False, left=False)
# White grid drawn underneath the bars.
plt.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.title("ML Algorithms used on Regular Basis")
plt.ylabel("Algorithm")
plt.xlabel("Count")
plt.tight_layout()

#<p><b>Inference : </b>Traditional ML Algorithms like <b>Regression</b>, <b>Decision Trees</b>, <b>Random Forests</b> and <b>Bayesian Models</b> are more used in comparison with <b>Neural Networks</b>.</p>

#<h1>Question 18 - Which of the following Computer Vision Methods do you use on a regular basis?</h1>

# Q18 (multi-select): columns Q18_Part_1 .. Q18_OTHER, both ends included.
computer_vision_methods_used = data.loc[:, "Q18_Part_1":"Q18_OTHER"]
methods = []
for col in computer_vision_methods_used.columns:
    # Row 0 holds the question text; everything after its second "-" is the
    # option label. maxsplit=2 keeps hyphens that belong to the label itself, so
    # a hyphenated option is no longer cut short and counted as a separate
    # category. This replaces the former positional clean-up (delete index 3,
    # merge into index 4), which depended on the exact sort order of this data.
    methods.append(computer_vision_methods_used[col][0].split("-", 2)[2].strip())
    methods.extend(
        value.strip() for value in computer_vision_methods_used[col][1:].dropna().values
    )

# As elsewhere in this notebook, each option label is tallied once in addition
# to the real responses, so every option gets a bar.
methods, counts = np.unique(methods, return_counts=True)

plt.figure(figsize=(15, 9))
plt.barh(
    methods,
    counts,
    zorder=3,
    color=['forestgreen', 'green', 'springgreen', 'aquamarine', 'darkslategrey',
           'aqua', 'powderblue', 'lightskyblue', 'lightslategray', 'lightsteelblue'],
)
plt.tick_params(bottom=False, left=False)
# White grid drawn underneath the bars.
plt.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.title("Computer Vision methods used on Regular Basis")
plt.ylabel("Method")
plt.xlabel("Count")
plt.tight_layout()

#<p><b>Inference : </b><b>Image Classification</b> and <b>Object Detection</b> models like VGG, ResNet, YOLO etc are commonly used along with libraries like <b>PIL</b> and <b>CV2</b>. Image Classification stands ahead of everything.</p>

#<h1>Question 19 - Which of the following natural language processing (NLP) methods do you use on a regular basis?</h1>

# Q19 (multi-select): columns Q19_Part_1 .. Q19_OTHER, both ends included.
nlp_methods_used = data.loc[:, "Q19_Part_1":"Q19_OTHER"]
methods = []
for col in nlp_methods_used.columns:
    # Row 0 holds the question text; everything after its second "-" is the
    # option label. maxsplit=2 keeps hyphens that belong to the label itself, so
    # hyphenated options are no longer cut short and counted as separate
    # categories. This replaces the two former positional clean-ups (delete
    # indices 5 and 1 after merging), which depended on the exact sort order of
    # this data.
    methods.append(nlp_methods_used[col][0].split("-", 2)[2].strip())
    methods.extend(value.strip() for value in nlp_methods_used[col][1:].dropna().values)

# As elsewhere in this notebook, each option label is tallied once in addition
# to the real responses, so every option gets a bar.
methods, counts = np.unique(methods, return_counts=True)

plt.figure(figsize=(15, 9))
plt.barh(
    methods,
    counts,
    zorder=3,
    color=['forestgreen', 'green', 'springgreen', 'aquamarine', 'darkslategrey',
           'aqua', 'powderblue', 'lightskyblue', 'lightslategray', 'lightsteelblue'],
)
plt.tick_params(bottom=False, left=False)
# White grid drawn underneath the bars.
plt.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
plt.title("NLP methods used on Regular Basis")
plt.ylabel("Method")
plt.xlabel("Count")
plt.tight_layout()

#<p><b>Inference : </b>Word Embedding Algorithms like <b>word2vec</b> etc are used more often. For modelling, pre-trained <b>Transformer Models</b> and <b>Encoder-Decoder models</b> are preferred.</p>

#<h1>Question 20 - In what industry is your current employer/contract (or your most recent employer if retired)?</h1>

# Q20: employer industry; the first non-null entry (row 0, the question text)
# is dropped before counting.
q20_answers = data["Q20"].dropna().to_numpy()[1:]
employer_ind, counts = np.unique(q20_answers, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 9))
plt.barh(employer_ind, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Industry")
plt.title("Employer Industry")
plt.tight_layout()

#<p><b>Inference : </b>As observed before as well, users majorly belong to <b>Technology</b>, <b>Academics</b> or <b>Education</b> field.</p>

#<h1>Question 21 - What is the size of the company where you are employed?
#</h1>

# Q21: company size; the first non-null entry (row 0, the question text) is
# dropped before counting.
q21_answers = data["Q21"].dropna().to_numpy()[1:]
employer_size, counts = np.unique(q21_answers, return_counts=True)

palette = ["forestgreen", "green", "springgreen", "aquamarine", "darkslategrey"]
plt.figure(figsize=(15, 9))
plt.barh(employer_size, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Size")
plt.title("Employer Size")
plt.tight_layout()

#<p><b>Inference : </b>Most People are working in a small scale industries (with approx 50 people) in comparison with the count of people working in Medium and Large scale industries.</p>

#<h1>Question 22 - Approximately how many individuals are responsible for data science workloads at your place of business?</h1>

# Q22: size of the data-science team; the first non-null entry (row 0, the
# question text) is dropped before counting.
q22_answers = data["Q22"].dropna().to_numpy()[1:]
data_science_count, counts = np.unique(q22_answers, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine",
    "darkslategrey", "aqua", "powderblue",
]
plt.figure(figsize=(15, 5))
plt.bar(data_science_count, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Number")
plt.ylabel("Count")
plt.title("Number of Individuals responsible for Data Science Workload")
plt.tight_layout()

# Q22 (data-science team size) broken down by Q21 (company size), one panel per
# company-size group on a 2x2 grid (the fourth cell stays empty).
company_size_groups = [
    ("Small Scale Industries", ["0-49 employees"]),
    ("Medium Scale Industries", ["50-249 employees", "250-999 employees"]),
    # NOTE(review): the original filter used "10000 or more employees", which is
    # inconsistent with the comma style of "1000-9,999 employees" and presumably
    # matches no rows. Both spellings are accepted; confirm against the data.
    (
        "Large Scale Industries",
        ["1000-9,999 employees", "10,000 or more employees", "10000 or more employees"],
    ),
]

plt.figure(figsize=(15, 9))
for panel, (panel_title, size_labels) in enumerate(company_size_groups, start=1):
    plt.subplot(2, 2, panel)
    team_sizes = data.loc[data["Q21"].isin(size_labels), "Q22"].dropna().values
    un, counts = np.unique(team_sizes, return_counts=True)
    plt.title(panel_title)
    plt.xlabel("Data Science Team Count")
    plt.ylabel("Count")
    plt.bar(
        un,
        counts,
        zorder=3,
        color=['forestgreen', 'green', 'springgreen', 'aquamarine',
               'darkslategrey', 'aqua', 'powderblue'],
    )
    plt.tick_params(bottom=False, left=False)
    # White grid drawn underneath the bars.
    plt.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)

plt.tight_layout()

#<p><b>Inference : </b>This visualisation returns mixed results - there are industries where a big team (<b>20+</b>) handles data science workloads, while on the other side small teams (<b>1-2</b>) are also deployed for data science workloads. The smaller teams belong to <b>Startups</b>, while <b>Medium Scale</b> industries have a mix of small and large teams for varying workloads. <b>Large Scale</b> industries deploy larger teams (20+) because of their relatively larger scale.</p>

#<h1>Question 23 - Does your current employer incorporate machine learning methods into their business?</h1>

# Q23: whether the employer uses ML; the first non-null entry (row 0, the
# question text) is dropped before counting.
q23_answers = data["Q23"].dropna().to_numpy()[1:]
ML_business, counts = np.unique(q23_answers, return_counts=True)

palette = ["forestgreen", "green", "springgreen", "aquamarine", "darkslategrey", "aqua"]
plt.figure(figsize=(15, 5))
plt.barh(ML_business, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Use")
plt.title("Use of ML by Current Employer")
plt.tight_layout()

#<p><b>Inference : </b>Majority of the industries belong to the category where the ML domain is only getting explored or not even considered for using but at the same time, there are a good number of industries who have recently started or already use ML methods.</p>

#<h1>Question 24 - Select any activities that make up an important part of your role at work</h1>

# Q24 (multi-select): columns Q24_Part_1 .. Q24_OTHER, both ends included.
activities_work = data.loc[:, "Q24_Part_1":"Q24_OTHER"]
chosen = []
for column in activities_work.columns:
    answers = activities_work[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
activities, counts = np.unique(chosen, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine",
    "darkslategrey", "aqua", "powderblue", "lightskyblue",
]
plt.figure(figsize=(15, 5))
plt.barh(activities, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Activity")
plt.title("Major Activity at work")
plt.tight_layout()

#<p><b>Inference : </b>Industries often use analytics to drive business decisions, while a good number build ML into production or run experiments to find new areas for ML or to improve existing models. There are also industries whose data science workloads relate to data storage and analytics infrastructure.</p>

#<h1>Question 25 - What is your current yearly compensation (approximate $USD)?</h1>

# Q25: yearly compensation bracket; the first non-null entry (row 0, the
# question text) is dropped before counting.
q25_answers = data["Q25"].dropna().to_numpy()[1:]
compensation, counts = np.unique(q25_answers, return_counts=True)

palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
plt.figure(figsize=(15, 10))
plt.barh(compensation, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Count")
plt.ylabel("Compensation Range")
plt.title("Current Yearly Compensation")
plt.tight_layout()

#<p><b>Inference : </b><b>0-999 USD</b> is the most chosen Compensation.</p>

#<h1>Question 26 - Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?</h1>

# Q26: money spent on ML / cloud services; the first non-null entry (row 0,
# the question text) is dropped before counting.
q26_answers = data["Q26"].dropna().to_numpy()[1:]
money_spent, counts = np.unique(q26_answers, return_counts=True)

palette = ["forestgreen", "green", "springgreen", "aquamarine", "darkslategrey", "aqua"]
plt.figure(figsize=(15, 5))
plt.bar(money_spent, counts, color=palette, zorder=3)
# White grid drawn underneath the bars.
plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
plt.tick_params(bottom=False, left=False)
plt.xlabel("Money Spent")
plt.ylabel("Count")
plt.title("Money spent on ML / Cloud Computing Services at Home or at Work in past 5 years")
plt.tight_layout()

# One panel per Q26 spending bracket, each showing the Q5 (profession)
# distribution of the respondents in that bracket.
plt.figure(figsize=(15, 30))
unique, counts = np.unique(data["Q26"].dropna().to_numpy()[1:], return_counts=True)
bracket_total = len(unique)
palette = [
    "forestgreen", "green", "springgreen", "aquamarine", "darkslategrey",
    "aqua", "powderblue", "lightskyblue", "lightslategray", "lightsteelblue",
]
# NOTE(review): the grid has `bracket_total` rows x 2 columns but only
# `bracket_total` panels are drawn, so the lower half of the figure stays empty.
for panel, bracket in enumerate(unique, start=1):
    plt.subplot(bracket_total, 2, panel)
    professions = data.loc[data["Q26"] == bracket, "Q5"].dropna().values
    un, counts = np.unique(professions, return_counts=True)
    plt.barh(un, counts, color=palette, zorder=3)
    # White grid drawn underneath the bars.
    plt.grid(True, color="white", linestyle="solid", linewidth=2.0, alpha=0.5, zorder=0)
    plt.tick_params(bottom=False, left=False)
    plt.xlabel("Count")
    plt.ylabel("Profession")
    plt.title(bracket)
plt.tight_layout()

#<p><b>Inference : </b>Most people do not spend much on Cloud Services, although a few do. The larger spending mostly comes from work, while the smaller amounts may belong to personal projects, learning, etc. The people spending on Cloud Services are mainly <b>Software Engineers</b>, <b>Data Scientists</b> or <b>Data Analysts</b>.</p>

#<h1>Question 27 Part A - Which of the following cloud computing platforms do you use on a regular basis?</h1>

# Q27-A (multi-select): columns Q27_A_Part_1 .. Q27_A_OTHER, both ends included.
platforms_used = data.loc[:, "Q27_A_Part_1":"Q27_A_OTHER"]
chosen = []
for column in platforms_used.columns:
    answers = platforms_used[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
platforms, counts = np.unique(chosen, return_counts=True)

# Left half of a two-panel figure.
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.title("     Cloud Computing Platforms used on Regular basis")
plt.pie(counts, labels=platforms)
#plt.legend(loc="lower right",bbox_to_anchor=(0,1))


#<p><b>Inference : </b><b>AWS</b> is used on a large number followed by <b>GCP</b> and <b>Azure</b>. They can be either for personal work or for Professional Purpose.</p>
#<h1>Question 27 Part B -Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years?</h1>

# Q27-B (multi-select): columns Q27_B_Part_1 .. Q27_B_OTHER, both ends included.
platforms_familiar = data.loc[:, "Q27_B_Part_1":"Q27_B_OTHER"]
chosen = []
for column in platforms_familiar.columns:
    answers = platforms_familiar[column]
    # Row 0 holds the question text; its third "-"-separated piece is used as
    # the option label and is tallied once together with the real responses.
    chosen.append(answers.loc[0].split("-")[2].strip())
    chosen.extend(text.strip() for text in answers.iloc[1:].dropna().values)
platforms, counts = np.unique(chosen, return_counts=True)

# Right half of the current figure.
plt.subplot(1, 2, 2)
plt.title("     Cloud Computing Platforms preferred to become familiar")
plt.pie(counts, labels=platforms)
#plt.legend(loc="lower right",bbox_to_anchor=(0,1))
plt.tight_layout()

#<p><b>Inference : </b>The commonly used <b>AWS</b>, <b>GCP</b> and <b>Azure</b> are the ones people prefer to get familiar.</p>

#<h1>Question 28 - Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)?</h1>

# Q28: count the answers for the cloud platform with the best developer
# experience. Missing answers are dropped and the first remaining entry is
# skipped (presumably the question-text row of the survey file).
q28_answers = data["Q28"].dropna().to_numpy()
best_platform, counts = np.unique(q28_answers[1:], return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,8))
# zorder keeps the bars (3) drawn above the white grid lines (0).
ax.barh(best_platform, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Cloud Computing Platforms with best Developer Experience")
ax.set_ylabel("Platform")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>AWS</b> has a good number of votes followed by <b>GCP</b> and <b>Azure</b>. Large group of people also find all of the cloud platforms to have a similar user experience.</p>

#<h1>Question 29 Part A - Do you use any of the following cloud computing products on a regular basis?</h1>

# Q29 Part A: tally the cloud computing products used on a regular basis.
# The option label (third "-"-separated piece of row 0) is counted once per
# column so every option shows up; each count therefore includes that entry.
products_used = data.loc[:, "Q29_A_Part_1":"Q29_A_OTHER"]
products = []
for question in products_used.columns:
    answers = products_used[question]
    products.append(answers[0].split("-")[2].strip())
    products.extend(answer.strip() for answer in answers[1:].dropna())
products, counts = np.unique(products, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey']
fig, ax = plt.subplots(figsize=(15,5))
# Bars sit above the white grid lines thanks to the higher zorder.
ax.bar(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Cloud Computing Products used on Regular basis")
ax.set_xlabel("Product")
ax.set_ylabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>Cloud VMs</b> are the mostly used product.</p>

#<h1>Question 29 Part B - In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products? </h1>

# Q29 Part B: tally the cloud computing products respondents hope to learn.
# Labels come from row 0 of each column and are counted once together with
# the non-missing answers below them.
products_preferred = data.loc[:, "Q29_B_Part_1":"Q29_B_OTHER"]
option_labels = [
    products_preferred[name][0].split("-")[2].strip()
    for name in products_preferred.columns
]
selections = [
    answer.strip()
    for name in products_preferred.columns
    for answer in products_preferred[name][1:].dropna().values
]
products, counts = np.unique(option_labels + selections, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey']
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
# White grid drawn beneath the bars (zorder 0 < 3).
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Cloud Computing Products preferred to become familiar")
ax.set_xlabel("Product")
ax.set_ylabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Again, <b>Cloud VMs</b> are the preferred product to get familiar with. <b>GCP</b> is the most chosen platform, followed by <b>AWS</b> and <b>Azure</b>.</p>

#<h1>Question 30 Part A - Do you use any of the following data storage products on a regular basis?</h1>

# Q30 Part A: tally the data storage products used on a regular basis.
# Each column contributes its option label (parsed from row 0) once, plus
# every non-missing answer from the rows after it.
products_used = data.loc[:, "Q30_A_Part_1":"Q30_A_OTHER"]
products = []
for question in products_used.columns:
    answers = products_used[question]
    products.append(answers[0].split("-")[2].strip())
    products.extend(answer.strip() for answer in answers[1:].dropna())
products, counts = np.unique(products, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Storage Products used on Regular basis")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Basic storage products like <b>GCS</b> and <b>S3</b> are more commonly used.</p>

#<h1>Question 30 Part B - In the next 2 years, do you hope to become more familiar with any of these specific data storage products?</h1>

# Q30 Part B: tally the data storage products respondents hope to learn.
# The option label from row 0 is seeded once per column so that options
# nobody picked still get a bar.
products_preferred = data.loc[:, "Q30_B_Part_1":"Q30_B_OTHER"]
option_labels = [
    products_preferred[name][0].split("-")[2].strip()
    for name in products_preferred.columns
]
selections = [
    answer.strip()
    for name in products_preferred.columns
    for answer in products_preferred[name][1:].dropna().values
]
products, counts = np.unique(option_labels + selections, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Storage Products preferred to become familiar with")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Really few votes for this question - But all products are equally preferred.</p>

#<h1>Question 31 Part A - Do you use any of the following managed machine learning products on a regular basis?</h1>

# Q31 Part A: tally the managed machine learning products used regularly.
# Row 0 of each column is the question text; the third "-"-separated piece
# is used as the option label and counted once with the real answers.
ml_products_used = data.loc[:, "Q31_A_Part_1":"Q31_A_OTHER"]
ml_products = []
for question in ml_products_used.columns:
    answers = ml_products_used[question]
    ml_products.append(answers[0].split("-")[2].strip())
    ml_products.extend(answer.strip() for answer in answers[1:].dropna())
ml_products, counts = np.unique(ml_products, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(ml_products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Managed ML Products used on Regular basis")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Managed ML Products</p>

#<h1>Question 31 Part B - In the next 2 years, do you hope to become more familiar with any of these managed machine learning products?</h1>

# Q31 Part B: tally the managed ML products respondents hope to learn.
# Option labels parsed from row 0 are added once each, then every stripped,
# non-missing answer is counted.
ml_products_preferred = data.loc[:, "Q31_B_Part_1":"Q31_B_OTHER"]
option_labels = [
    ml_products_preferred[name][0].split("-")[2].strip()
    for name in ml_products_preferred.columns
]
selections = [
    answer.strip()
    for name in ml_products_preferred.columns
    for answer in ml_products_preferred[name][1:].dropna().values
]
ml_products, counts = np.unique(option_labels + selections, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(ml_products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Managed ML Products preferred to get familiar with")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Among people who want to learn managed ML products, <b>Google Cloud Vertex AI</b>, <b>Azure ML Studio</b> and <b>Amazon SageMaker</b> are the preferred ones.</p>

#<h1>Question 32 Part A - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis?</h1>

# Q32 Part A: tally the big data products used on a regular basis.
# Each column's option label (third "-"-separated piece of row 0) is counted
# once so that every option is present in the chart.
products_used = data.loc[:, "Q32_A_Part_1":"Q32_A_OTHER"]
products = []
for question in products_used.columns:
    answers = products_used[question]
    products.append(answers[0].split("-")[2].strip())
    products.extend(answer.strip() for answer in answers[1:].dropna())
products, counts = np.unique(products, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Big Data Products used on Regular basis")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>SQL related databases are used most followed by Big Data Processing systems like <b>BigQuery</b>, <b>BigTable</b> etc.</p>

#<h1>Question 32 Part B - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years?</h1>

# Q32 Part B: tally the big data products respondents hope to become more
# familiar with. The option label parsed from row 0 of each column is counted
# once alongside the non-missing answers, so every option gets a bar and each
# count includes that one seeded entry.
products_preferred = data.loc[:, "Q32_B_Part_1":"Q32_B_OTHER"]
products = []
for question in products_preferred.columns:
    answers = products_preferred[question]
    products.append(answers[0].split("-")[2].strip())
    products.extend(answer.strip() for answer in answers[1:].dropna())
products, counts = np.unique(products, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
# zorder keeps the bars (3) above the white grid lines (0).
ax.barh(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# The title previously repeated Part A's "used on Regular basis" wording.
ax.set_title("Big Data Products preferred to become familiar with")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>SQL related databases are used mostly followed by Big Data Processing Systems like <b>BigQuery</b>, <b>BigTable</b>, <b>Redshift</b> etc. </p>

#<h1>Question 33 - Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?</h1>

# Q33: count the answers for the big data product used most often.
# Missing answers are dropped and the first remaining entry is skipped
# (presumably the question-text row of the survey file).
q33_answers = data["Q33"].dropna().to_numpy()
products, counts = np.unique(q33_answers[1:], return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(products, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Big Data Products used most often")
ax.set_ylabel("Product")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>SQL related databases are used most followed by Big Data Processing systems like <b>BigQuery</b>, <b>BigTable</b> etc.</p>

#<h1>Question 34 Part A - Which of the following business intelligence tools do you use on a regular basis?</h1>

# Q34 Part A: tally the business intelligence tools used on a regular basis.
# The option label (third "-"-separated piece of row 0) is counted once per
# column so every option appears; each count includes that seeded entry.
bi_used = data.loc[:, "Q34_A_Part_1":"Q34_A_OTHER"]
bi = []
for question in bi_used.columns:
    answers = bi_used[question]
    bi.append(answers[0].split("-")[2].strip())
    bi.extend(answer.strip() for answer in answers[1:].dropna())
bi, counts = np.unique(bi, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(bi, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# Title corrected: the question is about *business* intelligence tools.
ax.set_title("Business Intelligence Tools used on Regular basis")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are used on a regular basis, although a good number of users don't use any business intelligence tools.</p>

#<h1>Question 34 Part B - Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years?</h1>

# Q34 Part B: tally the business intelligence tools respondents hope to
# learn. Option labels parsed from row 0 are counted once each together with
# the stripped, non-missing answers from the remaining rows.
tools_preferred = data.loc[:, "Q34_B_Part_1":"Q34_B_OTHER"]
tools = []
for question in tools_preferred.columns:
    answers = tools_preferred[question]
    tools.append(answers[0].split("-")[2].strip())
    tools.extend(answer.strip() for answer in answers[1:].dropna())
tools, counts = np.unique(tools, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# Title corrected: the question is about *business* intelligence tools.
ax.set_title("Business Intelligence Tools preferred to become familiar")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are the tools preferred to become familiar</p>

#<h1>Question 35 - Which of the following business intelligence tools do you use most often?</h1>

# Q35: count the answers for the business intelligence tool used most often.
# Missing answers are dropped and the first remaining entry is skipped
# (presumably the question-text row of the survey file).
bi_used = data["Q35"].dropna().to_numpy()[1:]
bi_used, counts = np.unique(bi_used, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(bi_used, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# Title corrected: the question is about *business* intelligence tools.
ax.set_title("Business Intelligence Tools used most often")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>Tableau</b>, <b>Power BI</b> and <b>Data Studio</b> are the tools used most often.</p>

#<h1>Question 36 Part A - Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?</h1>

# Q36 Part A: tally the categories of AutoML tools used on a regular basis.
# Row 0 of each column is the question text, laid out as
# "<question> - <marker> - <option label>". The label is counted once per
# column so every option gets a bar; each count includes that seeded entry.
tools_used = data.loc[:, "Q36_A_Part_1":"Q36_A_OTHER"]
tools = []
for question in tools_used.columns:
    answers = tools_used[question]
    # maxsplit=2 keeps any "-" that is part of the option label itself.
    # Splitting on every "-" cut such a label short, so it no longer matched
    # the answers and produced a duplicate category -- presumably why entries
    # 4 and 5 used to be merged by hard-coded position.
    tools.append(answers[0].split("-", 2)[2].strip())
    tools.extend(answer.strip() for answer in answers[1:].dropna())
tools, counts = np.unique(tools, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# The question asks about regular use, not the single most-used tool.
ax.set_title("Auto ML Tool Categories used on Regular basis")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>The majority of users do not use any Auto ML tools.</p>

#<h1>Question 36 Part B - Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years? </h1>

# Q36 Part B: tally the categories of AutoML tools respondents hope to learn.
# Row 0 of each column is the question text, laid out as
# "<question> - <marker> - <option label>". The label is counted once per
# column so every option gets a bar; each count includes that seeded entry.
tools_preferred = data.loc[:, "Q36_B_Part_1":"Q36_B_OTHER"]
tools = []
for question in tools_preferred.columns:
    answers = tools_preferred[question]
    # maxsplit=2 keeps any "-" that is part of the option label itself.
    # Splitting on every "-" cut such a label short, so it no longer matched
    # the answers and produced a duplicate category -- presumably why entries
    # 4 and 5 used to be merged by hard-coded position.
    tools.append(answers[0].split("-", 2)[2].strip())
    tools.extend(answer.strip() for answer in answers[1:].dropna())
tools, counts = np.unique(tools, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Auto ML Tools preferred to get familiar")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Users mostly prefer tools related to the automation of <b>ML Pipelines</b>, <b>Model Selection</b>, <b>Hyperparameter Tuning</b>, <b>Feature Engineering</b> and <b>Data Augmentation</b>.</p>

#<h1>Question 37 Part A - Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?</h1>

# Q37 Part A: tally the specific AutoML tools used on a regular basis.
# The option label (third "-"-separated piece of row 0) is counted once per
# column so that every option appears in the chart.
tools_used = data.loc[:, "Q37_A_Part_1":"Q37_A_OTHER"]
option_labels = [
    tools_used[name][0].split("-")[2].strip()
    for name in tools_used.columns
]
selections = [
    answer.strip()
    for name in tools_used.columns
    for answer in tools_used[name][1:].dropna().values
]
tools, counts = np.unique(option_labels + selections, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Auto ML Tools used on Regular basis")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>The majority of users do not use any Auto ML products on a regular basis, but there is a good number of users of <b>Google Cloud AutoML</b>, <b>Azure Automated ML</b> and <b>AWS Sagemaker</b>.</p>

#<h1>Question 37 Part B - Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?</h1>

# Q37 Part B: tally the specific AutoML tools respondents hope to learn.
# Labels parsed from row 0 are seeded once each, then all stripped,
# non-missing answers from the remaining rows are counted.
tools_preferred = data.loc[:, "Q37_B_Part_1":"Q37_B_OTHER"]
tools = []
for question in tools_preferred.columns:
    answers = tools_preferred[question]
    tools.append(answers[0].split("-")[2].strip())
    tools.extend(answer.strip() for answer in answers[1:].dropna())
tools, counts = np.unique(tools, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Auto ML Tools preferred to become familiar")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>The most commonly heard-of tools, <b>Google Cloud AutoML</b>, <b>Azure Automated Machine Learning</b> and <b>AWS Sagemaker</b>, are the ones users prefer to get familiar with.</p>

#<h1>Question 38 Part A - Do you use any tools to help manage machine learning experiments?</h1>

# Q38 Part A: tally the tools used to manage machine learning experiments.
# Each column's option label (third "-"-separated piece of row 0) is counted
# once together with the column's non-missing answers.
tools_used = data.loc[:, "Q38_A_Part_1":"Q38_A_OTHER"]
tools = []
for question in tools_used.columns:
    answers = tools_used[question]
    tools.append(answers[0].split("-")[2].strip())
    tools.extend(answer.strip() for answer in answers[1:].dropna())
tools, counts = np.unique(tools, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Tools used to manage Machine Learning Projects")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Majority of the users do not use any tools for managing ML Products, although there are a good number of users opting for <b>Weights & Biases</b>, <b>TensorBoard</b> and <b>MLflow</b>.</p>

#<h1>Question 38 Part B - In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments</h1>

# Q38 Part B: tally the experiment-management tools respondents hope to
# learn. Option labels from row 0 are seeded once so every option has a bar.
tools_preferred = data.loc[:, "Q38_B_Part_1":"Q38_B_OTHER"]
option_labels = [
    tools_preferred[name][0].split("-")[2].strip()
    for name in tools_preferred.columns
]
selections = [
    answer.strip()
    for name in tools_preferred.columns
    for answer in tools_preferred[name][1:].dropna().values
]
tools, counts = np.unique(option_labels + selections, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Tools preferred to become familiar")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Majority of the users do not prefer any tools for managing ML Projects, but there are good number of votes for <b>Weights & Biases</b>, <b>Tensorboard</b> and <b>MLflow</b>.</p>

#<h1>Question 39 - Where do you publicly share your data analysis or machine learning applications?</h1>

# Q39: tally where respondents publicly share their analysis / ML work.
# The option label (third "-"-separated piece of row 0) is counted once per
# column alongside the stripped, non-missing answers.
share_loc = data.loc[:, "Q39_Part_1":"Q39_OTHER"]
share = []
for question in share_loc.columns:
    answers = share_loc[question]
    share.append(answers[0].split("-")[2].strip())
    share.extend(answer.strip() for answer in answers[1:].dropna())
share, counts = np.unique(share, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(share, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Platform used for sharing projects publicly")
ax.set_ylabel("Platform")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Users who share prefer <b>Personal Blog</b>, <b>Kaggle</b>, <b>GitHub</b> or <b>Colab</b> as their platform while a good number of users do not prefer sharing as well.</p>

#<h1>Question 40 - On which platforms have you begun or completed data science courses?</h1>

# Q40: tally the platforms on which respondents began or completed data
# science courses. Row 0 of each column is the question text, laid out as
# "<question> - <marker> - <option label>". The label is counted once per
# column so every option gets a bar; each count includes that seeded entry.
platforms_used = data.loc[:, "Q40_Part_1":"Q40_OTHER"]
platform = []
for question in platforms_used.columns:
    answers = platforms_used[question]
    # maxsplit=2 keeps any "-" that is part of the option label itself.
    # Splitting on every "-" cut such a label short, so it no longer matched
    # the answers and produced a duplicate category -- presumably why entries
    # 0 and 1 used to be merged by hard-coded position.
    platform.append(answers[0].split("-", 2)[2].strip())
    platform.extend(answer.strip() for answer in answers[1:].dropna())
platform, counts = np.unique(platform, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(platform, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
# No legend: the bars carry no `label`, so a legend call would draw nothing
# and only emit a "no artists with labels" warning.
ax.set_title("     Platform used for learning Data Science")
ax.set_ylabel("Platform")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b>Among external sources, <b>Coursera</b>, <b>Kaggle Courses</b> and <b>Udemy</b> stand out. In the case of formal education, <b>University Courses</b> or a <b>Degree</b> is preferred.</p>

#<h1>Question 41 - What is the primary tool that you use at work or school to analyze data?</h1>

# Q41: count the answers for the primary tool used to analyze data.
# Missing answers are dropped and the first remaining entry is skipped
# (presumably the question-text row of the survey file).
tools_used = data["Q41"].dropna().to_numpy()[1:]
tools, counts = np.unique(tools_used, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey','aqua']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(tools, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Primary Tool used to Analyze data")
ax.set_ylabel("Tool")
ax.set_xlabel("Count")
fig.tight_layout()

#<p><b>Inference : </b><b>Local IDEs</b> or <b>Basic Statistical Software</b> is used the most to analyze data.</p>

#<h1>Question 42 - Who/what are your favorite media sources that report on data science topics?</h1>

# Q42: tally respondents' favourite media sources for data science topics.
# Row 0 of each column is the question text, laid out as
# "<question> - <marker> - <option label>". The label is counted once per
# column so every option gets a bar; each count includes that seeded entry.
media_sources = data.loc[:, "Q42_Part_1":"Q42_OTHER"]
sources = []
for question in media_sources.columns:
    answers = media_sources[question]
    # maxsplit=2 keeps any "-" that is part of the option label itself.
    # Splitting on every "-" cut such a label short, so it no longer matched
    # the answers and produced a duplicate category -- presumably why entries
    # 3 and 4 used to be merged by hard-coded position.
    sources.append(answers[0].split("-", 2)[2].strip())
    sources.extend(answer.strip() for answer in answers[1:].dropna())
sources, counts = np.unique(sources, return_counts=True)

palette = ['forestgreen','green','springgreen','aquamarine','darkslategrey',
           'aqua','powderblue','lightskyblue','lightslategray','lightsteelblue']
fig, ax = plt.subplots(figsize=(15,7))
ax.barh(sources, counts, zorder=3, color=palette)
ax.tick_params(bottom=False, left=False)
ax.grid(True, color='white', linestyle='solid', linewidth=2.0, alpha=0.5, zorder=0)
ax.set_title("Media Sources used that report Data Science Topics")
ax.set_ylabel("Source")
ax.set_xlabel("Count")
fig.tight_layout()

# Writes the figure that is current at this point (the chart built above).
plt.savefig("fig.png")
#<p><b>Inference : </b><b>Blogs</b>, <b>Kaggle</b> and <b>Youtube Channels</b> are most preferred platforms that report Data Science topics.</p>