## **2020 Kaggle ML and DS Survey Results**

The challenge objective: tell a data story about a subset of the data science community represented in this survey, through a combination of both narrative text and data exploration. A “story” could be defined any number of ways, and that’s deliberate. The challenge is to deeply explore (through data) the impact, priorities, or concerns of a specific group of data science and machine learning practitioners. That group can be defined in the macro (for example: anyone who does most of their coding in Python) or the micro (for example: female data science students studying machine learning in masters programs).

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
import plotly
import plotly.offline as py
from plotly.offline import iplot
import plotly.io as pio
import cufflinks as cf
import plotly.express as px
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)


# **Reading Data**

In [None]:
# Load the raw survey responses (read_csv splits on commas by default).
data = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
)

In [None]:
# Promote the first data row to be the column header.
# NOTE(review): presumably that row holds the question text of each column - confirm against the CSV.
new_header = data.iloc[0]  # grab the first row for the header
data = data[1:]  # keep every row after the header row
data.columns = new_header  # relabels the frame in place; no extra rename() call is needed
data.head()

# **EDA**

Basic information about the people who filled in the survey

In [None]:
# Slice columns 1-6 of the survey and give them short, readable names
# (presumably the respondent-profile questions - the names below are the only evidence here).
basic_information = data.iloc[:, 1:7]
basic_information.columns = [
    'AgeRange',
    'DataScientistGender',
    'Country',
    'CurrentEducation',
    'CurrentRole',
    'CodingExperience',
]

In [None]:
# Interactive bar chart: number of respondents per age range, most common range first.
AgeRange = basic_information.loc[:, 'AgeRange'].value_counts()
AgeRange.iplot(
    kind="bar",
    color='green',
    title='KAGGLE USERS AGE',
    size=15,
)

In [None]:
# Count respondents per gender answer and show the counts as a bar chart.
# value_counts() leaves out missing answers, so no separate null check is required.
scientist_gender = (
    basic_information['DataScientistGender']
    .value_counts()
    .to_frame('DataScientistCount')
)
scientist_gender.index.names = ['Gender']
fig = px.bar(scientist_gender, y='DataScientistCount', x=scientist_gender.index, text='DataScientistCount', title='K A G G L E  U S E R S  G E N D E R')
# Print the count above each bar, rounded to two significant digits.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

In [None]:
# Pie chart: share of respondents per gender answer.
fig = px.pie(
    scientist_gender,
    values='DataScientistCount',
    names=scientist_gender.index,
    title='K A G G L E R S  G E N D E R  D I S T R I B U T I O N',
    height=600,
)
fig.show()

## **Kagglers Countries**

In [None]:
# Respondents per country, most common first.
# The Series is named 'Country' explicitly because the charts refer to the
# counts by that name; newer pandas releases name value_counts() results 'count'.
country = basic_information['Country'].value_counts().rename('Country')
country.index.names = ['Countries']
fig = px.bar(country, x=country.index, y='Country',
             hover_data=[country.index, 'Country'], color='Country',
             labels={'Country':'Count'}, height=800,title='K A G G L E  U S E R S  B Y  C O U N T R Y')
fig.show()

In [None]:
# Pie chart: each country's share of all respondents.
fig = px.pie(
    country,
    values='Country',
    names=country.index,
    title='Percentage of kagglers distributed by countries',
    height=1000,
)
fig.show()

In [None]:
# Country word cloud in which every country name is sized by its respondent count.
# Passing the counts directly keeps multi-word names intact; joining the names
# into one text would split them into separate words that each occur only once.
wordcloud = WordCloud(width=600, height=300, random_state=21, max_font_size=119,max_words=300,background_color='white').generate_from_frequencies(country.to_dict())
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
def _country_counts_for(gender_frame, gender):
    """Return per-country respondent counts for one gender answer.

    gender_frame must have 'DataScientistGender' and 'Country' columns.
    The result is a one-column DataFrame ('Count') indexed by country
    (index name 'Countries'), in the order value_counts() produces.
    """
    picked = gender_frame[gender_frame['DataScientistGender'] == gender]
    counts = pd.DataFrame(picked['Country'].value_counts())
    counts.columns = ['Count']
    counts.index.names = ['Countries']
    return counts


def _show_country_bar(counts, title):
    """Show a bar chart of per-country counts under the given title and return the figure."""
    chart = px.bar(counts, y='Count', x=counts.index, text='Count', color='Count')
    # Print the count above each bar, rounded to two significant digits.
    chart.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    chart.update_layout(uniformtext_minsize=10, uniformtext_mode='hide', height=900, title=title)
    chart.show()
    return chart


#Filter Genders by Country
countryAndGender = basic_information[['DataScientistGender','Country']]

#One count table per gender answer
mancountry = _country_counts_for(countryAndGender, 'Man')
womancountry = _country_counts_for(countryAndGender, 'Woman')
prefer_not_country = _country_counts_for(countryAndGender, 'Prefer not to say')
self_describe_country = _country_counts_for(countryAndGender, 'Prefer to self-describe')
nonBinaryCountry = _country_counts_for(countryAndGender, 'Nonbinary')

#One bar chart per gender answer, in the same order as the tables above
fig = _show_country_bar(mancountry, 'M A L E  U S E R S')
fig = _show_country_bar(womancountry, 'F E M A L E  U S E R S')
fig = _show_country_bar(prefer_not_country, 'P R E F E R  N O T  T O S H A R E   U S E R S')
fig = _show_country_bar(self_describe_country, 'S E L F  D E S C R I B E  U S E R S')
fig = _show_country_bar(nonBinaryCountry, 'N O N  B I N A R Y  U S E R S')

In [None]:
# One pie chart per gender answer, showing each country's share of that group.
for _counts, _title in (
    (mancountry, 'Percentage of kagglers by male gender for each country'),
    (womancountry, 'Percentage of kagglers by Woman gender for each country'),
    (prefer_not_country, 'Percentage of kagglers by "Perefer not to say" gender for each country'),
    (self_describe_country, 'Percentage of kagglers by "Prefer to self-describe" gender for each country'),
    (nonBinaryCountry, 'Percentage of kagglers by Nonbinary gender for each country'),
):
    fig = px.pie(_counts, values='Count', names=_counts.index, title=_title, height=950)
    fig.show()

In [None]:
# Build a two-column table (CurrentEducation, Count) from the education answers.
currentEducation = (
    basic_information[['CurrentEducation']]
    .value_counts()
    .to_frame('Count')
    .reset_index(level=0)
)
# Pie chart: share of respondents per education level.
fig = px.pie(
    currentEducation,
    values='Count',
    title='K A G G L E R S  C U R R E N T  E D U C A T I O N',
    height=700,
    names='CurrentEducation',
)
fig.show()

In [None]:
# Respondents per coding-experience answer, most common first.
codingExp = basic_information['CodingExperience'].value_counts()
expCountlist = codingExp.tolist()
# Take the x labels from the data itself so that every answer keeps its own
# count; a hand-written label list can fall out of step with the counts
# (different length or different order).
expLabels = codingExp.index.tolist()
# Share of each answer among all counted answers, rounded to whole percent.
expPercent = (codingExp / codingExp.sum() * 100).round().astype(int).tolist()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=expLabels,
    y=expCountlist,
    marker=dict(color="green", size=12),
    mode="markers",
    name ='Programmers Count Based on Years Writing Code',
))
fig.add_trace(go.Scatter(
    x=expLabels,
    y=expPercent,
    marker=dict(color="red", size=12),
    mode="markers",
    name="% Percentage",
))

fig.update_layout(title="W R I T I N G  C O  D E  E X P E R I E N C E  C O U N T  B A S E D  O N  Y E A R S  D O I N G  I T",
                  xaxis_title="Years Writing Code",
                  yaxis_title="Count Based on Years")

fig.show()


In [None]:
#Programming Language use
# Columns 7-19 are counted one by one (presumably one column per language option).
progBasis = data.iloc[:,7:20]
progBasis.columns = ['language1','language2','language3','language4','language5','language6','language7','language8','language9','language10','language11','language12','language13']
# One count Series per column, unpacked in column order.
(python, R, SQL, C, Cplus, java, javascript, julia, swift, bash, matlab,
 none, other) = (progBasis[col].value_counts() for col in progBasis.columns)
# Stack the per-column counts into one Series indexed by the answer text.
progBasis = pd.concat([python,R,SQL,C,Cplus,java,javascript,julia,swift,bash,matlab,none,other])
progBasis.index.names = ['Programming_Languages']
fig = px.bar(progBasis, x=progBasis, y=progBasis.index, color=progBasis.index, orientation='h',
             height=500,)
fig.update_layout(title="P R O G R A M M I N G  L A N G U A G E S  U S E D  B Y  K A G G L E R S",
                  xaxis_title="Count",
                  yaxis_title='Programming Languages')
fig.show()

#Percentage (the title names programming languages, matching the data shown)
fig = px.pie(progBasis, values=progBasis, title='D I S T R I B U T I O N  O F  P R O G R A M M I N G  L A N G U A G E S  U S E D  B Y  K A G G L E R S',height=700,names=progBasis.index)
fig.show()

#Programming language word cloud, each name sized by its count.
# Passing the counts directly also keeps names that text tokenizing would drop or split.
wordcloud = WordCloud(width=600, height=300, random_state=21, max_font_size=119,max_words=300,background_color='pink').generate_from_frequencies(progBasis.to_dict())
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
#Selecting columns related to IDEs
# Columns 21-32 are counted one by one (presumably one column per IDE option).
IDE = data.iloc[:,21:33]
IDE.columns = ['IDE1', 'IDE2', 'IDE3', 'IDE4', 'IDE5', 'IDE6', 'IDE7', 'IDE8', 'IDE9', 'IDE10', 'IDE11', 'IDE12']
# One count Series per column, unpacked in column order.
(jupyter, rstudio, visualStudio, visualStudioCode, pycharm, spyder,
 notepadplus, sublimetext, vim, matlab, none, other) = (
    IDE[col].value_counts() for col in IDE.columns
)
# Stack the per-column counts into one Series indexed by the answer text.
IDE = pd.concat([jupyter,rstudio,visualStudio,visualStudioCode,pycharm,spyder,notepadplus,sublimetext,vim,matlab,none,other])
IDE.index.names = ['IDE']
# The chart is built from the IDE counts themselves, not from another question's data.
fig = px.bar(IDE, x=IDE, y=IDE.index, color=IDE.index, orientation='h',
             height=500,)
fig.update_layout(title="C O U N T  O F  E N V I R O N M E N T S  U S E D  B Y  K A G G L E R S",
                  xaxis_title="Count",
                  yaxis_title='Integrated development environment')
fig.show()

#Percentage
fig = px.pie(IDE, values=IDE, title='D I S T R I B U T I O N  O F  "IDE"  U S E D  B Y  M O S T  K A G G L E R S',height=700,names=IDE.index)
fig.show()

In [None]:
# Select the hosted-notebook columns (33-46); each column is counted separately
# (presumably one column per answer option).
notebooks = data.iloc[:, 33:47]
notebooks.columns = [f'notebooks{i}' for i in range(1, 15)]
# One count Series per column, unpacked in column order.
(kagglenotebook, colab, azure, paperspace, binder, ocean, ibm, sagemaker,
 amazonEMR, googleplatform, googledatalab, databricks, none, other) = (
    notebooks[col].value_counts() for col in notebooks.columns
)
# Stack the per-column counts into a single Series indexed by the answer text.
notebooks = pd.concat([
    kagglenotebook, colab, azure, paperspace, binder, ocean, ibm,
    sagemaker, amazonEMR, googleplatform, googledatalab, databricks,
    none, other,
])
notebooks.index.names = ['notebooks']
# Horizontal bar chart of the counts.
fig = px.bar(
    notebooks,
    x=notebooks,
    y=notebooks.index,
    color=notebooks.index,
    orientation='h',
    height=500,
)
fig.update_layout(
    title="C O U N T  O F  E N V I R O N M E N T S  U S E D  B Y  K A G G L E R S",
    xaxis_title="Count",
    yaxis_title='Notebook',
)
fig.show()

# Pie chart of the same counts as shares.
fig = px.pie(
    notebooks,
    values=notebooks,
    title='D I S T R I B U T I O N  O F  N O T E B O O K S   U S E D  B Y  M O S T  K A G G L E R S',
    height=700,
    names=notebooks.index,
)
fig.show()

In [None]:
#Platform
# Count respondents per answer in column 47.
# value_counts() already leaves out missing answers, so the count table
# never contains NaN and needs no dropna().
dsplatform = data.iloc[:,47].value_counts().to_frame('platformcount')
dsplatform.index.names = ['platform']

#Plots
fig = px.bar(dsplatform , y='platformcount', x=dsplatform.index,text='platformcount',color='platformcount')
# Print the count above each bar, rounded to two significant digits.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide',height=900,title='COUNT OF COMPUTER PLATFORMS USED BY USERS')
fig.show()

#Percentage
fig = px.pie(dsplatform, values='platformcount', title='D I S T R I B U T I O N  O F  C O M P U T E R  P L A T F O R M S  U S E D  A T  D A T A  S C I E N C E  P R O J E C T S',height=900,names=dsplatform.index,hole=0.4)
fig.show()

In [None]:
# Hardware usage: columns 48-51 are counted one by one.
hardware = data.iloc[:, 48:52]
hardware.columns = [f'hardware{i}' for i in range(1, 5)]
# One count Series per column, unpacked in column order.
gpu, tpu, none, other = (hardware[col].value_counts() for col in hardware.columns)
# Stack the per-column counts into a single Series indexed by the answer text.
hardware = pd.concat([gpu, tpu, none, other])
hardware.index.names = ['hardware']
# Horizontal bar chart of the counts.
fig = px.bar(
    hardware,
    x=hardware,
    y=hardware.index,
    color=hardware.index,
    orientation='h',
    height=500,
)
fig.update_layout(
    title="H A R D W A R E  U S E D  B Y  K A G G L E R S",
    xaxis_title="Count",
    yaxis_title='Hardware',
)
fig.show()

# Donut chart of the same counts as shares.
fig = px.pie(
    hardware,
    values=hardware,
    title='D I S T R I B U T I O N  O F  H A R D WA R E  U S E D  B Y  M O S T  K A G G L E R S',
    height=700,
    names=hardware.index,
    hole=0.5,
)
fig.show()


In [None]:
# TPU usage frequency: count the answers in column 52.
tpu_use = data.iloc[:, 52].value_counts().to_frame('count')
tpu_use.index.names = ['tpu']
# Donut chart of the answer shares.
fig = px.pie(
    tpu_use,
    values='count',
    title='¿HOW MANY TIMES KAGGLERS HAVE USED A TPU?',
    height=700,
    names=tpu_use.index,
    color_discrete_sequence=px.colors.sequential.RdBu,
    hole=0.3,
)
fig.show()


In [None]:
#Visualization libraries use
# Columns 53-64 are counted one by one (presumably one column per library option).
libraries = data.iloc[:,53:65]
libraries.columns = ['library1','library2','library3','library4','library5','library6','library7','library8','library9','library10','library11','library12']
# One count Series per column, unpacked in column order. The third one is
# called plotly_lib so that it does not overwrite the imported plotly module.
(matplotlib, seaborn, plotly_lib, ggplot, shiny, d3, altair, bokeh,
 geoplotlib, folium, none, other) = (
    libraries[col].value_counts() for col in libraries.columns
)

#Stack the per-column counts into one Series indexed by the answer text
libraries = pd.concat([matplotlib,seaborn,plotly_lib,ggplot,shiny,d3,altair,bokeh,geoplotlib,folium,none,other])
libraries.index.names = ['library']

#plot
fig = px.bar(libraries, x=libraries, y=libraries.index, color=libraries.index, orientation='h',
             height=500)
fig.update_layout(title="L I B R A R I E S  U S E D  B Y  K A G G L E R S",
                  xaxis_title="Count",
                  yaxis_title='Library')
fig.show()

#Percentage
fig = px.pie(libraries , values=libraries  , title='L I B R A R I E S  U S E D  B Y  K A G G L E R S',height=700,names=libraries.index,color_discrete_sequence=px.colors.sequential.RdBu,hole=0.3)
fig.show()

In [None]:
# Years of machine-learning experience: count the answers in column 65.
ml_methods = data.iloc[:, 65].value_counts()
ml_methods.index.names = ['Time']

# Horizontal bar chart of the counts.
fig = px.bar(
    ml_methods,
    x=ml_methods,
    y=ml_methods.index,
    color=ml_methods.index,
    orientation='h',
    height=700,
)
fig.update_layout(
    title="¿For how many years have you used machine learning methods?",
    xaxis_title="Count",
    yaxis_title='Time',
)
fig.show()

# Donut chart of the same counts as shares.
fig = px.pie(
    ml_methods,
    values=ml_methods,
    title='DISTRIBUTION OF YEARS',
    height=700,
    names=ml_methods.index,
    color_discrete_sequence=px.colors.sequential.RdBu,
    hole=0.3,
)
fig.show()

In [None]:
#Which of the following machine learning frameworks do you use on a regular basis?
# Columns 66-81 are counted one by one (presumably one column per framework option).
ml_frameworks = data.iloc[:,66:82]
ml_frameworks.columns = ['frameworks1','frameworks2','frameworks3','frameworks4','frameworks5','frameworks6','frameworks7','frameworks8','frameworks9','frameworks10','frameworks11','frameworks12','frameworks13','frameworks14','frameworks15','frameworks16']
# One count Series per column, unpacked in column order.
(skn, tf, keras, pytorch, fast, mxnet, xgboost, lightgbm, catboost, prophet,
 h2o, caret, tidymodels, jax, none, other) = (
    ml_frameworks[col].value_counts() for col in ml_frameworks.columns
)

#Stack all sixteen per-column counts (lightgbm included) into one Series
ml_frameworks = pd.concat([skn,tf,keras,pytorch,fast,mxnet,xgboost,lightgbm,catboost,prophet,h2o,caret,tidymodels,jax,none,other])
ml_frameworks.index.names = ['frameworks']

#plot
fig = px.bar(ml_frameworks , x=ml_frameworks , y=ml_frameworks.index, color=ml_frameworks.index, orientation='h',
             height=700)
fig.update_layout(title="MACHINE LEARNING FRAMEWORK COUNT BY USERS",
                  xaxis_title="Count",
                  yaxis_title='FRAMEWORKS')
fig.show()

#Percentage
fig = px.pie(ml_frameworks ,values=ml_frameworks  , title='DISTRIBUTION OF FRAMEWORKS',height=700,names=ml_frameworks.index,color_discrete_sequence=px.colors.sequential.RdBu,hole=0.3)
fig.show()

#Machine learning frameworks word cloud, each name sized by its count
# (passing the counts directly keeps every framework name in one piece).
wordcloud = WordCloud(width=600, height=300, random_state=21, max_font_size=119,max_words=300,background_color='skyblue').generate_from_frequencies(ml_frameworks.to_dict())
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
#Which of the following ML algorithms do you use on a regular basis
# Columns 82-93 are counted one by one (presumably one column per algorithm option).
algorithms = data.iloc[:,82:94]
algorithms.columns = ['algorithm1','algorithm2','algorithm3','algorithm4','algorithm5','algorithm6','algorithm7','algorithm8','algorithm9','algorithm10','algorithm11','algorithm12']
# One count Series per column, unpacked in column order.
(lr, rf, xgboost_alg, bayesian, evolutionary, MLP, CNN, GAN, RNN, BERT,
 none, other) = (algorithms[col].value_counts() for col in algorithms.columns)

#Stack the per-column counts into one Series indexed by the answer text
algorithms = pd.concat([lr,rf,xgboost_alg,bayesian,evolutionary,MLP,CNN,GAN,RNN,BERT,none,other])
algorithms.index.names = ['Algorithm']

#Barchart: vertical bars, so algorithms run along x and counts along y
fig = px.bar(algorithms, x=algorithms.index, y=algorithms,color=algorithms.index)
fig.update_layout(title="ALGORITHMS COUNT BY USERS",
                  xaxis_title="Algorithms",
                  yaxis_title='Count')
fig.show()

#Percentage
fig = px.pie(algorithms ,values=algorithms  , title='DISTRIBUTION OF ALGORITHMS',height=700,names=algorithms.index,hole=0.3)
fig.show()

#Table with a count column and a percentage-of-total column per algorithm
algorithms = algorithms.to_frame('Count')
algorithms['Percentage'] = (algorithms['Count'] / algorithms['Count'].sum()) * 100
# Move the algorithm names from the index into an 'Algorithm' column for plotting.
algorithms.reset_index(level=0, inplace=True)

fig = px.scatter(algorithms, x="Count", y="Percentage", text='Algorithm', log_x=False, size_max=60,color='Algorithm')

fig.update_traces(textposition='top center')

fig.update_layout(
    height=900,
    title_text='PERCENTAGE AND COUNT BY ALGORITHM'
)

fig.show()

In [None]:
#Which categories of computer vision methods do you use on a regular basis
# NOTE(review): the algorithm columns end at 93 and this slice starts at 95,
# so column 94 is not used anywhere - confirm it is not the first option of
# this question.
computer_vision = data.iloc[:,95:101]
computer_vision.columns = ['computervision1','computervision2','computervision3','computervision4','computervision5','computervision6']
# One count Series per column, unpacked in column order.
(image_seg, object_det, image_classification, gan_cv, none, other) = (
    computer_vision[col].value_counts() for col in computer_vision.columns
)

#Stack the per-column counts into one Series indexed by the answer text
computer_vision = pd.concat ([image_seg, object_det, image_classification, gan_cv, none, other])
computer_vision.index.names = ['ComputerVision']


#Percentage
fig = px.pie(computer_vision ,values=computer_vision   , title='DISTRIBUTION OF COMPUTER VISION TECHNIQUES',height=1200,names=computer_vision.index,hole=0.3)
fig.show()



In [None]:
# NLP methods used regularly: columns 101-106 are counted one by one.
nlp = data.iloc[:, 101:107]
nlp.columns = [f'nlp{i}' for i in range(1, 7)]
# One count Series per column, unpacked in column order.
(word_embeddings, enconder_decoder, contextualized_embeddings,
 transformer_language, none, other) = (
    nlp[col].value_counts() for col in nlp.columns
)

# Stack the per-column counts into a single Series indexed by the answer text.
nlp = pd.concat([
    word_embeddings,
    enconder_decoder,
    contextualized_embeddings,
    transformer_language,
    none,
    other,
])
nlp.index.names = ['Natural Language Processing']

# Vertical bar chart with the count printed above each bar.
fig = px.bar(
    nlp,
    y=nlp,
    x=nlp.index,
    text=nlp,
    color=nlp.index,
    orientation='v',
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    height=1000,
    title='NATURAL LANGUAGE PROCESSING TECNHIQUES COUNT BY USERS',
)
fig.show()


# Donut chart of the same counts as shares.
fig = px.pie(
    nlp,
    values=nlp,
    title='DISTRIBUTION OF NLP TECHNIQUES',
    height=1200,
    names=nlp.index,
    color_discrete_sequence=px.colors.sequential.RdBu,
    hole=0.4,
)
fig.show()


In [None]:
# Company size: count the non-missing answers of column 108.
# NOTE(review): the column is picked by position - confirm that column 108 is
# the company-size question and not a neighbouring one.
company_size = (
    data.iloc[:, 108]
    .rename('CompanySize')
    .dropna()
    .value_counts()
)
company_size.index.names = ['Company Size']
# Donut chart of the answer shares, using a fixed colour per slice.
colors = ['pink', 'skyblue', 'purple', 'green', 'yellow', 'lightgreen', 'orange']
fig = px.pie(
    company_size,
    values=company_size,
    title='DISTRIBUTION OF COMPANY SIZE BASED ON EMPLOYEES',
    height=900,
    names=company_size.index,
    hole=0.8,
    color_discrete_sequence=colors,
)
fig.show()


In [None]:
#Select any activities that make up an important part of your role at work:
# Columns 110-117 are counted one by one (presumably one column per activity option).
company_role = data.iloc[:,110:118]
company_role.columns = ['role1','role2','role3','role4','role5','role6','role7','role8']
# One count Series per column, unpacked in column order.
(influence_business, infrastructure, prototypes, workflows, experimentation,
 research, none, other) = (
    company_role[col].value_counts() for col in company_role.columns
)

#Stack the per-column counts into one Series indexed by the answer text
company_role = pd.concat([influence_business, infrastructure, prototypes, workflows, experimentation, research, none, other])
company_role.index.names = ['role']

#Pie Chart
role_pie = px.pie(company_role, values=company_role,title='DISTRIBUTION OF ROLES AT COMPANIES',height=1200,names=company_role.index,hole=0.1)
role_pie.show()

In [None]:
# Yearly compensation (approximate $USD): count the answers of column 119.
# NOTE(review): the column is picked by position - confirm that column 119 is
# the compensation question and not a neighbouring one.
year_compensation = data.iloc[:, 119].value_counts()
year_compensation.index.names = ['SalaryRange']

# Vertical bar chart with the count printed above each bar.
fig = px.bar(
    year_compensation,
    y=year_compensation,
    x=year_compensation.index,
    text=year_compensation,
    color=year_compensation.index,
    orientation='v',
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    height=900,
    title='COUNT OF SALARY COMPENSATION BY USERS ON ONE YEAR',
    yaxis_title='Count',
)
fig.show()



In [None]:
#Cloud Computing Platforms
# Columns 120-131 are counted one by one (presumably one column per platform option).
cloud_platform = data.iloc[:,120:132]
cloud_platform.columns = ['cloudplatform1','cloudplatform2','cloudplatform3','cloudplatform4','cloudplatform5','cloudplatform6','cloudplatform7','cloudplatform8','cloudplatform9','cloudplatform10','cloudplatform11','cloudplatform12']
# One count Series per column, unpacked in column order.
(aws, azure, gpc, red_hat, oracle_cloud, sap_cloud, salesforce_cloud,
 vmware_cloud, alibaba, tencent, none, other) = (
    cloud_platform[col].value_counts() for col in cloud_platform.columns
)

#Stack the per-column counts into one Series indexed by the answer text
cloud_platform = pd.concat([aws, azure, gpc, red_hat, oracle_cloud, sap_cloud, salesforce_cloud, vmware_cloud, alibaba, tencent, none, other])
cloud_platform.index.names = ['CloudPlatform']

#Barchart
fig = px.bar(cloud_platform, x=cloud_platform.index, y=cloud_platform,
             hover_data=[cloud_platform.index,], color=cloud_platform.index,
              height=800,title='PLATFORM COUNT BY KAGGLERS')
fig.update_layout(yaxis_title='Count')
fig.show()