# Survey Methodology

*  The 2020 Kaggle DS & ML Survey received 20,036 usable responses from participants in 171
    different countries and territories. If a country or territory received less than 50 respondents, we
    grouped them into a group named “Other” for anonymity.
*  An invitation to participate in the survey was sent to the entire Kaggle community (anyone
    opted-in to the Kaggle Email List). The survey was also promoted on the Kaggle website and on
    the Kaggle Twitter channel.
*  The survey was live from 10/07/2020 to 10/30/2020. We allowed respondents to complete the
    survey at any time during that window.
*  Responses to multiple choice questions (only a single choice can be selected) were recorded in
    individual columns. Responses to multiple selection questions (multiple choices can be selected)
    were split into multiple columns (with one column per answer choice).
*  To protect the respondents’ privacy, free-form text responses were not included in the public
    survey dataset, and the order of the rows was shuffled (responses are not displayed in
    chronological order).
*  We excluded respondents that were flagged by our survey system as “Spam”.
*  The survey data was released under a CC 2.0 license:


## Main Data:
- kaggle_survey_2020_responses.csv: 39+ questions and 20,036 responses :

   - Responses to multiple choice questions (only a single choice can be selected) were recorded in individual columns. Responses to multiple selection questions (multiple choices can be selected) were split into multiple columns (with one column per answer choice).
   
## Supplementary Data:
- kaggle_survey_2020_answer_choices.pdf: list of answer choices for every question
    - With footnotes describing which questions were asked to which respondents.
        kaggle_survey_2020_methodology.pdf: a description of how the survey was conducted


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
import plotly
import plotly.offline as py
from plotly.offline import iplot
import plotly.io as pio
import cufflinks as cf
import plotly.express as px
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
!pip install jovian --upgrade -q
import jovian
project_name = "zerotopandas-course-project-starter"

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
jovian.commit(project=project_name)

In [None]:
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

In [None]:
df.head()

In [None]:
questions = df.iloc[0, :].T
data = df.iloc[1:, :]

In [None]:
questions.head()

In [None]:

data.columns

In [None]:
data.shape

In [None]:
data.sample()

## Extracting the Information about people

In [None]:
#Extracting the basic information about people
df1 = data.iloc[:,1:7]
df1.columns = ['Age','Gender','Country','Education','CurrentRole','CodingExperience']

In [None]:
df1.sample()

# Q1 - What is Kaggle User Age-Group's (# years)?


In [None]:
#Creating interactive countplot for age range
AgeRange = df1['Age'].value_counts()
AgeRange.iplot(kind="bar",title='Age Group',size=15)

# Q2 - What is Kaggle User gender?


In [None]:
#Checking Gender for Data Scientists
df1['Gender'].isnull().values.any()
gender = df1['Gender'].value_counts()
gender = pd.DataFrame(gender)
gender.columns = ['Count']
gender.index.names = ['Gender']
fig = px.bar(gender, y='Count', x=gender.index,text='Count',title='G E N D E R')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

# Q3 - In which country do have most kaggle user?


In [None]:
#Country WordCloud
country = df1['Country'].value_counts()
allwords = ' '.join([twts for twts in country.index])
wordcloud = WordCloud(width=2800, height=600, random_state=21, max_font_size=219,max_words=1100,background_color='black').generate(allwords)
plt.figure(figsize=(30,30))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
country = df1['Country'].value_counts()
country.columns = [['Countries_Count']]
country.index.names = ['Countries']
fig = px.bar(country, x=country.index, y='Country',
             hover_data=[country.index, 'Country'], color='Country',
             labels={'Country':'Count'}, height=800,title='C O U N T R Y')
fig.show()

# Q4 - Education ?

In [None]:
Education = df1['Education'].value_counts()


#Country WordCloud
allwords = ' '.join([twts for twts in Education.index])
wordcloud = WordCloud(width=1800, height=300, random_state=21, max_font_size=119,max_words=300,background_color='black').generate(allwords)
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
#Taking info about current education
currentEducation = df1[['Education']]
currentEducation = currentEducation.value_counts()
currentEducation = pd.DataFrame(currentEducation)
currentEducation.columns = ['Count']
currentEducation.reset_index(level=0, inplace=True)
#Percentage per country
fig = px.pie(currentEducation, values='Count', title='E D U C A T I O N',height=700,names='Education')

fig.show()


# CurrentRole

In [None]:
#'CurrentRole 
#Taking info about current education
currentEducation = df1[['CurrentRole']]
currentEducation = currentEducation.value_counts()
currentEducation = pd.DataFrame(currentEducation)
currentEducation.columns = ['Count']
currentEducation.reset_index(level=0, inplace=True)
#Percentage per country
fig = px.pie(currentEducation, values='Count', title='C U R R E N T  R O L E',height=700,names='CurrentRole')

fig.show()

Extracting the Information about 

# Q5 - Most used Programming Language?

In [None]:
#Programming Language use
progBasis = data.iloc[:,7:20]
progBasis.columns =  ['language1','language2','language3','language4','language5','language6','language7','language8','language9','language10','language11','language12','language13']
python = progBasis['language1'].value_counts()
R = progBasis['language2'].value_counts()
SQL = progBasis['language3'].value_counts()
C = progBasis['language4'].value_counts()
Cplus = progBasis['language5'].value_counts()
java = progBasis['language6'].value_counts()
javascript = progBasis['language7'].value_counts()
julia = progBasis['language8'].value_counts()
swift = progBasis['language9'].value_counts()
bash = progBasis['language10'].value_counts()
matlab = progBasis['language11'].value_counts()
none = progBasis['language12'].value_counts()
other = progBasis['language13'].value_counts()
#concat Dataframes
progBasis = pd.concat([python,R,SQL,C,Cplus,java,javascript,julia,swift,bash,matlab])
progBasis.index.names = ['Programming_Languages']


In [None]:
fig = px.bar(progBasis, x=progBasis, y=progBasis.index, color=progBasis.index, orientation='h',
             height=500)
fig.update_layout(title="M O S T  U S E D  P R O G R A M M I N G  L A N G U A G E S ",
                  xaxis_title="Count",
                  yaxis_title='Programming Languages')
fig.show()

In [None]:
#programming Language WordCloud
allwords = ' '.join([programminglanguage for programminglanguage in progBasis.index])
wordcloud = WordCloud(width=800, height=200, random_state=21, max_font_size=119,max_words=300,background_color='black').generate(allwords)
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Q6 - hosted notebook products 

In [None]:
notebooks = data.iloc[:,33:45]
notebooks.columns =  ['notebooks1', 'notebooks2', 'notebooks3', 'notebooks4', 'notebooks5', 'notebooks6', 'notebooks7', 'notebooks8', 'notebooks9', 'notebooks10', 'notebooks11', 'notebooks12']
kagglenotebook = notebooks['notebooks1'].value_counts()
colab = notebooks['notebooks2'].value_counts()
azure = notebooks['notebooks3'].value_counts()
paperspace = notebooks['notebooks4'].value_counts()
binder = notebooks['notebooks5'].value_counts()
ocean = notebooks['notebooks6'].value_counts()
ibm = notebooks['notebooks7'].value_counts()
sagemaker = notebooks['notebooks8'].value_counts()
amazonEMR = notebooks['notebooks9'].value_counts()
googleplatform = notebooks['notebooks10'].value_counts()
googledatalab = notebooks['notebooks11'].value_counts()
databricks = notebooks['notebooks12'].value_counts()
#none = notebooks['notebooks13'].value_counts()
#other= notebooks['notebooks14'].value_counts()
#concat Dataframes
notebooks = pd.concat([kagglenotebook,colab,azure,paperspace,binder,ocean,ibm,sagemaker,amazonEMR,googleplatform,googledatalab,databricks])
notebooks.index.names = ['notebooks']
fig = px.bar(notebooks, x=notebooks, y=notebooks.index, color=notebooks.index, orientation='h',
             height=500,)
fig.update_layout(title=" E N V I R O N M E N T S  U S E D  ",
                  xaxis_title="Count",
                  yaxis_title='Notebook')
fig.show()


#  Type of computing platform

In [None]:
dsplatform = data.iloc[:,47]
dsplatform = dsplatform.value_counts()
dsplatform = pd.DataFrame(dsplatform)
dsplatform.columns = ['count']
dsplatform.index.names = ['platform']
#Only 5 rows with Na values, let´s drop it
dsplatform.dropna(inplace=True)

#Plots
fig = px.bar(dsplatform , y='count', x=dsplatform.index,text='count',color='count')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=1, uniformtext_mode='hide',height=900,title=' COMPUTER PLATFORMS ')
fig.show()



# Q8 - Which types of specialized hardware used?

In [None]:
#Hardware use
hardware = data.iloc[:,48:52]
hardware.columns =  ['hardware1','hardware2','hardware3','hardware4']
gpu = hardware['hardware1'].value_counts()
tpu = hardware['hardware2'].value_counts()
cpu = hardware['hardware3'].value_counts()
other = hardware['hardware4'].value_counts()
#concat Dataframes
hardware = pd.concat([gpu,tpu,cpu,other])
hardware.index.names = ['hardware']
fig = px.bar(hardware, x=hardware, y=hardware.index, color=hardware.index, orientation='h',
             height=500,)
fig.update_layout(title="H A R D W A R E  U S E D  ",
                  xaxis_title="Count",
                  yaxis_title='Hardware')
fig.show()

# Q10 - How many times have you used a TPU

In [None]:
#How many times have you used a TPU
tpu_use = data.iloc[:,52]
tpu_use = tpu_use.value_counts()
tpu_use = pd.DataFrame(tpu_use)
tpu_use.columns = ['count']
tpu_use.index.names = ['tpu']
#Percentage
fig = px.pie(tpu_use , values='count' , title='HOW MANY TIMES KAGGLERS HAVE USED A TPU?',height=700,names=tpu_use.index,color_discrete_sequence=px.colors.sequential.RdBu,hole=0.3)
fig.show()

# Q11 - Most common libraries use

In [None]:
#libraries use
libraries = data.iloc[:,53:65]
libraries.columns = ['library1','library2','library3','library4','library5','library6','library7','library8','library9','library10','library11','library12']
matplotlib = libraries ['library1'].value_counts()
seaborn = libraries ['library2'].value_counts()
plotly = libraries ['library3'].value_counts()
ggplot = libraries ['library4'].value_counts()
shiny = libraries ['library5'].value_counts()
d3 = libraries ['library6'].value_counts()
altair = libraries ['library7'].value_counts()
bokeh = libraries ['library8'].value_counts()
geoplotlib = libraries ['library9'].value_counts()
folium = libraries ['library10'].value_counts()
none = libraries ['library11'].value_counts()
other = libraries ['library12'].value_counts()

#Concat Dataframes
libraries = pd.concat([matplotlib,seaborn,plotly,ggplot,shiny,d3,altair,bokeh,geoplotlib,folium])
libraries.columns = ['libraryCount']
libraries.index.names = ['library']

#plot
fig = px.bar(libraries, x=libraries, y=libraries.index, color=libraries.index, orientation='h',
             height=500)
fig.update_layout(title="L I B R A R I E S  U S E D  B Y  K A G G L E R S",
                  xaxis_title="Count",
                  yaxis_title='Library')
fig.show()

# Q12 - Which of the following machine learning frameworks do you use on a regular basis? 

In [None]:
#Which of the following machine learning frameworks do you use on a regular basis? 
ml_frameworks = data.iloc[:,66:82]
ml_frameworks.columns = ['frameworks1','frameworks2','frameworks3','frameworks4','frameworks5','frameworks6','frameworks7','frameworks8','frameworks9','frameworks10','frameworks11','frameworks12','frameworks13','frameworks14','frameworks15','frameworks16']
skn = ml_frameworks['frameworks1'].value_counts()
tf = ml_frameworks['frameworks2'].value_counts()
keras = ml_frameworks['frameworks3'].value_counts()
pytorch = ml_frameworks['frameworks4'].value_counts()
fast = ml_frameworks['frameworks5'].value_counts()
mxnet = ml_frameworks['frameworks6'].value_counts()
xgboost = ml_frameworks['frameworks7'].value_counts()
lightgbm = ml_frameworks['frameworks8'].value_counts()
catboost = ml_frameworks['frameworks9'].value_counts()
prophet = ml_frameworks['frameworks10'].value_counts()
h2o = ml_frameworks['frameworks11'].value_counts()
caret = ml_frameworks['frameworks12'].value_counts()
tidymodels = ml_frameworks['frameworks13'].value_counts()
jax = ml_frameworks['frameworks14'].value_counts()
none = ml_frameworks['frameworks15'].value_counts()
other = ml_frameworks['frameworks16'].value_counts()

#dataframe concat
ml_frameworks = pd.concat([skn,tf,keras,pytorch,fast,mxnet,xgboost,catboost,prophet,h2o,caret,tidymodels,jax,none,other])
ml_frameworks.index.names = ['frameworks']
ml_frameworks.columns = ['frameworkcount']

#plot
fig = px.bar(ml_frameworks , x=ml_frameworks , y=ml_frameworks.index, color=ml_frameworks.index, orientation='h',
             height=700)
fig.update_layout(title="MACHINE LEARNING FRAMEWORK COUNT BY USERS",
                  xaxis_title="Count",
                  yaxis_title='FRAMEWORKS')
fig.show()


In [None]:
#Machine learning frameworks WordCloud
allwords = ' '.join([framework for framework in ml_frameworks.index])
wordcloud = WordCloud(width=600, height=300, random_state=21, max_font_size=119,max_words=300,background_color='skyblue').generate(allwords)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Q13 - Which of the following natural language processing (NLP) methods do you use on a regular basis

In [None]:
#Which of the following natural language processing (NLP) methods do you use on a regular basis
nlp = data.iloc[:,101:107]
nlp.columns = ['nlp1','nlp2','nlp3','nlp4','nlp5','nlp6']
word_embeddings = nlp['nlp1'].value_counts()
enconder_decoder = nlp['nlp2'].value_counts()
contextualized_embeddings = nlp['nlp3'].value_counts()
transformer_language = nlp['nlp4'].value_counts()
none = nlp['nlp5'].value_counts()
other = nlp['nlp6'].value_counts()

#Concat
nlp = pd.concat([word_embeddings ,enconder_decoder,contextualized_embeddings,transformer_language,none,other])
nlp.index.names = ['Natural Language Processing']

#Barchart
fig = px.bar(nlp  , y=nlp , x=nlp.index,text=nlp ,color=nlp.index,orientation='v')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide',height=1000,title='NATURAL LANGUAGE PROCESSING TECNHIQUES COUNT BY USERS')
fig.show()


In [None]:
jovian.commit(project=project_name)