# 2020 Kaggle Machine Learning & Data Science Survey

1. basic exploratory (done)
1. basic dataviz (done)
1. insights dataviz (doing)
1. conclusion (to do)
  

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
# Load the raw survey responses from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# **Exploratory Data Analysis**

In [None]:
def eda(dfA, allEDA=False, desc='Exploratory Data Analysis'):
    """Print a quick exploratory summary of a DataFrame.

    Always prints the heading, the shape, the share of nulls per column
    (highest first), the shape of the duplicated rows and a random sample of
    up to four rows that have at least one duplicate.

    Parameters
    ----------
    dfA : pandas.DataFrame
        Frame to summarise.
    allEDA : bool, default False
        When True, additionally print ``describe()`` for numeric and object
        columns, the head, a random sample of up to five rows and the tail.
    desc : str, default 'Exploratory Data Analysis'
        Heading printed before the report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(desc)
    print(f'\nShape:\n{dfA.shape}')
    print(f'\nIs Null:\n{dfA.isnull().mean().sort_values(ascending=False)}')
    dup = dfA.duplicated()
    print(f'\nDuplicated: \n{dfA[dup].shape}\n')

    # Cap the sample size at the number of available rows instead of relying
    # on a swallowed ValueError when fewer than four duplicated rows exist.
    duplicated_rows = dfA[dfA.duplicated(keep=False)]
    if not duplicated_rows.empty:
        print(duplicated_rows.sample(min(4, len(duplicated_rows))))

    if allEDA:  # put here any preferred analysis that details the dataset further
        print('\nDTypes - Numerics')
        # Only describe a column family that actually exists in the frame.
        if dfA.select_dtypes(include=[np.number]).shape[1]:
            print(dfA.describe(include=[np.number]))
        else:
            print('(no numeric columns)')

        print('\nDTypes - Categoricals')
        if dfA.select_dtypes(include=['object']).shape[1]:
            print(dfA.describe(include=['object']))
        else:
            print('(no object columns)')

        print(f'\nHead of dataframe:\n{dfA.head()}')
        # Same cap as above so small frames do not raise.
        print(f'\nFive Samples :\n{dfA.sample(min(5, len(dfA)))}')
        print(f'\nTail of dataframe:\n{dfA.tail()}')

In [None]:
# Show the (rows, columns) dimensions of the raw frame.
df.shape

In [None]:
# The Series displayed below has one entry per *column* of `df`, so the row
# display limit only needs to exceed the column count (not the row count).
pd.set_option('display.max_rows', df.shape[1] + 1)
# Number of missing answers per column, most incomplete columns first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Show five randomly chosen rows.
df.sample(5)

In [None]:
# Preview the last rows of the raw frame.
df.tail()

In [None]:
# The first row of the raw frame is kept as a plain list (one entry per column).
first_row = df.iloc[0]
questions = list(first_row)
questions

In [None]:
# `col` keeps the original column names; `columns` is an independent copy
# whose first entry is renamed to 'time'.
col = list(df.columns)
columns = list(col)
columns[0] = 'time'

In [None]:
def getQuestion(element, txt):
    """Return the positions in ``element`` of the names belonging to ``txt``.

    A name belongs to the question when it is exactly ``txt`` or starts with
    ``txt`` followed by an underscore (e.g. ``'Q7'`` matches ``'Q7_Part_1'``).
    Requiring the underscore keeps ``'Q1'`` from also matching ``'Q10'``.

    Parameters
    ----------
    element : sequence of str
        Column names to search.
    txt : str
        Question identifier, e.g. ``'Q7'``.

    Returns
    -------
    list of int
        Matching positions in ascending order (empty when nothing matches).
    """
    prefix = f'{txt}_'
    return [
        pos
        for pos, name in enumerate(element)
        if name == txt or name.startswith(prefix)
    ]


# **Dataviz** - Basic visualization for all questions

In [None]:
import plotly.express as px
# Importing pygal and its styles
!pip install pygal -q
import pygal
from pygal.style import Style
from IPython.display import display, HTML

In [None]:
# Helper to render a pygal chart inline as HTML.
# NOTE(review): the first script URL is plain http; it may be blocked as mixed
# content when the notebook is served over https - confirm before changing it.

base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js"></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""
def pygalplot(chart):
    """Render ``chart`` and display it in the notebook output.

    Parameters
    ----------
    chart : object
        Anything exposing ``render(is_unicode=True)`` that returns markup;
        here the pygal chart objects built in this notebook.

    Returns
    -------
    None
        The markup is embedded in ``base_html`` and shown via IPython's
        ``display(HTML(...))``.
    """
    rendered_chart = chart.render(is_unicode=True)
    plot_html = base_html.format(rendered_chart=rendered_chart)
    display(HTML(plot_html))

In [None]:
def removeOutliers(out, varTarget):
    """Return the rows of ``out`` whose ``varTarget`` lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` computed on
    ``out[varTarget]``; values equal to a fence are kept.

    Parameters
    ----------
    out : pandas.DataFrame
        Input frame (left unmodified).
    varTarget : str
        Name of the numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        A new frame with the non-outlier rows, original index and columns.
    """
    values = out[varTarget]

    # Quartiles and interquartile range.
    q1 = values.quantile(.25)
    q3 = values.quantile(.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # `between` includes both bounds by default. The boolean form
    # `inclusive=True` is not accepted by recent pandas, so it is omitted.
    within_fences = values.between(lower, upper)

    # Boolean selection already yields the filtered frame; no need to
    # concatenate it onto an empty DataFrame.
    return out[within_fences].copy()

In [None]:
# time
# Work on a copy without the first row, using the renamed column list so the
# first column is addressable as 'time'.
ndf = df[1:].copy()
ndf.columns = columns
ndf['time'] = ndf['time'].astype(int)
time = removeOutliers(ndf, 'time')
time = time.describe()
# Look the mean up by label rather than by position in the describe() table.
f"Avg time (min): {time.loc['mean', 'time']/60:.2f}"

In [None]:
# age
# Respondent count per Q1 value, drawn as one pie slice each.
ages = ndf.groupby('Q1')['Q1'].count()
pie_chart = pygal.Pie()
pie_chart.title = 'Ages Survey (in qty)'
for age_band, qty in zip(ages.index, ages.values):
    pie_chart.add(age_band, qty)
pygalplot(pie_chart)

In [None]:
# gender
# Respondent count per Q2 value, drawn as one pie slice each.
gender = ndf.groupby('Q2')['Q2'].count()
pie_chart = pygal.Pie()
pie_chart.title = 'Gender Survey (in qty)'
for gender_label, qty in zip(gender.index, gender.values):
    pie_chart.add(gender_label, qty)
pygalplot(pie_chart)

In [None]:
# country

# Respondent count per Q3 value, largest first.
country = ndf.groupby('Q3')['Q3'].count().sort_values(ascending=False)
line_chart = pygal.HorizontalBar()
line_chart.title = 'Country Survey - top20 (in qty)'
# head(20) yields at most 20 entries, so this also works when fewer than
# 20 distinct countries are present.
top_countries = country.head(20)
for country_name, qty in zip(top_countries.index, top_countries.values):
    line_chart.add(country_name, qty)
pygalplot(line_chart)



In [None]:
# Level Education

# Missing Q4 answers are replaced in place on `ndf` before counting.
ndf['Q4'] = ndf['Q4'].fillna('Unknow')
levEd = ndf.groupby('Q4')['Q4'].count().sort_values(ascending=False)
line_chart = pygal.HorizontalBar()
line_chart.title = 'Level Education (in qty)'
for level, qty in zip(levEd.index, levEd.values):
    line_chart.add(level, qty)
pygalplot(line_chart)

In [None]:
def plotGraphic(question, title, typeGraph='pie'):
    """Plot the answers to one survey question with pygal.

    The columns of the question are located with ``getQuestion(columns,
    question)``:

    * more than one column: each column is one entry, labelled with the first
      non-null value found in it and sized by its share (in percent, rounded
      to 2 decimals) of all non-null answers across those columns; entries are
      added from the largest share to the smallest;
    * otherwise: ``ndf`` is grouped by ``question`` and the raw counts are
      plotted, largest first.

    Parameters
    ----------
    question : str
        Question identifier, e.g. ``'Q7'``.
    title : str
        Chart title.
    typeGraph : str, default 'pie'
        ``'barH'`` draws a horizontal bar chart; any other value draws a pie.

    Returns
    -------
    None
        The chart is displayed through ``pygalplot``.
    """
    mychart = pygal.HorizontalBar() if typeGraph == 'barH' else pygal.Pie()
    mychart.title = title

    myrange = getQuestion(columns, question)
    if len(myrange) > 1:
        ansText = []
        ansVal = []
        for pos in myrange:
            answers = ndf[columns[pos]].dropna()
            if answers.empty:
                # No non-null value in this column, hence no label to show.
                continue
            # presumably every non-null cell of such a column holds the same
            # answer text, so the first unique value is used as the label
            ansText.append(answers.unique()[0])
            ansVal.append(answers.count())

        dfTemp = pd.DataFrame({'desc': ansText, 'value': ansVal})
        dfTemp['perc'] = dfTemp['value'] / dfTemp['value'].sum() * 100
        dfTemp = dfTemp.sort_values('perc', ascending=False)

        # Walk the rows in sorted order. Using the index labels as positions
        # into `.values` would add the entries in a scrambled order.
        for label, perc in zip(dfTemp['desc'], dfTemp['perc']):
            mychart.add(label, round(perc, 2))
    else:
        QY = ndf.groupby(question)[question].count().sort_values(ascending=False)
        for label, qty in zip(QY.index, QY.values):
            mychart.add(label, qty)

    pygalplot(mychart)

In [None]:
# Plot the answers to Q4 (education level).
plotGraphic(
    question='Q4',
    title='What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
)

In [None]:
# Plot the answers to Q5.
plotGraphic(
    question='Q5',
    title='Select the title most similar to your current role (or most recent title if retired)',
)

In [None]:
# Plot the answers to Q6.
plotGraphic(
    question='Q6',
    title='For how many years have you been writing code and/or programming?',
)

In [None]:
# Plot the answers to Q7 as horizontal bars.
plotGraphic(
    question='Q7',
    title='What programming languages do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q8.
plotGraphic(
    question='Q8',
    title='What programming language would you recommend an aspiring data scientist to learn first?',
)

In [None]:
# Plot the answers to Q9 as horizontal bars.
plotGraphic(
    question='Q9',
    title='Which of the following integrated development environments (IDEs) do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q10 as horizontal bars.
plotGraphic(
    question='Q10',
    title='Which of the following hosted notebook products do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q11 as horizontal bars.
plotGraphic(
    question='Q11',
    title='What type of computing platform do you use most often for your data science projects?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q12.
plotGraphic(
    question='Q12',
    title='Which types of specialized hardware do you use on a regular basis?',
)

In [None]:
# Plot the answers to Q13.
plotGraphic(
    question='Q13',
    title='Approximately how many times have you used a TPU (tensor processing unit)?',
)

In [None]:
# Plot the answers to Q14 as horizontal bars.
plotGraphic(
    question='Q14',
    title='What data visualization libraries or tools do you use on a regular basis? ',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q15 as horizontal bars.
plotGraphic(
    question='Q15',
    title='For how many years have you used machine learning methods?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q16 as horizontal bars.
plotGraphic(
    question='Q16',
    title='Which of the following machine learning frameworks do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q17 as horizontal bars.
plotGraphic(
    question='Q17',
    title='Which of the following ML algorithms do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q18.
plotGraphic(
    question='Q18',
    title='Which categories of computer vision methods do you use on a regular basis? ',
)

In [None]:
# Plot the answers to Q19 as horizontal bars.
plotGraphic(
    question='Q19',
    title='Which of the following natural language processing (NLP) methods do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q20.
plotGraphic(
    question='Q20',
    title='What is the size of the company where you are employed?',
)

In [None]:
# Plot the answers to Q21.
plotGraphic(
    question='Q21',
    title='Approximately how many individuals are responsible for data science workloads at your place of business?',
)

In [None]:
# Plot the answers to Q22.
plotGraphic(
    question='Q22',
    title='Does your current employer incorporate machine learning methods into their business?',
)

In [None]:
# Plot the answers to Q23 as horizontal bars.
plotGraphic(
    question='Q23',
    title='Select any activities that make up an important part of your role at work',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q24 as horizontal bars.
plotGraphic(
    question='Q24',
    title='What is your current yearly compensation (approximate $USD)?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q25.
plotGraphic(
    question='Q25',
    title='Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?',
)

In [None]:
# Plot the answers to Q26 as horizontal bars.
plotGraphic(
    question='Q26',
    title='Which of the following cloud computing platforms do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q27 as horizontal bars.
plotGraphic(
    question='Q27',
    title='Do you use any of the following cloud computing products on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q28 as horizontal bars.
plotGraphic(
    question='Q28',
    title='Do you use any of the following machine learning products on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q29 as horizontal bars.
plotGraphic(
    question='Q29',
    title='Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q30 as horizontal bars.
plotGraphic(
    question='Q30',
    title='Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q31 as horizontal bars.
plotGraphic(
    question='Q31',
    title='Which of the following business intelligence tools do you use on a regular basis?',
    typeGraph='barH',
)

Question 32 (which specific product) was only asked to respondents that selected more than one choice for Question 31-A (which of the following products).

In [None]:
# Plot the answers to Q32 as horizontal bars.
plotGraphic(
    question='Q32',
    title='Which of the following business intelligence tools do you use most often?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q33 as horizontal bars.
plotGraphic(
    question='Q33',
    title='Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?',
    typeGraph='barH',
)

Question 34-A (which specific product) was only asked to respondents that answered affirmatively to Question 33-A (which of the following categories of products).

In [None]:
# Plot the answers to Q34 as horizontal bars.
plotGraphic(
    question='Q34',
    title='Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q35 as horizontal bars.
plotGraphic(
    question='Q35',
    title='Do you use any tools to help manage machine learning experiments?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q36.
plotGraphic(
    question='Q36',
    title='Where do you publicly share or deploy your data analysis or machine learning applications?',
)

In [None]:
# Plot the answers to Q37 as horizontal bars.
plotGraphic(
    question='Q37',
    title='On which platforms have you begun or completed data science courses?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q38 as horizontal bars.
plotGraphic(
    question='Q38',
    title='What is the primary tool that you use at work or school to analyze data?',
    typeGraph='barH',
)

In [None]:
# Plot the answers to Q39 as horizontal bars.
plotGraphic(
    question='Q39',
    title='Who/what are your favorite media sources that report on data science topics?',
    typeGraph='barH',
)

# Insights - Dataviz

**Gender vs Country**

In [None]:
# get genders (one stacked series per distinct non-null Q2 value)
yaxis = ndf.Q2.dropna().unique().tolist()
# get countries: the 15 Q3 values with the most Q2 answers
xaxis = ndf[['Q2', 'Q3']].groupby(['Q3']).count().sort_values(by='Q2', ascending=False).index.tolist()[:15]
# get only data: row count per (Q2, Q3) pair within those countries
ndfTemp = ndf[ndf.Q3.isin(xaxis)][['time', 'Q2', 'Q3']].groupby(['Q2', 'Q3']).count()
# Dict keyed by (Q2, Q3) so each bar is a plain lookup. This avoids building a
# query string per cell and hiding every failure behind a bare except.
qty_by_pair = ndfTemp['time'].to_dict()
# plot graph
mychart = pygal.StackedBar()
mychart.title = 'Gender by Country (qty)'
mychart.x_labels = xaxis
for a in yaxis:
    # Pairs absent from the grouped data count as 0.
    vet = [qty_by_pair.get((a, b), 0) for b in xaxis]
    mychart.add(a, vet)
pygalplot(mychart)

**Education Level vs Country**

In [None]:
# get level education (one stacked series per distinct non-null Q4 value)
yaxis = ndf['Q4'].dropna().unique().tolist()
# get countries: the 15 Q3 values with the most Q4 answers
xaxis = ndf[['Q4', 'Q3']].groupby(['Q3']).count().sort_values(by='Q4', ascending=False).index.tolist()[:15]
# get data: row count per (Q4, Q3) pair within those countries
ndfTemp = ndf[ndf.Q3.isin(xaxis)][['time', 'Q4', 'Q3']].groupby(['Q4', 'Q3']).count()
# Dict keyed by (Q4, Q3) so each bar is a plain lookup. This avoids building a
# query string per cell and hiding every failure behind a bare except.
qty_by_pair = ndfTemp['time'].to_dict()
# plot graph
mychart = pygal.StackedBar()
mychart.title = 'Level Education by Country (qty)'
mychart.x_labels = xaxis
for a in yaxis:
    # Pairs absent from the grouped data count as 0.
    vet = [qty_by_pair.get((a, b), 0) for b in xaxis]
    mychart.add(a, vet)
pygalplot(mychart)

**Current role vs Countries**

In [None]:
# get roles (one stacked series per distinct non-null Q5 value)
# NOTE(review): Q5 is plotted elsewhere in this notebook as the "current role"
# question, so the chart title below names roles rather than programming
# languages - confirm that Q5 is the intended column.
yaxis = ndf['Q5'].dropna().unique().tolist()
# get countries: the 15 Q3 values with the most Q5 answers
xaxis = ndf[['Q5', 'Q3']].groupby(['Q3']).count().sort_values(by='Q5', ascending=False).index.tolist()[:15]
# get data: row count per (Q5, Q3) pair within those countries
ndfTemp = ndf[ndf.Q3.isin(xaxis)][['time', 'Q5', 'Q3']].groupby(['Q5', 'Q3']).count()
# Dict keyed by (Q5, Q3) so each bar is a plain lookup. This avoids building a
# query string per cell and hiding every failure behind a bare except.
qty_by_pair = ndfTemp['time'].to_dict()
# plot graph
mychart = pygal.StackedBar()
mychart.title = 'Current role vs Countries (qty)'
mychart.x_labels = xaxis
for a in yaxis:
    # Pairs absent from the grouped data count as 0.
    vet = [qty_by_pair.get((a, b), 0) for b in xaxis]
    mychart.add(a, vet)
pygalplot(mychart)

# Conclusion

* soon