# Overview <br>
Welcome to Kaggle's third annual Machine Learning and Data Science Survey ― and our second-ever survey data challenge.<br>

This year, as in 2017 and 2018, we set out to conduct an industry-wide survey that presents a truly comprehensive view of the state of data science and machine learning. The survey was live for three weeks in October, and after cleaning the data we finished with 19,717 responses!<br>

There's a lot to explore here. The results include raw numbers about who is working with data, what’s happening with machine learning in different industries, and the best ways for new data scientists to break into the field. We've published the data in as raw a format as possible without compromising anonymization, which makes it an unusual example of a survey dataset.<br>

<img src="https://cdn.app.compendium.com/uploads/user/a7c086f7-9adb-4d2c-90fa-e26177af8317/c2dea8f7-8c26-44de-ae5f-5dc019485c8c/Image/60691dbc9e7de7390b93ea5284177459/data_analytics_banner.png"/> 

## Let's analyse the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objects as go
from plotly import tools
import pycountry
from wordcloud import WordCloud, STOPWORDS
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
%matplotlib inline

In [None]:
# Load the multiple-choice survey responses.
kaggle_choice_df = pd.read_csv("../input/kaggle-survey-2019/multiple_choice_responses.csv")
# Keep only the columns whose name does not contain 'OTHER_TEXT'.
kaggle_choice_df = kaggle_choice_df.loc[:, ~kaggle_choice_df.columns.str.contains('OTHER_TEXT')]
# Discard the first row (presumably the question-text row -- confirm against
# the CSV) together with the survey-duration column in one step.
kaggle_choice_df = kaggle_choice_df.drop(
    index=kaggle_choice_df.index[0],
    columns='Time from Start to Finish (seconds)',
)
def simple_graph(col,type_of_graph,color):
    """Plot the percentage distribution of one survey column.

    Args:
        col: Column name looked up in the global ``kaggle_choice_df``.
        type_of_graph: ``'bar'`` for horizontal bars or ``'pie'`` for a pie chart.
        color: Plotly colorscale name applied to the bars (unused for pies).

    Raises:
        ValueError: If ``type_of_graph`` is neither ``'bar'`` nor ``'pie'``.
    """
    # Validate first: an unknown type used to fall through both branches and
    # fail later with an UnboundLocalError on ``data``.
    if type_of_graph not in ('bar', 'pie'):
        raise ValueError(
            "type_of_graph must be 'bar' or 'pie', got {!r}".format(type_of_graph)
        )
    data_frame = compute_percentage(kaggle_choice_df,col)
    if type_of_graph == 'bar':
        data = [go.Bar(
            x = data_frame.values,
            y = data_frame.index,
            opacity = 0.6,
            orientation = 'h',
            # Colour each bar by its own percentage value.
            marker = dict(color=data_frame.values,colorscale=color)
        )]
    else:
        data = [go.Pie(
            labels = data_frame.index,
            values = data_frame.values,
            textfont = dict(size = 20)
        )]
    fig = go.Figure(data = data, layout = go.Layout())
    py.iplot(fig)
def multiple(df,start,stop,color):
    """Plot the most frequent answer of each ``<start><i>`` column as horizontal bars.

    Args:
        df: DataFrame holding the multi-part question columns.
        start: Common column-name prefix, e.g. ``"Q9_Part_"``.
        stop: Number of parts; columns ``start + "1"`` .. ``start + str(stop)`` are read.
        color: Plotly colorscale name used to colour the bars.
    """
    labels = []
    shares = []
    for part in range(1, stop + 1):
        # Use the frame passed by the caller; the previous version overwrote
        # ``df`` with a lookup on the global ``kaggle_choice_df``.
        percentages = compute_percentage(df, start + str(part))
        if percentages.empty:
            # A column with no answers has no top value to plot.
            continue
        labels.append(percentages.index[0])
        # NOTE(review): percentages are relative to the non-missing rows of this
        # single column; if each part column holds one option only, this is
        # always 100 -- confirm that is the intended measure.
        shares.append(percentages.values[0])
    data = [go.Bar(
        x = shares,
        y = labels,
        opacity = 0.6,
        orientation = 'h',
        marker = dict(color=shares,colorscale=color)
    )]
    fig = go.Figure(data = data, layout = go.Layout())
    py.iplot(fig)
def compute_percentage(df,col):
    """Return the relative frequency of each value in ``df[col]`` as a percentage."""
    fractions = df[col].value_counts(normalize=True)
    return fractions.mul(100)
def bi_variant_chart(col1,col2,x_title,y_title):
    """Draw bar traces counting ``col1`` values, one trace per ``col2`` value.

    Reads the global ``kaggle_choice_df``.

    Args:
        col1: Column whose value counts form the x categories / bar heights.
        col2: Column whose distinct values each become one named trace.
        x_title: Text used as the figure title.
        y_title: Text used as the y-axis title.
    """
    traces = []
    # dropna(): a NaN group never matches ``==`` and only produced an empty
    # 'nan' legend entry.
    for group in kaggle_choice_df[col2].dropna().unique():
        # Count once and take labels and heights from the same Series so they
        # cannot fall out of step (they used to come from two separate
        # value_counts() calls, one of them on sorted data).
        counts = kaggle_choice_df.loc[kaggle_choice_df[col2] == group, col1].value_counts()
        traces.append(go.Bar(x = counts.index, y = counts.values,
                             opacity = 0.6, name = group))
    fig = go.Figure(data = traces, layout = go.Layout())
    fig.update_layout(
        title = x_title,
        yaxis = dict(title = y_title),
        legend = dict( bgcolor = 'rgba(255, 255, 255, 0)', bordercolor = 'rgba(255, 255, 255, 0)'),
        bargap = 0.15, bargroupgap = 0.1,legend_orientation="h")
    fig.show()
def multiple_col_bi_variant_chart(df,start,stop,col1,col2,x_title = "",y_title = ""):
    """Chart a multi-part question against another column.

    The columns ``start + "1"`` .. ``start + str(stop)`` are stacked into one
    long column named ``col2``; rows with a missing value in either column are
    dropped, and one bar trace per distinct ``col2`` value shows the counts of
    ``col1``.

    Args:
        df: DataFrame holding ``col1`` and the part columns.
        start: Common prefix of the part columns, e.g. ``"Q16_Part_"``.
        stop: Number of part columns to read.
        col1: Column whose value counts form the x categories / bar heights.
        col2: Name given to the stacked answer column.
        x_title: Text used as the figure title.
        y_title: Text used as the y-axis title.
    """
    part_columns = [start + str(i) for i in range(1, stop + 1)]
    # DataFrame.append was removed in pandas 2.0 (and re-copied the frame on
    # every iteration); collect the pieces and concatenate once instead.
    pieces = [df[[col1, part]].rename(columns={part: col2}) for part in part_columns]
    if pieces:
        stacked = pd.concat(pieces).dropna()
    else:
        stacked = pd.DataFrame({col1: [], col2: []})
    traces = []
    for answer in stacked[col2].unique():
        counts = stacked.loc[stacked[col2] == answer, col1].value_counts()
        traces.append(go.Bar(x = counts.index, y = counts.values,
                             opacity = 0.6, name = answer))
    fig = go.Figure(data = traces, layout = go.Layout())
    fig.update_layout(
        title = x_title,
        yaxis = dict(title = y_title),
        legend = dict( bgcolor = 'rgba(255, 255, 255, 0)', bordercolor = 'rgba(255, 255, 255, 0)'),
        bargap = 0.15, bargroupgap = 0.1,legend_orientation="h"
    )
    fig.show()
def aws_gcp_azure(df,col):
    """Stacked bars showing, per cloud platform, the percentage split of ``col``.

    Args:
        df: DataFrame containing ``col`` and the columns ``Q29_Part_1``,
            ``Q29_Part_2`` and ``Q29_Part_3``.
        col: Column whose categories are compared across the three platforms.
    """
    platforms = ["AWS", "GCP", "Azure"]
    melted = pd.melt(df,id_vars=[col],value_vars = ["Q29_Part_1","Q29_Part_2","Q29_Part_3"],
                     value_name="Platform")
    melted = melted[[col,"Platform"]].dropna()
    melted = melted.replace({" Microsoft Azure ":"Azure"," Amazon Web Services (AWS) ":"AWS"," Google Cloud Platform (GCP) ":"GCP"})
    # Percentage of each ``col`` category within every platform, aligned by
    # category label.  The earlier version indexed three independently sorted
    # value_counts() results by position and paired y values ordered
    # [AWS, GCP, Azure] with x labels in order of appearance, so bars could be
    # attributed to the wrong category or platform.
    shares = pd.crosstab(melted[col], melted["Platform"], normalize="columns") * 100
    # Fix the column order and give 0 to a platform that was never selected.
    shares = shares.reindex(columns=platforms, fill_value=0.0)
    data = [go.Bar(name=category, x=platforms, y=shares.loc[category].tolist())
            for category in shares.index]
    fig = go.Figure(data=data)
    fig.update_layout(
        legend = dict(bgcolor = 'rgba(255, 255, 255, 0)', bordercolor = 'rgba(255, 255, 255, 0)'),
        bargap = 0.15, bargroupgap = 0.1
    )
    fig.update_layout(barmode='stack',legend_orientation="h")
    fig.show()

# Gender distribution in percentage

In [None]:
# Horizontal bar chart of the percentage of each Q2 answer.
simple_graph(col="Q2", type_of_graph="bar", color="portland")

# Degree distribution in percentage

In [None]:
# Horizontal bar chart of the percentage of each Q4 answer.
simple_graph(col="Q4", type_of_graph="bar", color="temps")

# What Happens in different size of companies

In [None]:
# Counts of Q6 answers, one trace per Q1 answer.
bi_variant_chart(col1="Q6", col2="Q1", x_title="Company size VS age group", y_title="Count")

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

 - Most company employees fall in the 25-29 and 30-34 age groups.
 - In companies with 0-49 employees, 22-24 is the second-largest age group.
 - Companies of every size have employees aged over 70; a lot of experience can be gained from them.

In [None]:
# Counts of Q6 answers, one trace per Q5 answer.
bi_variant_chart(col1="Q6", col2="Q5", x_title="Company size VS Designation", y_title="Count")

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Data scientists are more common in startups.
    * Research scientists are more common in startups.
    * Large-scale companies have more data analysts than business analysts.

In [None]:
# Counts of Q6 answers, one trace per Q8 answer.
bi_variant_chart(col1="Q6", col2="Q8", x_title="Company size VS ML in Production", y_title="Count")

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Startup companies are exploring ML.
    * Large-scale companies either have well-established ML in production or have started using ML in production.
    * Most companies in the 50-249 employee range still do not use ML.

In [None]:
# Counts of Q6 answers, one trace per Q10 answer.  The y axis shows counts, so
# label it "Count" like the sibling company-size charts instead of leaving it blank.
bi_variant_chart("Q6","Q10","Company size VS Salary","Count")

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * 843 employees in startups are paid < $999; they may be freshers or interns.

In [None]:
# Counts of Q6 answers, one trace per Q14 answer.  The y axis holds counts, not
# designations, so it is labelled "Count" (it used to read "Designation ").
bi_variant_chart("Q6","Q14","Company size VS Primary tool","Count")

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * No matter where you work, you should know how to use Excel.

In [None]:
# Stack Q9_Part_1..8 into one "Q9" column and count Q6 answers per Q9 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q9_Part_",
    stop=8,
    col1="Q6",
    col2="Q9",
    x_title="Employee's role in companies",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Startups spend more time on research than companies of other sizes.
    * All types of companies follow the same pattern.

In [None]:
# Stack Q16_Part_1..12 into one "Q16" column and count Q6 answers per Q16 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q16_Part_",
    stop=12,
    col1="Q6",
    col2="Q16",
    x_title="IDE's used in companies",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * The all-time favourites, Jupyter and RStudio, come out on top.
    * VS Code ranks second in small-scale companies.

In [None]:
# Stack Q18_Part_1..12 into one "Q18" column and count Q6 answers per Q18 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q18_Part_",
    stop=12,
    col1="Q6",
    col2="Q18",
    x_title="Programing language",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Python, SQL, R are most widely used.
    * Happy to see MATLAB still in use.

In [None]:
# Stack Q20_Part_1..12 into one "Q20" column and count Q6 answers per Q20 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q20_Part_",
    stop=12,
    col1="Q6",
    col2="Q20",
    x_title="Vis lib",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * From python its matplotlib and seaborn, for R its ggplot.
    * Plotly is gaining good attention.

In [None]:
# Stack Q26_Part_1..7 into one "Q26" column and count Q6 answers per Q26 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q26_Part_",
    stop=7,
    col1="Q6",
    col2="Q26",
    x_title="Purpose of Computer Vision in companies",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Most of the companies use it for Image classification or general purpose.

In [None]:
# Stack Q27_Part_1..6 into one "Q27" column and count Q6 answers per Q27 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q27_Part_",
    stop=6,
    col1="Q6",
    col2="Q27",
    x_title="Purpose of NLP in companies",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Most of the companies use it for word embeddings.

In [None]:
# Stack Q28_Part_1..12 into one "Q28" column and count Q6 answers per Q28 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q28_Part_",
    stop=12,
    col1="Q6",
    col2="Q28",
    x_title="ML framework in companies",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Scikit-learn tops.
    * Tight competition between TensorFlow and Keras.

# Coders as experience grows

In [None]:
# Stack Q16_Part_1..12 into one "Q16" column and count Q15 answers per Q16 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q16_Part_",
    stop=12,
    col1="Q15",
    col2="Q16",
    x_title="Developer's fav IDE",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * People at every experience level use Jupyter.
    * Tight competition between VS Code, RStudio and PyCharm.

In [None]:
# Stack Q12_Part_1..12 into one "Q12" column and count Q15 answers per Q12 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q12_Part_",
    stop=12,
    col1="Q15",
    col2="Q12",
    x_title="Fav media source",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * People at every experience level prefer Kaggle and blogs.

In [None]:
# Stack Q18_Part_1..12 into one "Q18" column and count Q15 answers per Q18 answer.
multiple_col_bi_variant_chart(
    df=kaggle_choice_df,
    start="Q18_Part_",
    stop=12,
    col1="Q15",
    col2="Q18",
    x_title="Fav Programming language",
    y_title="",
)

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Python is the most preferred language across experience groups.

In [None]:
# Stack Q29_Part_1..12 into one "Q29" column and count Q15 answers per Q29 answer.
# The Q29 columns hold cloud platforms (GCP / AWS / Azure are matched against
# them further down in this notebook), so the former title "Top ML ALgo" was
# misleading.
multiple_col_bi_variant_chart(kaggle_choice_df,"Q29_Part_",12,"Q15","Q29", x_title = "Cloud computing platforms", y_title = "")

# AWS VS GCP VS AZURE

In [None]:
# AWS VS GCP VS AZURE
# Build one row per (respondent, selected platform) from the first three Q29
# columns, carrying the respondent's Q5 and Q6 answers along.  melt() keeps
# platform, title and company size aligned by construction; the earlier
# version stitched three separately built Series together with Series.append,
# which was removed in pandas 2.0.
aws_gcp_azure = pd.melt(
    kaggle_choice_df,
    id_vars=["Q5", "Q6"],
    value_vars=["Q29_Part_1", "Q29_Part_2", "Q29_Part_3"],
    value_name="Aws VS GCP VS Azure",
)
# Keep only the rows where the platform was actually selected.
aws_gcp_azure = aws_gcp_azure.dropna(subset=["Aws VS GCP VS Azure"])
aws_gcp_azure = aws_gcp_azure.rename(columns={"Q5": "Title", "Q6": "Company Size"})
aws_gcp_azure = aws_gcp_azure[["Aws VS GCP VS Azure", "Company Size", "Title"]]
fig = px.parallel_categories(aws_gcp_azure[["Aws VS GCP VS Azure","Title","Company Size"]])
fig.show()

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Most of the Data Scientist use GCP or AWS.
    * Small scale Companies spend less on Azure platform.
    

## Cloud platform w.r.t age

In [None]:
# Stacked bars: for every Q1 answer, the percentage split between AWS, GCP and
# Azure among the platform selections made by that group.
# The first row was already dropped when the data was loaded, so slicing
# ``[1:]`` again would discard a real response.
df = kaggle_choice_df
col = "Q1"
aws_gcp_azure = pd.melt(df,id_vars=[col],value_vars = ["Q29_Part_1","Q29_Part_2","Q29_Part_3"],
                            value_name="Platform")
aws_gcp_azure = aws_gcp_azure[[col,"Platform"]].dropna()
aws_gcp_azure = aws_gcp_azure.replace({" Microsoft Azure ":"Azure"," Amazon Web Services (AWS) ":"AWS",
                                       " Google Cloud Platform (GCP) ":"GCP"})
# (platform value, trace name, bar colour)
platform_styles = [
    ("AWS", "aws", 'indianred'),
    ("GCP", "GCP", 'lightsalmon'),
    ("Azure", "Azure", 'cornflowerblue'),
]
data = []
# Iterate over the groups that actually have platform data (the melted frame
# has no NaN), in sorted order so the x axis is stable.
for age_group in sorted(aws_gcp_azure[col].unique()):
    age = aws_gcp_azure[aws_gcp_azure[col] == age_group]["Platform"].value_counts(normalize = True)*100
    for platform, trace_name, bar_color in platform_styles:
        # .get(): a group in which nobody picked this platform contributes 0
        # instead of raising KeyError.
        data.append(go.Bar(name=trace_name, x=[age_group], y=[age.get(platform, 0.0)],
                           marker_color=bar_color))
fig = go.Figure(data=data)
fig.update_layout(bargap = 0.15, bargroupgap = 0.1)
fig.update_layout(barmode='stack',showlegend=False)
fig.show()

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * Across all age ranges, AWS is the most preferred platform.
    * Only in the 18-21 age group is GCP usage high.
    

In [None]:
# World map of how many respondents per country (Q3) have a value in Q29_Part_1.
# The first row was already dropped at load time, so no further ``[1:]`` slice.
countries = kaggle_choice_df[["Q29_Part_1","Q3"]].dropna()
countries = countries.groupby("Q3").size().reset_index(name="Count")
# Build the country-name -> ISO alpha-3 lookup once and map the whole column.
# This replaces DataFrame.set_value (removed in pandas 1.0), the per-row
# rebuild of the mapping, and a chained ``.loc[2]['code']`` assignment that
# never reached the frame.  Names pycountry does not know stay missing.
mapping = {country.name: country.alpha_3 for country in pycountry.countries}
countries['code'] = countries['Q3'].map(mapping)
data = [ dict(
        type = 'choropleth',
        locations = countries['code'],
        z = countries['Count'],
        text = countries['Q3'],
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (color = 'rgb(180,180,180)',width = 0.5) ),
        colorbar = dict(autotick = False,tickprefix = '',title = 'Total Count'),)]
layout = dict(
    title = 'countries using GCP',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        # plotly projection type names are lowercase.
        projection = dict(type = 'mercator')
    )
)
fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False)

In [None]:
# World map of how many respondents per country (Q3) have a value in Q29_Part_2.
# The first row was already dropped at load time, so no further ``[1:]`` slice.
countries = kaggle_choice_df[["Q29_Part_2","Q3"]].dropna()
countries = countries.groupby("Q3").size().reset_index(name="Count")
# Build the country-name -> ISO alpha-3 lookup once and map the whole column.
# This replaces DataFrame.set_value (removed in pandas 1.0), the per-row
# rebuild of the mapping, and a chained ``.loc[2]['code']`` assignment that
# never reached the frame.  Names pycountry does not know stay missing.
mapping = {country.name: country.alpha_3 for country in pycountry.countries}
countries['code'] = countries['Q3'].map(mapping)
data = [ dict(
        type = 'choropleth',
        locations = countries['code'],
        z = countries['Count'],
        text = countries['Q3'],
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            autotick = False,
            tickprefix = '',
            title = 'Total Count'),
      ) ]

layout = dict(
    title = 'countries using AWS',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        # plotly projection type names are lowercase.
        projection = dict(type = 'mercator')
    )
)

fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False)

In [None]:
# World map of how many respondents per country (Q3) have a value in Q29_Part_3.
# The first row was already dropped at load time, so no further ``[1:]`` slice.
countries = kaggle_choice_df[["Q29_Part_3","Q3"]].dropna()
countries = countries.groupby("Q3").size().reset_index(name="Count")
# Build the country-name -> ISO alpha-3 lookup once and map the whole column.
# This replaces DataFrame.set_value (removed in pandas 1.0), the per-row
# rebuild of the mapping, and a chained ``.loc[2]['code']`` assignment that
# never reached the frame.  Names pycountry does not know stay missing.
mapping = {country.name: country.alpha_3 for country in pycountry.countries}
countries['code'] = countries['Q3'].map(mapping)
data = [ dict(
        type = 'choropleth',
        locations = countries['code'],
        z = countries['Count'],
        text = countries['Q3'],
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (color = 'rgb(180,180,180)',width = 0.5) ),
        colorbar = dict(autotick = False,tickprefix = '',title = 'Total Count'))]

layout = dict(
    title = 'countries using Azure',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        # plotly projection type names are lowercase.
        projection = dict(type = 'mercator')
    )
)

fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False)

# Salary difference between India and USA

In [None]:
# Restrict to respondents whose Q3 answer is India or the USA, then count the
# rows for every (Q3, Q5, Q10) combination.
country_salary_design = kaggle_choice_df[
    kaggle_choice_df["Q3"].isin(["India", "United States of America"])
]
country_salary_design = (
    country_salary_design
    .groupby(["Q3","Q5","Q10"])
    .size()
    .reset_index(name="count")
)
# One spline per Q10 value, coloured by Q5, with Q3 on the x axis.
fig = px.line(
    country_salary_design,
    x="Q3",
    y="count",
    color="Q5",
    line_group="Q10",
    hover_name="Q5",
    line_shape="spline",
    render_mode="svg",
)
fig.show()

<u><span style="font-family:12. Comic Sans MS; font-size:1.5em;">Insights</span></u>

    * The graph looks cluttered; I suggest selecting one designation at a time.
    * Selecting only Data Scientist: in the USA 164 people are paid between $150,000 and $199,999, while in India there are only 3.
    

## another view of salary distribution in India and USA

In [None]:
# Same (Q3, Q5, Q10) counts for India and the USA, shown as a bubble chart.
country_salary_design = kaggle_choice_df[
    kaggle_choice_df["Q3"].isin(["India", "United States of America"])
]
country_salary_design = (
    country_salary_design
    .groupby(["Q3","Q5","Q10"])
    .size()
    .reset_index(name="count")
    .rename(columns={"Q3":"Country","Q5":"Designation","Q10":"Salary"})
)
# Bubble size encodes the number of respondents in each combination.
fig = px.scatter(
    country_salary_design,
    x="Salary",
    y="Designation",
    size="count",
    color="Country",
)
fig.show()

# Top ML algorithm used

In [None]:
# Collect every non-missing answer from Q24_Part_1..12 and join them with a
# single space.  The earlier version concatenated the per-column strings
# without a separator, so the last word of one column could fuse with the
# first word of the next; it also sliced ``[1:]`` although the first row had
# already been dropped at load time.
answers = []
for i in range(1,13):
    answers.extend(kaggle_choice_df['Q24_Part_' + str(i)].dropna().values)
text = ' '.join(answers)
def generate_wordcloud(text):
    """Render ``text`` as a 1600x800 word cloud on a black 20x10 inch figure."""
    wordcloud = WordCloud(width=1600, height=800, stopwords = {'None','etc','and','other'}).generate(text)
    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud,interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
generate_wordcloud(text)

# Favourite place to study data science

In [None]:
# Collect every non-missing answer from Q13_Part_1..12 and join them with a
# single space.  The earlier version concatenated the per-column strings
# without a separator, so the last word of one column could fuse with the
# first word of the next; it also sliced ``[1:]`` although the first row had
# already been dropped at load time.
answers = []
for i in range(1,13):
    answers.extend(kaggle_choice_df['Q13_Part_' + str(i)].dropna().values)
text = ' '.join(answers)
def generate_wordcloud(text):
    """Render ``text`` as a 1600x800 word cloud on a black 20x10 inch figure."""
    wordcloud = WordCloud(width=1600, height=800, stopwords = {'None','etc','and','other'}).generate(text)
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud,interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
generate_wordcloud(text)

# Where do people apply AI in regular life

In [None]:
# Collect every non-missing answer from Q25_Part_1..7, Q26_Part_1..6 and
# Q27_Part_1..5 and join them with a single space.  The earlier version
# concatenated the per-column strings without a separator, so the last word of
# one column could fuse with the first word of the next; it also sliced
# ``[1:]`` although the first row had already been dropped at load time.
answers = []
for prefix, part_count in (('Q25_Part_', 7), ('Q26_Part_', 6), ('Q27_Part_', 5)):
    for i in range(1, part_count + 1):
        answers.extend(kaggle_choice_df[prefix + str(i)].dropna().values)
text = ' '.join(answers)
def generate_wordcloud(text):
    """Render ``text`` as a 1600x800 word cloud on a black 20x10 inch figure."""
    wordcloud = WordCloud(width=1600, height=800, stopwords = {'None','etc','and','other'}).generate(text)
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud,interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
generate_wordcloud(text)