# Kaggle 2020 Survey Results - Analysis

In [None]:
# Essentials
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000



import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from collections import defaultdict


Rename columns to make it easier to use them in the analysis.

In [None]:
# Short, analysis-friendly column names. The list is handed to
# pd.read_csv(names=...) when the survey is loaded, so its order has to match
# the column order of the CSV file.
# Options of the same multiple-choice question share a prefix (e.g. 'Lang_',
# 'MLalg_', 'ML_'); prefix() relies on that to select them by name.
# NOTE(review): 'BigData_GoogleGirestore', 'AutoMLReg_DaraRobot' and
# 'AutoMLReg_Xceessiv' look like typos (compare 'Supp_BigData_GoogleFirestore',
# 'Supp_AutoMLTool_Datarobot', 'Supp_AutoMLTool_Xcessiv'). They are column keys,
# so rename them only together with every place that references them.
names=['Duration', 'Age','Gender','Country', 'Education', 'Title', 'Num_Years_Code',
                              'Lang_Python', 'Lang_R','Lang_SQL','Lang_C_','Lang_CPlus','Lang_Java_','Lang_Javascript',
                              'Lang_Julia','Lang_Swift','Lang_Bash','Lang_MATLAB','Lang_None','Lang_Other','LangRec_First',
                              'Env_Jupyter', 'Env_RStudio','Env_Visual','Env_VScode','Env_PyCharm','Env_Spyder',
                              'Env_Notepad','Env_Sublime','Env_Vim','Env_MATLAB','Env_None','Env_Other',
                              'Note_Kaggle', 'Note_Colab', 'Note_Azure', 'Note_Paperspace', 'Note_Binder',
                              'Note_Ocean', 'Note_IBM', 'Note_Amzn_Sage', 'Note_Amzn_EMR', 'Note_Google_AI',
                              'Note_Google_Datalab', 'Note_Databricks', 'Note_None', 'Note_Other', 
                              'Platform', 'Hardware_GPU', 'Hardware_TPU', 'Hardware_None', 'Hardware_Other',
                              'Num_TPU', 'Visual_Matplotlib', 'Visual_Seaborn', 'Visual_Plotly', 'Visual_Ggplot',
                              'Visual_Shiny', 'Visual_D3', 'Visual_Altair', 'Visual_Bokeh', 'Visual_Geoplotlib',
                              'Visual_Leaflet', 'Visual_None', 'Visual_Other', 'Num_Years_ML', 'ML_SKLearn',
                             'ML_TF', 'ML_Keras', 'ML_PyTorch', 'ML_FastAI', 'ML_MXNet', 'ML_Xgboost', 'ML_LightGBM',
                             'ML_CatBoost', 'ML_Prophet', 'ML_H2O', 'ML_Caret', 'ML_Tidy', 'ML_JAX', 'ML_None',
                             'ML_Other', 'MLalg_LR', 'MLalg_RF', 'MLalg_GB', 'MLalg_BA', 'MLalg_EA', 'MLalg_NN', 'MLalg_CNN', 'MLalg_GAN',
                             'MLalg_RNN', 'MLalg_TN', 'MLalg_None', 'MLalg_Other', 'CV_GP', 'CV_IS', 'CV_OD', 'CV_IC', 'CV_GAN',
                             'CV_None', 'CV_Other', 'NLP_WE', 'NLP_EDM', 'NLP_CE', 'NLP_TLM', 'NLP_None', 'NLP_Other', 
                             'Size_Company', 'Size_DS', 'Current_ML', 'Work_Analysis', 'Work_Infra', 'Work_ML_Prototype',
                             'Work_ML', 'Work_ML_Improve', 'Work_Research', 'Work_None', 'Work_Other', 'Current_Comp',
                             'Money_Spent', 'CloudPlt_AWS', 'CloudPlt_Azure', 'CloudPlt_GCP', 'CloudPlt_IBM', 'CloudPlt_Oracle', 
                             'CloudPlt_SAP', 'CloudPlt_Salesforce', 'CloudPlt_VMware', 'CloudPlt_Alibaba', 'CloudPlt_Tencent',
                             'CloudPlt_None','CloudPlt_Other', 'Cloud_EC2', 'Cloud_Lambda', 'Cloud_Elastic', 'Cloud_Azure',
                             'Cloud_AzureContainer', 'Cloud_AzureFunctions', 'Cloud_GoogleEngine', 'Cloud_GoogleFunc',
                             'Cloud_GoogleRun', 'Cloud_GoogleApp', 'Cloud_None', 'Cloud_Other', 'MLPlt_Sagemaker', 'MLPlt_AmazonForecast',
                             'MLPlt_AmazonRekog', 'MLPlt_AzureML', 'MLPlt_AzureCogn', 'MLPlt_GoogleAI', 'MLPlt_GoogleVideoAI',
                             'MLPlt_GoogleNLP', 'MLPlt_GoogleVisionAI', 'MLPlt_None', 'MLPlt_Other', 'BigData_MySQL', 'BigData_PGSQL',
                             'BigData_SQLite', 'BigData_Oracle', 'BigData_MongoDB', 'BigData_Snowflake', 'BigData_IBM',
                             'BigData_MSSQL', 'BigData_Access', 'BigData_Azure', 'BigData_Redshift', 'BigData_Athena',
                             'BigData_Dynamo', 'BigData_GoogleBigQuery', 'BigData_GoogleSQL', 'BigData_GoogleGirestore',
                             'BigData_None', 'BigData_Other', 'BigData_Most', 'BI_Amazon', 'BI_Power', 'BI_Google', 'BI_Looker',
                             'BI_Tableau', 'BI_Salesforce', 'BI_Einstein', 'BI_Qlik', 'BI_Domo', 'BI_TIBCO', 'BI_Alteryx',
                             'BI_Sisense', 'BI_SAP', 'BI_None', 'BI_Other', 'BI_Most', 'AutoML_Aug', 'AutoML_FE', 'AutoML_MS', 'AutoML_MA',
                             'AutoML_HPT', 'AutoML_Pipe', 'AutoML_None', 'AutoML_Other', 'AutoMLReg_Google', 'AutoMLReg_H2O',
                             'AutoMLReg_Databricks', 'AutoMLReg_DaraRobot', 'AutoMLReg_Tpot', 'AutoMLReg_Keras', 
                             'AutoMLReg_Sklearn', 'AutoMLReg_ML', 'AutoMLReg_Xceessiv', 'AutoMLReg_MLbox', 'AutoMLReg_None',
                             'AutoMLReg_Other', 'MLExp_Neptune', 'MLExp_WB', 'MLExp_Comet', 'MLExp_Sacred', 'MLExp_TB', 
                             'MLExp_Guild', 'MLExp_Polyaxon', 'MLExp_Trains', 'MLExp_Monitor', 'MLExp_None', 'MLExp_Other',
                             'Deploy_Plotly', 'Deploy_Streamlit', 'Deploy_NBViewer', 'Deploy_GitHub', 'Deploy_Blog',
                             'Deploy_Kaggle', 'Deploy_Colab', 'Deploy_Shiny', 'Deploy_None', 'Deploy_Other', 'Course_Coursera',
                             'Course_edX', 'Course_Kaggle', 'Course_DataCamp', 'Course_FastAI', 'Course_Udacity', 'Course_Udemy',
                             'Course_Linkedin', 'Course_Cloud', 'Course_Uni', 'Course_None', 'Course_Other', 'Primary_Analysis',
                             'Fav_Media_Twitter', 'Fav_Media_Email', 'Fav_Media_Reddit', 'Fav_Media_Kaggle', 'Fav_Media_Course',
                             'Fav_Media_Youtube', 'Fav_Media_Podcast', 'Fav_Media_Blog', 'Fav_Media_Journal', 'Fav_Media_Slack',
                             'Fav_Media_None', 'Fav_Media_Other', 'Supp_Cloud_AWS', 'Supp_Cloud_Azure', 'Supp_Cloud_Google',
                             'Supp_Cloud_IBM', 'Supp_Cloud_Oracle', 'Supp_Cloud_SAP', 'Supp_Cloud_VMware', 'Supp_Cloud_Salesforce',
                             'Supp_Cloud_Alibaba', 'Supp_Cloud_Tencent', 'Supp_Cloud_None', 'Supp_Cloud_Other',
                             'Supp_CloudPlt_EC2', 'Supp_CloudPlt_Lambda', 'Supp_CloudPlt_Elastic', 'Supp_CloudPlt_Azure',
                             'Supp_CloudPlt_AzureContainer', 'Supp_CloudPlt_AzureFunc', 'Supp_CloudPlt_GoogleCompute',
                             'Supp_CloudPlt_GoogleFunc', 'Supp_CloudPlt_GoogleRun', 'Supp_CloudPlt_GoogleApp', 'Supp_CloudPlt_None',
                             'Supp_CloudPlt_Other', 'Supp_ML_Sagemaker', 'Supp_ML_AmznForecast', 'Supp_ML_AmznRekog',
                             'Supp_ML_Azure', 'Supp_ML_AzureCogn', 'Supp_ML_GoogleAI', 'Supp_ML_GoogleVideoAI', 'Supp_ML_GoogleNLP',
                             'Supp_ML_GoogleVisionAI', 'Supp_ML_None', 'Supp_ML_Other', 'Supp_BigData_MySQL', 'Supp_BigData_PGSQL',
                             'Supp_BigData_SQLite', 'Supp_BigData_Oracle', 'Supp_BigData_MongoDB', 'Supp_BigData_Snowflake',
                             'Supp_BigData_IBM', 'Supp_BigData_MSSQL', 'Supp_BigData_Access', 'Supp_BigData_Azure', 'Supp_BigData_Redshift',
                             'Supp_BigData_Athena', 'Supp_BigData_Dynamo', 'Supp_BigData_GoogleBigQuery', 'Supp_BigData_GoogleSQL',
                             'Supp_BigData_GoogleFirestore', 'Supp_BigData_None', 'Supp_BigData_Other', 'Supp_BI_Power', 'Supp_BI_Amzn',
                             'Supp_BI_Google', 'Supp_BI_Looker', 'Supp_BI_Tableau', 'Supp_BI_Salesforce', 'Supp_BI_Einstein', 
                             'Supp_BI_Qlik', 'Supp_BI_Domo', 'Supp_BI_TIBCO', 'Supp_BI_Alteryx', 'Supp_BI_Sisense', 
                             'Supp_BI_SAP', 'Supp_BI_None', 'Supp_BI_Other', 'Supp_AutoML_Aug', 'Supp_AutoML_FE',
                             'Supp_AutoML_MS', 'Supp_AutoML_MA', 'Supp_AutoML_HPT', 'Supp_AutoML_Pipe',
                             'Supp_AutoML_None', 'Supp_AutoML_Other', 'Supp_AutoMLTool_Google', 'Supp_AutoMLTool_H2O',
                             'Supp_AutoMLTool_Databricks', 'Supp_AutoMLTool_Datarobot', 'Supp_AutoMLTool_Tpot',
                             'Supp_AutoMLTool_Keras', 'Supp_AutoMLTool_Sklearn', 'Supp_AutoMLTool_Auto',
                             'Supp_AutoMLTool_Xcessiv', 'Supp_AutoMLTool_MLbox', 'Supp_AutoMLTool_None',
                             'Supp_AutoMLTool_Other', 'Supp_MLExp_Neptune', 'Supp_MLExp_WB', 'Supp_MLExp_Comet',
                             'Supp_MLExp_Sacred', 'Supp_MLExp_TB', 'Supp_MLExp_Guild', 
                             'Supp_MLExp_Polyaxon', 'Supp_MLExp_Trains', 'Supp_MLExp_Monitor',
                             'Supp_MLExp_None', 'Supp_MLExp_Other']
# Cell output: total number of names (presumably should equal the number of
# columns in the survey CSV -- confirm against the file).
len(names)

In [None]:
# Number of distinct column names; it equals len(names) only when no name
# appears twice in the list.
len({*names})

In [None]:
# Load the responses with our own column names. The first two lines of the
# file are skipped (presumably the original header row and the question-text
# row -- confirm against the CSV).
survey = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    names=names,
    sep=',',
    skiprows=2,
)
# Preview the first rows as the cell output.
survey.head()

In [None]:
# (number of rows, number of columns) of the loaded survey frame.
survey.shape

In [None]:
# Missing-value count for every column.
survey.isnull().sum()

# EDA

First, let's look at the age distribution of participants.

In [None]:
# Respondents per age bracket, with the brackets in sorted (ascending) order.
# The variable is passed by keyword: seaborn deprecated positional data
# vectors and newer releases no longer interpret them as `x`.
sns.countplot(
    x='Age',
    data=survey,
    order=survey['Age'].value_counts().sort_index().index,
)

In [None]:
# Share of respondents in each age bracket, ordered by bracket label.
survey.Age.value_counts(normalize=True).sort_index()

We see that around 70% of the participants are below the age of 35; however, we do have some people above 60!

Next, let's see gender distribution.

In [None]:
# Respondents per gender. countplot draws on the current figure, so the
# figure created here controls the plot size.
plt.figure(figsize=(10, 10))
# Keyword `x`/`data`: positional data vectors are deprecated in seaborn and
# are not read as `x` by newer releases.
sns.countplot(x='Gender', data=survey)

In [None]:
# Share of respondents per gender answer, most common first.
survey.Gender.value_counts(normalize=True)

There is an overwhelming majority of men on Kaggle :)

In [None]:
# Number of distinct (non-null) values in the Country column.
survey.Country.nunique()

In [None]:
# Labels of the ten most frequent Country values (explicit positional slice).
survey['Country'].value_counts().iloc[:10].index

There are people from at least 55 different countries, which is great for diversity! However, when we look at the distribution, we see that 29% come from India and 11% come from the US, with the remaining 60% coming from 53+ countries.

In [None]:
# Respondent counts for the five most frequent countries only.
plt.figure(figsize=(10, 10))
# Keyword `x`/`data`: positional data vectors are deprecated in seaborn and
# are not read as `x` by newer releases.
sns.countplot(
    x='Country',
    data=survey,
    order=survey.Country.value_counts().iloc[:5].index,
)

In [None]:
# Respondents per education level. `catplot` replaces the deprecated (and
# later removed) `factorplot`; the variable is passed by keyword.
ed = sns.catplot(x="Education", data=survey, aspect=1.5, kind="count")
# Rotate the tick labels so they stay readable.
ed.set_xticklabels(rotation=90)

In [None]:
# Share of each education answer among respondents who answered the question
# (value_counts leaves out missing values).
survey['Education'].value_counts(normalize=True)

Out of the participants who answered the Education question, 87% have a Bachelor's degree or above.

In [None]:
# Respondents per job title.
# No plt.figure() call here: the grid plot creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure. The plot size is
# governed by catplot's `height`/`aspect` arguments.
# `catplot` replaces the deprecated (and later removed) `factorplot`.
title = sns.catplot(x="Title", data=survey, aspect=1.5, kind="count")
# Rotate the tick labels so they stay readable.
title.set_xticklabels(rotation=90)

In [None]:
# Share of each job title among respondents who answered the question.
survey['Title'].value_counts(normalize=True)

27% of participants are students, 14% are data scientists (yayy!) and 10% are software engineers.

In [None]:
# Respondents per coding-experience bracket.
# No plt.figure() call here: the grid plot creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure. The plot size is
# governed by catplot's `height`/`aspect` arguments.
# `catplot` replaces the deprecated (and later removed) `factorplot`.
years = sns.catplot(x="Num_Years_Code", data=survey, aspect=1.5, kind="count")
# Tilt the tick labels so they stay readable.
years.set_xticklabels(rotation=60)



We have quite a few people with a lot of coding experience; however, the majority are beginners (most have been coding for less than 5 years).

Now, let's look at tools, algorithms and frameworks people use based on the multiple choice questions' answers.

In [None]:
def prefix(keyword):
    """Plot and return the top-answer count of every column starting with ``keyword``.

    For each column of the global ``survey`` frame whose name starts with
    ``keyword``, the most frequent non-null value and its count are collected.
    The counts are drawn as a bar chart (largest first) and the mapping is
    returned.

    Parameters
    ----------
    keyword : str
        Regular expression that must match at the *start* of a column name,
        e.g. ``'Lang_'``.

    Returns
    -------
    dict
        ``{most frequent value: its count}``, ordered by descending count.
        Columns that share the same most frequent value overwrite each other,
        and columns without any non-null value are left out.
    """
    # DataFrame.filter(regex=...) searches anywhere in the name, so a bare
    # 'ML_' would also select 'Work_ML_*' and 'Supp_ML_*'. Anchoring the
    # pattern restricts the selection to names that really start with it.
    selected = survey.filter(regex='^(?:' + keyword + ')')

    top_counts = {}
    for column in selected.columns:
        # value_counts() is sorted by frequency, so position 0 is the top answer.
        counts = selected[column].value_counts()
        if counts.empty:
            # Entirely-NaN column: there is no top answer to report.
            continue
        top_counts[counts.index[0]] = int(counts.iloc[0])

    plot_dict = dict(sorted(top_counts.items(), key=lambda item: item[1], reverse=True))

    plt.figure(figsize=(20, 20))
    # Hand matplotlib plain lists rather than dict views.
    plt.bar(list(plot_dict.keys()), height=list(plot_dict.values()))
    plt.xticks(rotation=90)
    return plot_dict

This function takes a keyword (which will be searched in column names - each multiple choice question has its own keyword) and outputs a graph with counts of each answer to the question.

In [None]:
# Bar chart and counts for the columns selected by 'Lang_'.
prefix('Lang_')

In [None]:
# Bar chart and counts for the columns selected by 'MLalg_'.
prefix('MLalg_')

In [None]:
# Bar chart and counts for the columns selected by 'ML_'.
prefix('ML_')

Python is by far the most popular tool Kagglers use, Linear/Logistic Regression is still the most common technique used in data science, and scikit-learn is the most commonly used library!