# **0.0. IMPORTS**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning category from this point on.
warnings.filterwarnings( 'ignore' )

# Notebook-wide plot defaults: seaborn 'darkgrid' style and a 15x6 figure size.
sns.set(style='darkgrid')
plt.rcParams["figure.figsize"] = (15,6)

## **0.1. HELPER FUNCTION**

In [None]:
def get_label(g):
    """Annotate every bar of a bar chart with its rounded height.

    The label is drawn in white, horizontally centred on the bar and
    vertically placed at half of the bar's height.

    Parameters
    ----------
    g : Axes-like object
        Must expose ``patches`` (each with ``get_x``, ``get_width`` and
        ``get_height``) and a ``text(x, y, s, **kwargs)`` method.

    Notes
    -----
    Bars whose height is NaN or infinite are skipped, because ``round``
    raises on such values and there is no meaningful label to draw.
    """
    for bar in g.patches:
        height = bar.get_height()
        # round(nan) raises ValueError and round(inf) raises OverflowError.
        if not np.isfinite(height):
            continue
        g.text(bar.get_x() + bar.get_width() / 2.,
               height / 2,
               '{}'.format(round(height)),
               ha="center", color='white')

# **1.0. READING DATA**

In [None]:
# Load the raw response files of the 2019, 2020 and 2021 surveys, one frame per year.
df_2019 = pd.read_csv(
    "../input/kaggle-survey-2019/multiple_choice_responses.csv"
)
df_2020 = pd.read_csv(
    "/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
)
df_2021 = pd.read_csv(
    "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
)

# **2.0. DATA PREPROCESSING**

In [None]:
# Columns kept from every survey file, mapped to the readable names used in
# the rest of the notebook.
cols = {'Time from Start to Finish (seconds)': 'Time_start_to_finish',
        'Q1': 'age',
        'Q2': 'gender',
        'Q3': 'country_reside',
        'Q4': 'formal_education',
        'Q5': 'title_job'}


def _select_demographics(raw, year):
    """Return a new frame with the renamed columns of ``cols`` for one survey.

    The row labelled 0 is removed (presumably the question-text row of the
    raw CSV -- confirm against the files) and ``year`` is stored in a new
    'year' column. ``raw`` itself is never modified: every step returns a
    new object instead of editing a column slice in place.
    """
    return (raw[list(cols)]
            .rename(columns=cols)
            .drop(index=0)
            .assign(year=year))


# The year is stored as text here; it is cast to int further down.
df_19 = _select_demographics(df_2019, '2019')
df_20 = _select_demographics(df_2020, '2020')
df_21 = _select_demographics(df_2021, '2021')

# Stack the three surveys into a single frame with a fresh 0..n-1 index.
df_all = pd.concat([df_19, df_20, df_21], ignore_index=True)

# Shorten the long official country names.
# regex=False makes the patterns literal text: they contain '.' characters,
# which would act as wildcards whenever str.replace treats the pattern as a
# regular expression (the default depends on the pandas version).
df_all['country_reside'] = (
    df_all['country_reside']
    .str.replace('Iran, Islamic Republic of...', 'Iran', regex=False)
    .str.replace('United Kingdom of Great Britain and Northern Ireland',
                 'United Kingdom', regex=False)
    .replace('Hong Kong (S.A.R.)', 'Hong Kong')
)

# Use the 'Woman' / 'Man' wording for every row.
df_all['gender'] = (
    df_all['gender']
    .str.replace('Female', 'Woman', regex=False)
    .str.replace('Male', 'Man', regex=False)
)

# Country -> continent lookup used to create the 'continent' column.
# Australia and New Zealand share one label so that both land in the same bar
# of the continent chart; the label text 'Oceanian' is kept unchanged because
# the chart selects its bars by that exact string.
map_continent = {
    # Asia
    'India': 'Asia', 'Indonesia': 'Asia', 'Pakistan': 'Asia', 'Russia': 'Asia',
    'Turkey': 'Asia', 'Japan': 'Asia', 'Singapore': 'Asia', 'China': 'Asia',
    'Iran': 'Asia', 'Viet Nam': 'Asia', 'Israel': 'Asia', 'Bangladesh': 'Asia',
    'Saudi Arabia': 'Asia', 'Taiwan': 'Asia', 'Hong Kong': 'Asia',
    'South Korea': 'Asia', 'Republic of Korea': 'Asia', 'Philippines': 'Asia',
    'Sri Lanka': 'Asia', 'United Arab Emirates': 'Asia', 'Malaysia': 'Asia',
    'Thailand': 'Asia', 'Nepal': 'Asia', 'Kazakhstan': 'Asia', 'Iraq': 'Asia',
    # North America
    'Mexico': 'North America', 'United States of America': 'North America',
    'Canada': 'North America',
    # South America
    'Brazil': 'South America', 'Peru': 'South America',
    'Argentina': 'South America', 'Colombia': 'South America',
    'Chile': 'South America', 'Ecuador': 'South America',
    # Europe
    'Greece': 'Europe', 'Belgium': 'Europe', 'Poland': 'Europe',
    'Italy': 'Europe', 'Spain': 'Europe', 'United Kingdom': 'Europe',
    'France': 'Europe', 'Switzerland': 'Europe', 'Sweden': 'Europe',
    'Netherlands': 'Europe', 'Ukraine': 'Europe', 'Romania': 'Europe',
    'Austria': 'Europe', 'Belarus': 'Europe', 'Ireland': 'Europe',
    'Portugal': 'Europe', 'Denmark': 'Europe', 'Germany': 'Europe',
    'Norway': 'Europe', 'Czech Republic': 'Europe', 'Hungary': 'Europe',
    # Africa
    'Nigeria': 'Africa', 'Egypt': 'Africa', 'South Africa': 'Africa',
    'Algeria': 'Africa', 'Tunisia': 'Africa', 'Morocco': 'Africa',
    'Kenya': 'Africa', 'Uganda': 'Africa', 'Ghana': 'Africa',
    'Ethiopia': 'Africa',
    # Oceania
    'Australia': 'Oceanian', 'New Zealand': 'Oceanian',
    # Answers that do not name a country are passed through unchanged.
    'Other': 'Other',
    'I do not wish to disclose my location': 'I do not wish to disclose my location',
}

# Look up each respondent's continent; a country missing from map_continent
# yields NaN.
df_all['continent'] = df_all['country_reside'].map(map_continent)

# Store the year as an integer and convert the completion time from seconds
# to minutes, rounded to two decimals (vectorised instead of a per-row lambda).
df_all['year'] = df_all['year'].astype(int)
df_all['Time_start_to_finish'] = (
    df_all['Time_start_to_finish'].astype(int).div(60).round(2)
)


In [None]:
# Show one randomly drawn row as a quick look at the combined frame.
df_all.sample()

# **3.0. EXPLORATORY DATA ANALYSIS (EDA)**

# **3.1. ANALYSIS ABOUT SURVEY**

## **3.1.1. TIME TO FILL THE SURVEY**

In [None]:
# Report the shortest and longest values of the completion-time column.
completion_time = df_all["Time_start_to_finish"]
fastest = completion_time.min()
slowest = completion_time.max()
print(f'minimum time to complete the survey: {fastest} minutes')
print(f'maximum time to complete the survey: {slowest} minutes')

## **3.1.2. NUMBER OF SURVEY PARTICIPANTS PER YEAR**

In [None]:
# Participants per survey year, counted as non-null 'age' answers.
df_year = (
    df_all
    .groupby('year', as_index=False)
    .agg(total=('age', 'count'))
)

In [None]:
# One bar per survey year, labelled with its participant count.
g = sns.barplot(data=df_year, x='year', y='total', palette='Set1')
g.set_title('NUMBER OF SURVEY PARTICIPANTS PER YEAR')
get_label(g)

# **3.2. ANALYSIS ABOUT PARTICIPANTS OF SURVEY**

## **3.2.1 AGE**

In [None]:
# Participants per age bracket and year, counted as non-null 'gender' answers.
df_age = (
    df_all
    .groupby(['age', 'year'], as_index=False)
    .agg(total=('gender', 'count'))
)

In [None]:
# Age brackets on the x axis, one bar colour per survey year.
g = sns.barplot(data=df_age, x='age', y='total', hue='year', palette='Set2')
g.set_title('AGE OF PARTICIPANTS PER YEAR');

**In 2021 there was an increase in survey participants in all age groups compared to 2020 and 2019.**

## **3.2.2. GENDER**

In [None]:
# Leave out 'Prefer not to say', then count non-null 'age' answers per gender and year.
mask = df_all['gender'].ne('Prefer not to say')
df_gender = (
    df_all.loc[mask]
    .groupby(['gender', 'year'], as_index=False)
    .agg(total=('age', 'count'))
)

In [None]:
# Fixed left-to-right order of the gender bars.
order = ['Man', 'Woman', 'Nonbinary', 'Prefer to self-describe']
g = sns.barplot(
    data=df_gender,
    x='gender',
    y='total',
    hue='year',
    order=order,
    palette='Set2',
)
g.set_title('GENDER OF PARTICIPANTS PER YEAR');

**The number of male and female participants also increased in 2021, compared to 2020 and 2019. There is still a large difference between the number of women and men, according to the Kaggle survey.**

## **3.2.3. TITLE JOB**

In [None]:
# Participants per job title and year, counted as non-null 'age' answers.
title_job = (
    df_all
    .groupby(['title_job', 'year'], as_index=False)
    .agg(total=('age', 'count'))
)

In [None]:
# Job titles on the x axis; tick labels are turned vertical.
g = sns.barplot(data=title_job, x='title_job', y='total', hue='year', palette='Set2')
g.set_title('TITLE JOB OF PARTICIPANTS PER YEAR')
plt.xticks(rotation=90);

**Students are the largest group of participants in Kaggle's survey, and their number has been growing every year.**

## **3.2.4. PROGRAMMING LANGUAGE**

In [None]:
# This code was inspired by this notebook: https://www.kaggle.com/ruchi798/kaggle-ml-ds-survey-analysis
def get_count(question_num, parts, data):
    """Tally the most frequent answer of every column of a multi-part question.

    The columns read are ``Q<question_num>_Part_1`` up to
    ``Q<question_num>_Part_<parts - 1>`` (``parts`` is an exclusive upper
    bound) followed by ``Q<question_num>_OTHER``.

    Parameters
    ----------
    question_num : int or str
        Question number used to build the column names.
    parts : int
        One more than the number of the last ``_Part_`` column to read.
    data : pandas.DataFrame
        Frame holding the question columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Category' (most frequent value of a column) and 'Value'
        (how often it occurs), sorted by 'Value' in descending order.
        Columns without any non-null value are left out.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``data``.
    """
    prefix = 'Q' + str(question_num)
    questions = [prefix + '_Part_' + str(j) for j in range(1, parts)]
    questions.append(prefix + '_OTHER')

    categories = []
    values = []
    for column in questions:
        counts = data[column].value_counts()
        # An all-NaN column has no top answer; indexing it would raise.
        if counts.empty:
            continue
        categories.append(counts.index[0])
        # .iloc is explicitly positional; counts[0] would be treated as a
        # label lookup on the answer-text index.
        values.append(counts.iloc[0])

    combined_df = pd.DataFrame({'Category': categories, 'Value': values})
    return combined_df.sort_values(['Value'], ascending=False)

In [None]:
# Question 7 counts for the 2020 and 2021 surveys, tagged with their year and stacked.
df1 = get_count(7, 12, df_2020).assign(year=2020)
df2 = get_count(7, 12, df_2021).assign(year=2021)
df_programing = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Slightly taller figure than the notebook default for this chart.
plt.figure(figsize=(15, 7))
sns.barplot(x='Category', y='Value', hue='year', data=df_programing, palette='Set2')
# Trailing semicolon keeps the Text repr out of the cell output, like the
# other plotting cells.
plt.title('PROGRAMMING LANGUAGE 2020 AND 2021', size=18);

## **3.2.5 INTEGRATED DEVELOPMENT ENVIRONMENT 2020 and 2021**

In [None]:
# Question 9 counts for 2020 and 2021 (the two surveys are read with a
# different number of parts), tagged with their year and stacked.
integrated_development_2020 = get_count(9, 11, df_2020)
integrated_development_2021 = get_count(9, 12, df_2021)

integrated_development_2020['year'] = 2020
integrated_development_2021['year'] = 2021
df_ide = pd.concat([integrated_development_2020, integrated_development_2021], ignore_index=True)

# Short display names for the two verbose labels. The dict is no longer named
# `map`, so the builtin stays usable, and .replace() leaves every label that
# is not listed untouched instead of turning it into NaN.
ide_labels = {'Jupyter (JupyterLab, Jupyter Notebooks, etc)': 'Jupyter Notebook',
              'Visual Studio Code (VSCode)': 'VSCode'}

# NOTE(review): if a year contains both Jupyter labels, two rows end up named
# 'Jupyter Notebook' and sns.barplot aggregates them -- confirm that is intended.
df_ide['Category'] = df_ide['Category'].str.strip(' ').replace(ide_labels)

In [None]:
# Horizontal bars: one row per IDE, one colour per year.
sns.barplot(data=df_ide, x='Value', y='Category', hue='year', palette='Set2')
plt.title('INTEGRATED DEVELOPMENT ENVIRONMENT 2020 and 2021', size=18);

## **3.2.6 MANAGED MACHINE LEARNING PRODUCTS THE PARTICIPANTS WISH TO BECOME FAMILIAR**

In [None]:
def get_count_2(question_num, parts, data):
    """Tally the most frequent answer of every ``_B_`` column of a question.

    The columns read are ``Q<question_num>_B_Part_1`` up to
    ``Q<question_num>_B_Part_<parts - 1>`` (``parts`` is an exclusive upper
    bound) followed by ``Q<question_num>_B_OTHER``.

    Parameters
    ----------
    question_num : int or str
        Question number used to build the column names.
    parts : int
        One more than the number of the last ``_B_Part_`` column to read.
    data : pandas.DataFrame
        Frame holding the question columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Category' (most frequent value of a column) and 'Value'
        (how often it occurs), sorted by 'Value' in descending order.
        Columns without any non-null value are left out.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``data``.
    """
    prefix = 'Q' + str(question_num) + '_B'
    questions = [prefix + '_Part_' + str(j) for j in range(1, parts)]
    questions.append(prefix + '_OTHER')

    categories = []
    values = []
    for column in questions:
        counts = data[column].value_counts()
        # An all-NaN column has no top answer; indexing it would raise.
        if counts.empty:
            continue
        categories.append(counts.index[0])
        # .iloc is explicitly positional; counts[0] would be treated as a
        # label lookup on the answer-text index.
        values.append(counts.iloc[0])

    combined_df = pd.DataFrame({'Category': categories, 'Value': values})
    return combined_df.sort_values(['Value'], ascending=False)

In [None]:
# Question 35 of the 2020 file and question 38 of the 2021 file, tagged with
# their year and stacked into one frame.
df_Q35_2020 = get_count_2(35, 10, df_2020).assign(year=2020)
df_Q38_2021 = get_count_2(38, 11, df_2021).assign(year=2021)
df_become_falimiar = pd.concat([df_Q35_2020, df_Q38_2021], ignore_index=True)

# Trim the spaces surrounding each label.
df_become_falimiar['Category'] = df_become_falimiar['Category'].map(
    lambda label: label.strip(' ')
)

In [None]:
# Horizontal bars: one row per product, one colour per year.
sns.barplot(data=df_become_falimiar, x='Value', y='Category', hue='year', palette='Set2')
plt.title('MANAGED MACHINE LEARNING PRODUCTS THE PARTICIPANTS WISH TO BECOME FAMILIAR');

**In 2020 and 2021, TensorBoard is the managed Machine Learning product that participants most want to learn.**

**In 2021, two new technologies appeared that participants have been aiming for in the last 2 years: MLFlow and ClearML.**

# **3.3. GEOGRAPHIC ANALYSIS**

## **3.3.1. COUNTRY**

In [None]:
# Participants per country and year, counted as non-null 'gender' answers;
# the catch-all 'Other' answer is left out.
mask_country = (df_all['country_reside'] != 'Other')
df_country = (
    df_all.loc[mask_country]
    .groupby(['country_reside', 'year'], as_index=False)['gender']
    .count()
    .rename(columns={'gender': 'total'})
)

# Select the 10 countries with the most participants over all years and keep
# every yearly row of those countries. Taking the 30 largest (country, year)
# rows instead can leave a country with only some of its years in the chart.
top_countries = (
    df_country.groupby('country_reside')['total'].sum().nlargest(10).index
)
df_country_10 = (
    df_country[df_country['country_reside'].isin(top_countries)]
    .sort_values('total', ascending=False)  # biggest bars first on the x axis
)

In [None]:
# Slightly taller figure than the notebook default; tick labels are turned vertical.
plt.figure(figsize=(15, 7))
g = sns.barplot(
    data=df_country_10,
    x='country_reside',
    y='total',
    hue='year',
    palette='magma',
)
g.set_title('COUNTRY OF PARTICIPANTS PER YEAR')
plt.xticks(rotation=90);

**India is the top country of participants, ranking first in the 2019, 2020 and 2021 surveys.**

## **3.3.2. CONTINENT**

In [None]:
# Leave out the two non-geographic answers, then count non-null 'age'
# answers per continent and year.
not_a_continent = ['Other', 'I do not wish to disclose my location']
mask = ~df_all['continent'].isin(not_a_continent)
df1_contnent = (
    df_all.loc[mask]
    .groupby(['continent', 'year'], as_index=False)
    .agg(total=('age', 'count'))
)

In [None]:
# Fixed top-to-bottom order of the continent bars.
order = ['Asia', 'North America', 'Europe', 'Africa', 'South America', 'Oceanian']
g = sns.barplot(
    data=df1_contnent,
    x='total',
    y='continent',
    hue='year',
    order=order,
    palette='magma',
)
plt.title('CONTINENT OF PARTICIPANTS PER YEAR');

**The Asian continent loves Kaggle, mainly India.**