# Intro
Welcome to the [2020 Kaggle ML & DS Survey](https://www.kaggle.com/c/kaggle-survey-2020) data set. 
![](https://storage.googleapis.com/kaggle-competitions/kaggle/23724/logos/header.png)

<span style="color: royalblue;">Please upvote the notebook if you find it helpful. Thank you.</span>

# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3

# Path

In [None]:
# Directory the survey files are read from; later cells build file names by
# appending to this string, so the trailing slash matters.
path = '/kaggle/input/kaggle-survey-2020/'
# Show the entries found directly under that directory.
os.listdir(path)

# Load Data

In [None]:
# Show the files shipped in the supplementary_data sub-folder.
os.listdir(os.path.join(path, 'supplementary_data'))

In [None]:
# Read the survey responses; low_memory=False makes pandas parse the file in
# one pass instead of chunk by chunk.
responses_csv = os.path.join(path, 'kaggle_survey_2020_responses.csv')
data = pd.read_csv(responses_csv, low_memory=False)

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

# Functions

In [None]:
def plot_bar(data, text='', rotation=False):
    """Draw a bar chart on a new 10x5 figure and show it.

    Parameters
    ----------
    data
        Object providing ``keys()`` (bar labels) and ``.values`` (bar
        heights), e.g. the pandas Series returned by ``value_counts()``.
    text
        Title of the plot, aligned to the left.
    rotation
        When truthy, the x tick labels are drawn vertically.
    """
    plt.figure(figsize=(10, 5))
    plt.bar(data.keys(), data.values)
    plt.xlabel('Category')
    plt.title(text, loc='left')
    if rotation:
        plt.xticks(rotation='vertical')
    plt.grid()
    plt.show()

# Overview
The first row of the data contains the full text of each question; the actual responses start in the second row.

In [None]:
# Row 0 carries the question text, so it is not counted as a response.
n_rows, n_cols = data.shape
print('number of samples:', n_rows - 1)
print('number of features:', n_cols)

Questions:

In [None]:
# Question text (row 0) for the columns at positions 20 to 29.
data.iloc[0, 20:30]

# Prepare Data
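Some questions allow multiple answers. Each answer option of such a question is stored in its own column (for example `Q7_Part_1`, …, `Q7_OTHER`).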

In [None]:
def split_Q(s):
    """Return the part of *s* that precedes the first underscore.

    A string without an underscore is returned unchanged, so a column name
    such as ``'Q7_Part_1'`` yields ``'Q7'`` and ``'Q1'`` stays ``'Q1'``.
    """
    question, _, _ = s.partition('_')
    return question

In [None]:
# Lookup table: original column name ('org') next to the question id ('Q')
# that split_Q derives from it.
columns = data.columns
df_cols = pd.DataFrame({'org': columns})
df_cols['Q'] = df_cols['org'].map(split_Q)

In [None]:
# Original column names at positions 7 to 19.
df_cols['org'].iloc[7:20]

In [None]:
# Empty frame created here; nothing in the visible part of the notebook fills
# or reads it.
data_prep = pd.DataFrame()

# EDA

## Editing Time

In [None]:
# Histogram of the time respondents needed, limited to runs under one hour.
feature = 'Time from Start to Finish (seconds)'
# Row 0 holds the question text, so only rows 1.. are converted to integers.
durations = data[feature].iloc[1:].map(int)
s = durations[durations < 3600]
fig = plt.figure(figsize=(10, 5))
plt.hist(s, bins=60)
plt.title('Editing Time under 1 hour', loc='left')
plt.xlabel('Seconds')
plt.grid()
plt.show()

## Age

In [None]:
# Q1: answer counts sorted by answer label.
feature = 'Q1'
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts().sort_index()
plot_bar(s, text)

## Gender

In [None]:
# Q2: answer counts sorted by label; missing answers are shown as 'Unknown'.
feature = 'Q2'
data[feature] = data[feature].fillna('Unknown')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts().sort_index()
plot_bar(s, text)


## Country

In [None]:
# Q3: answer counts sorted by label; missing answers are shown as 'Unknown'.
feature = 'Q3'
data[feature] = data[feature].fillna('Unknown')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts().sort_index()
plot_bar(s, text, rotation=True)

## Education Level

In [None]:
# Q4: answer counts sorted by label; missing answers are shown as 'Unknown'.
feature = 'Q4'
data[feature] = data[feature].fillna('Unknown')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts().sort_index()
plot_bar(s, text, rotation=True)

## Current Role

In [None]:
# Q5: answer counts sorted by label; missing answers are shown as 'Unknown'.
feature = 'Q5'
data[feature] = data[feature].fillna('Unknown')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts().sort_index()
plot_bar(s, text, rotation=True)

## Coding Experience

In [None]:
# Display order of the Q6 answers on the x axis (instead of frequency order).
Q6_list = ['< 1 years', '1-2 years', '3-5 years', '5-10 years' ,'10-20 years', '20+ years', 'I have never written code', 'Unknown']
feature = 'Q6'
data[feature] = data[feature].fillna('Unknown')
s = data[1:][feature].value_counts()
# reindex() rather than s[Q6_list]: label-list indexing raises KeyError when a
# listed category has no rows (e.g. no missing answers -> no 'Unknown' entry).
# Absent categories are plotted as 0 instead.
s = s.reindex(Q6_list, fill_value=0)
text = data.loc[0, feature]  # row 0 stores the question wording
plot_bar(s, text, True)

## Machine Learning Methods Experience

In [None]:
# Display order of the Q15 answers on the x axis (instead of frequency order).
Q15_list = ['Under 1 year', '1-2 years', '2-3 years', '3-4 years', '4-5 years' ,'5-10 years', '10-20 years',
            '20 or more years', 'I do not use machine learning methods', 'Unknown']
feature = 'Q15'
data[feature] = data[feature].fillna('Unknown')
s = data[1:][feature].value_counts()
# reindex() rather than s[Q15_list]: label-list indexing raises KeyError when a
# listed category has no rows. Absent categories are plotted as 0 instead.
s = s.reindex(Q15_list, fill_value=0)
text = data.loc[0, feature]  # row 0 stores the question wording
plot_bar(s, text, True)

## Yearly Compensation

In [None]:
# Q24: answer counts in value_counts() order (most frequent first); missing
# answers are shown as 'Unknown'.
feature = 'Q24'
data[feature] = data[feature].fillna('Unknown')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts()
plot_bar(s, text, rotation=True)

## Programming Languages

### Use On A Regular Basis

In [None]:
# Q7 is spread over 12 'Part' columns plus one 'OTHER' column. For each column
# the most frequent value and its count become one bar.
# NOTE(review): presumably every such column holds a single distinct answer
# (the option's label) or NaN - confirm against the data.
number_parts = 12
q7_columns = ['Q7_Part_' + str(part + 1) for part in range(number_parts)]
q7_columns.append('Q7_OTHER')

counts = {}
for feature in q7_columns:
    temp = data[1:][feature].value_counts()
    if temp.empty:
        # Column has no answers at all, so there is no label to plot.
        continue
    # temp is indexed by answer label, so positional access must go through
    # .iloc; plain temp[0] relies on a positional fallback that pandas has
    # deprecated.
    counts[temp.index[0]] = temp.iloc[0]

# dtype kept as float64 to match the Series the chart was built from before.
s = pd.Series(counts, dtype='float64')
# feature is 'Q7_OTHER' here; the text before the first '-' of its row-0 entry
# is used as the chart title.
text = data.loc[0, feature].split('-')[0]
plot_bar(s, text, True)

### Recommendation For Beginner

In [None]:
# Q8: answer counts in value_counts() order (most frequent first); missing
# answers are shown as 'No_Choice'.
feature = 'Q8'
data[feature] = data[feature].fillna('No_Choice')
text = data.at[0, feature]  # row 0 stores the question wording
answers = data[feature].iloc[1:]  # the responses start at row 1
s = answers.value_counts()
plot_bar(s, text, rotation=True)