In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import matplotlib
from scipy import stats
pd.set_option('display.max_columns', None)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import dask.dataframe as dd
import squarify
import geopandas as gpd

# Lazily open the survey responses CSV with dask; nothing is loaded into
# memory until `.compute()` is called on `df_origin`.
df_origin = dd.read_csv('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

In [None]:
# Materialise the dask frame as a pandas DataFrame. dask restarts the row
# index at 0 in every partition, so rebuild a unique 0..n-1 index first;
# otherwise `drop(index=0)` would remove the first row of *each* partition.
df = df_origin.compute().reset_index(drop=True)
# Drop the row labelled 0.
# NOTE(review): presumably the question-text row of the survey CSV -- confirm.
df = df.drop(index=0)
df.shape

In [None]:
# Q[i] lists the column names that belong to question i; Q[0] is the first
# column of the frame. Questions 1-6 are treated as single columns named
# 'Q<i>'; from question 7 on, every column named 'Q<i>' or 'Q<i>_<suffix>'
# is collected.
Q = [[df.columns[0]]]
for i in range(1, 40):
    name = 'Q' + str(i)
    if i <= 6:
        Q.append([name])
    else:
        # Match the exact name or the 'Q<i>_' prefix so that a short prefix
        # such as 'Q7' can never pick up columns of a longer one ('Q70...').
        Q.append([col for col in df.columns
                  if col == name or col.startswith(name + '_')])

# Respondent demographic

## Age and Gender Analysis

### Age 

In [None]:
#df.loc[df.Q1=='70+','Q1']='70-70'
#df['Q1_l']=df['Q1'].apply(lambda x: int(x.split('-')[0]))
#df['Q1_h']=df['Q1'].apply(lambda x: int(x.split('-')[1]))
#df['ave_age']=(df['Q1_l']+df['Q1_h'])/2

In [None]:
#df['Q2'].value_counts()

In [None]:
# Stacked bar chart of respondents per age bracket (Q1) for the two genders
# 'Man' and 'Woman' (Q2); other Q2 answers are not plotted.
age_counts_men = df.loc[df['Q2'] == 'Man', 'Q1'].value_counts()
age_counts_women = df.loc[df['Q2'] == 'Woman', 'Q1'].value_counts()

# Align both series on the same sorted set of age brackets. Stacking the two
# lists positionally would silently mis-assign bars if a bracket were missing
# for one gender.
index = sorted(set(age_counts_men.index) | set(age_counts_women.index))
man_respondent = age_counts_men.reindex(index, fill_value=0).tolist()
woman_respondent = age_counts_women.reindex(index, fill_value=0).tolist()

width = 0.35
p1 = plt.bar(index, man_respondent, width)
p2 = plt.bar(index, woman_respondent, width, bottom=man_respondent)
plt.xlabel('Age')
plt.ylabel('Number of respondents')
plt.title('Count(Respondents) by group and gender')
plt.legend((p1[0], p2[0]), ('Men', 'Women'))
plt.show()

### The majority of respondents are between 18 and 25, with male respondents making up most of that group.

## Region Analysis

In [None]:
# Load the world-countries shapefile; its COUNTRY column is matched against
# the survey's country answers (Q3) in the following cells.
world_map = gpd.read_file('../input/worldmap/World_Countries.shp')
world_map.head()

In [None]:
# Print every survey country (Q3) that has no identically named entry in the
# shapefile's COUNTRY column, so it can be renamed before joining.
# The lookup set is built once instead of rebuilding a list on every iteration.
known_countries = set(world_map['COUNTRY'].tolist())
for item in df['Q3'].unique():
    if item not in known_countries:
        print(item)

In [None]:
# Rewrite the survey's long-form country names (Q3) to shorter names.
# NOTE(review): the short names are presumably the spellings used in the
# shapefile's COUNTRY column -- confirm against the unmatched-country check.
country_aliases = {
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Viet Nam': 'Vietnam',
    'Republic of Korea': 'South Korea',
    'Iran, Islamic Republic of...': 'Iran',
}
df['Q3'] = df['Q3'].replace(country_aliases)

In [None]:
# Replace the answer 'Other' in Q3 with a named country, sampled in
# proportion to the observed share of each named country.
# A fixed seed keeps the imputed countries (and every figure derived from
# them) identical across "Restart & Run All".
rng = np.random.default_rng(42)
s = df.loc[df['Q3'] != 'Other', 'Q3'].value_counts(normalize=True)
missing = df['Q3'] == 'Other'
df.loc[missing, 'Q3'] = rng.choice(s.index.to_numpy(),
                                   size=int(missing.sum()),
                                   p=s.to_numpy())

In [None]:
# Attach the number of respondents per country to the country geometries.
# `value_counts()` names its result after the column ('Q3') on pandas < 2.0
# but 'count' on pandas >= 2.0; pin the name (and clear the index name) so the
# joined column is always called 'Q3', which is what the map cell plots.
country_counts = df['Q3'].value_counts().rename('Q3').rename_axis(None)
merge = world_map.join(country_counts, on='COUNTRY', how='right')

In [None]:
# Choropleth of the 'Q3' column of `merge` (respondent count per country).
# The colour classes use the explicit bin edges below ('user_defined' scheme)
# instead of an automatically derived classification.
ax=merge.plot(column='Q3',
              cmap='OrRd',
              scheme='user_defined',
              classification_kwds={'bins':[100,500,1000,1500,3000,5000]},
              figsize=(20,10),
              legend=True,
              edgecolor='black',
              linewidth=0.4)

### India and the United States have the most respondents.

## Job title and education

In [None]:
# Pie chart of the answers to Q4.
# Q5 holds the job role (it contains 'Student' / 'Currently not employed'),
# so Q4 is the education question of this section; the title previously said
# "job titles" and has been corrected.
q4_counts = df['Q4'].value_counts()  # computed once, used for sizes and labels

fig1, ax1 = plt.subplots()
fig1.set_size_inches(12,8)
ax1.pie(q4_counts.values,
        labels=q4_counts.index,
        autopct='%1.1f%%',shadow=True, startangle=190)
plt.title('Pie chart for different education levels',fontsize=20)
plt.show()

In [None]:
# Bar chart of the answers to Q5.
# Q5 holds the job role (values such as 'Student' and 'Currently not
# employed'), so the title previously saying "education" has been corrected.
q5_counts = df['Q5'].value_counts()  # computed once, used for heights and labels
labels = q5_counts.index

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
fig.set_size_inches(12,8)
# No legend is drawn, so the bars need no `label`.
rects1 = ax.bar(x, q5_counts.values, width)
ax.set_ylabel('Respondents Number',fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(labels,fontsize=12,rotation=45,ha='right')
plt.title('bar chart for different job titles',fontsize=20)
plt.show()

## split employed group

In [None]:
# Split respondents by their Q5 answer: 'Student' and 'Currently not employed'
# go to `df_unemp`; everyone else (including missing Q5) goes to `df_emp`.
not_working = df['Q5'].isin(['Student', 'Currently not employed'])
df_unemp = df[not_working]
df_emp = df[~not_working]

# Salary Analysis, for employed people

## Which kinds of Kagglers earn a higher salary?

In [None]:
# Keep only the employed respondents who reported a salary bracket (Q24).
df_emp = df_emp.dropna(subset=['Q24'])
df_emp.shape

In [None]:
# Normalise the Q24 salary brackets to the form 'low-high' (no thousands
# separators, no '$'), then store the midpoint of each bracket in 'Q24_avg'.
# The open-ended top bracket is collapsed to 500000-500000.
salary_text = df_emp['Q24'].str.replace(',', '', regex=False)
salary_text = salary_text.replace({'> $500000': '500000-500000',
                                   '$0-999': '0-999'})
df_emp['Q24'] = salary_text

salary_bounds = salary_text.str.split('-', expand=True)
df_emp['Q24_avg'] = (salary_bounds[0].astype(int) + salary_bounds[1].astype(int)) / 2

In [None]:
# Number of respondents per Q24 salary bracket, as a frame with the columns
# 'Q24' (bracket) and 'count'.
# Naming the index and the value column explicitly gives the same layout on
# pandas < 2.0 and >= 2.0; the old rename map produced two 'count' columns
# (and no 'Q24') on pandas >= 2.0. `value_counts()` already ignores NaN.
df_1 = df['Q24'].value_counts().rename_axis('Q24').reset_index(name='count')

In [None]:
# Normalise the salary bracket labels to 'low-high', compute each bracket's
# midpoint, and order the brackets from lowest to highest midpoint.
bracket_text = df_1['Q24'].str.replace(',', '', regex=False)
bracket_text = bracket_text.replace({'> $500000': '500000-500000',
                                     '$0-999': '0-999'})
df_1['Q24'] = bracket_text

bracket_bounds = bracket_text.str.split('-', expand=True)
df_1['Q24_avg'] = (bracket_bounds[0].astype(int) + bracket_bounds[1].astype(int)) / 2
df_1 = df_1.sort_values('Q24_avg')

In [None]:
# Bar chart: number of respondents in each salary bracket.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(df_1.Q24, df_1['count'])
ax.set_xlabel('Salary', fontsize=16)
ax.set_ylabel('Number of respondents', fontsize=16)
ax.set_title('Count(Respondents) by Salary Group', fontsize=20)
# Tilt the bracket labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

In [None]:
def groupby_multiple(df, q_num, questions=None):
    """Summarise salary for respondents who ticked each option of a question.

    For every column belonging to question ``q_num`` the rows with a
    non-missing value in that column are selected and summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the question's columns and a numeric 'Q24_avg' column.
    q_num : int or str
        Position of the question in the column lookup (converted with ``int``).
    questions : list of list of str, optional
        Column lookup indexed by question number. Defaults to the notebook's
        global ``Q``.

    Returns
    -------
    pandas.DataFrame
        One row per column with 'Question' (column name), 'mean_salary'
        (mean of 'Q24_avg'; NaN when nobody answered), 'n_respondents' and
        'element' (most frequent value in the column; None when nobody
        answered).
    """
    columns = (Q if questions is None else questions)[int(q_num)]

    avg_salary = []
    n_respondent = []
    element = []
    for column in columns:
        answered = df[df[column].notna()]
        avg_salary.append(answered['Q24_avg'].mean())
        n_respondent.append(len(answered))
        # A column nobody answered has no most-frequent value; indexing [0]
        # on the empty counts would raise IndexError.
        counts = answered[column].value_counts()
        element.append(counts.index[0] if len(counts) else None)

    return pd.DataFrame({'Question': columns, 'mean_salary': avg_salary,
                         'n_respondents': n_respondent, 'element': element})

In [None]:
## null value pick
def compare_none(df, q_num, questions=None):
    """Summarise salary for respondents who did NOT tick each option.

    For every column belonging to question ``q_num`` the rows with a missing
    value in that column are selected and summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the question's columns and a numeric 'Q24_avg' column.
    q_num : int or str
        Position of the question in the column lookup (converted with ``int``).
    questions : list of list of str, optional
        Column lookup indexed by question number. Defaults to the notebook's
        global ``Q``.

    Returns
    -------
    pandas.DataFrame
        One row per column with 'Question' (column name), 'mean_salary'
        (mean of 'Q24_avg' over rows where the column is missing),
        'n_respondents' (number of such rows) and 'non_element' (the most
        frequent non-missing value of the column, i.e. the option those rows
        did not pick; None when the column has no answers at all).
    """
    columns = (Q if questions is None else questions)[int(q_num)]

    avg_salary = []
    n_respondent = []
    element = []
    for column in columns:
        unanswered = df[df[column].isna()]
        avg_salary.append(unanswered['Q24_avg'].mean())
        n_respondent.append(len(unanswered))
        # Derive the option label from the frame that was passed in, not from
        # the global `df_emp`, so the function works for any input frame.
        counts = df[column].value_counts()
        element.append(counts.index[0] if len(counts) else None)

    return pd.DataFrame({'Question': columns, 'mean_salary': avg_salary,
                         'n_respondents': n_respondent, 'non_element': element})

## Salary by programming language learned

In [None]:
# Grouped bar chart per Q7 option: mean salary of employed respondents who
# selected the option ('encode') next to those who left it blank ('uncode').
selected_summary = groupby_multiple(df_emp,7)  # computed once; also supplies the labels
blank_summary = compare_none(df_emp,7)

labels = selected_summary.element.tolist()
encode_means = list(np.around(selected_summary.mean_salary.to_numpy(),2))
uncode_means = list(np.around(blank_summary.mean_salary.to_numpy(),2))

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
fig.set_size_inches(24,10)
# Shift the two series half a bar width left/right so they sit side by side.
rects1 = ax.bar(x - width/2, encode_means, width, label='encode')
rects2 = ax.bar(x + width/2, uncode_means, width, label='uncode')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Salary',fontsize=20)
ax.set_xlabel('Language',fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(labels,fontsize=18,rotation=45,ha='right')
ax.legend(fontsize=20)

plt.show()

The most surprising finding is that, although the majority of people start data analysis with Python or R,
there is no evidence that learning Python or R leads to a higher salary.
Some languages that are rarely used in data analysis, such as C and Java, may even take up time and be associated with a lower salary.

## ML methods 

In [None]:
# Mean salary per Q15 answer (years of machine-learning experience), shown in
# order of increasing experience.
ml_experience_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]
df_15 = (df_emp.groupby('Q15')[['Q24_avg']]
         .mean()
         .reindex(index=ml_experience_order))

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df_15.index, df_15['Q24_avg'])
ax.set_title('Experienced ML Kagglers are paid higher')
ax.set_ylabel('Average Salary')
plt.setp(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

In [None]:
# Popularity vs. mean salary for each Q16 option, restricted to respondents
# who did not answer 'I do not use machine learning methods' in Q15.
uses_ml = df_emp['Q15'] != 'I do not use machine learning methods'
df_ml = df_emp[uses_ml]

df_16 = groupby_multiple(df_ml, 16).sort_values(by='n_respondents')
df_16 = df_16[df_16.element != 'Other']
labels = df_16['element'].tolist()

fig, ax = plt.subplots()
fig.set_size_inches(13, 9)
ax.plot(df_16.n_respondents, df_16.mean_salary, marker='o')
# Write each option's name just above its point.
for name, n_users, salary in zip(labels, df_16.n_respondents, df_16.mean_salary):
    ax.annotate(name, (n_users, salary), textcoords='offset points',
                xytext=(0, 10), ha='center', fontsize=12)
ax.set_xlabel('Popularity(Number of respondents)', fontsize=12)
ax.set_ylabel('Average Salary', fontsize=12)
ax.set_title('Popularity of ML framework & Salary', fontsize=14)

In [None]:
# Treemap of the Q23 options: tile area is the number of employed respondents
# who selected the option; tile colour encodes that option's share of all
# employed respondents. (`squarify` is imported in the first cell.)
Q_23 = groupby_multiple(df_emp,23)
Q_23['percentage'] = Q_23['n_respondents']/len(df_emp)
size = Q_23['n_respondents']
norm = matplotlib.colors.Normalize(vmin=min(Q_23.percentage), vmax=max(Q_23.percentage))
colors = [matplotlib.cm.Blues(norm(value)) for value in Q_23.percentage]

# Hand-wrapped captions, one per Q23 column.
# NOTE(review): assumes the Q23 columns come in exactly this order -- confirm
# against Q_23.element.
captions = ['Analyze and understand data to \n influence product or business decisions',
            'Build and/or run the data infrastructure \n that my business uses for storing, \n analyzing, and operationalizing data',
            'Build prototypes to \n explore applying machine \n learning to new areas',
            'Build and/or run a \n machine learning service \n that operationally improves \n my product or workflows',
            'Experimentation and iteration \n to improve existing ML models',
            'Do research that advances the \n state of the art of machine learning',
            'None',
            'Other']
assert len(captions) == len(Q_23), 'one caption is needed per Q23 column'

# The shares are taken from the data instead of being typed in by hand, so
# they cannot drift from what the tiles show. 'None' and 'Other' carry no
# percentage, as before.
labels = [caption if caption in ('None', 'Other')
          else '{} \n {:.2%}'.format(caption, share)
          for caption, share in zip(captions, Q_23.percentage)]

plt.figure(figsize=(22,10))
squarify.plot(size,label=labels,text_kwargs={'fontsize':18,'weight':'bold'},color=colors)
plt.title('Most important tasks in daily work \n',fontsize=25, weight = 'bold')
plt.axis('off')
plt.show()


# Larger companies and larger teams pay more

In [None]:
# Convert each Q21 bracket to its numeric lower bound: the text before the
# '-' is kept, with '20+' treated as 20 and '0' as 0.
# The replacement works on `df_emp`'s own column (the old mask was built from
# `df`), and values that are not strings (missing answers, or integers left
# by an earlier run of this cell) pass through unchanged, so the cell can be
# re-run safely.
q21_brackets = df_emp['Q21'].replace({'20+': '20-25', '0': '0-1'})
df_emp['Q21'] = q21_brackets.apply(
    lambda value: int(value.split('-')[0]) if isinstance(value, str) else value)

In [None]:
# Display order for the (company size, team size) groups: company sizes from
# smallest to largest, and within each company size the team-size lower
# bounds in ascending order.
# The team sizes are integers because Q21 is converted to the integer lower
# bound of each bracket before grouping; the former string keys '0-1' and
# '20-25' never matched those integers, so reindexing left the first and
# last point of every line empty.
company_sizes = ['0-49 employees',
                 '50-249 employees',
                 '250-999 employees',
                 '1000-9,999 employees',
                 '10,000 or more employees']
team_sizes = [0, 1, 3, 5, 10, 15, 20]

index_order = [(company, team) for company in company_sizes for team in team_sizes]

In [None]:
# Mean salary for every (Q20, Q21) combination, arranged in the order given
# by `index_order`; combinations without data become NaN rows.
salary_by_company_team = df_emp.groupby(['Q20', 'Q21'])[['Q24_avg']].mean()
df_company = salary_by_company_team.reindex(index_order)

In [None]:
# One line per Q20 value: mean salary against the Q21 team-size lower bound.
# `df_company` is read in consecutive blocks of seven rows, one block per Q20
# value; later blocks are drawn with a more opaque blue.
alpha_list = [0.1, 0.2, 0.4, 0.7, 1]
team_size_axis = [0, 1, 3, 5, 10, 15, 20]
n_company_sizes = df_emp['Q20'].nunique()

fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
label_list = []
for group_no in range(n_company_sizes):
    group_rows = df_company.iloc[7 * group_no: 7 * group_no + 7]
    label_list.append(group_rows.index[0][0])
    ax.plot(team_size_axis, group_rows.iloc[:, 0].tolist(), 'b',
            alpha=alpha_list[group_no])
ax.set_xlabel('Colleague number')
ax.set_ylabel('Salary')
ax.legend(labels=label_list)
ax.set_title('Bigger organization, higher salary', fontsize=20)
plt.show()

## Popular learning platforms and reading source

### learning platforms

In [None]:
# Bar chart: number of employed respondents who selected each Q37 option.
# The tick labels are written out by hand, one per Q37 column.
# NOTE(review): assumes the Q37 columns come in exactly this order -- confirm.
labels = [
    'Coursera', 'edX', 'Kaggle Learn Courses', 'DataCamp', 'Fast.ai',
    'Udacity', 'Udemy', 'LinkedIn Learning', 'Cloud-certification programs',
    'University Courses', 'None', 'Other',
]
values = groupby_multiple(df_emp, 37)['n_respondents'].tolist()
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x, values, width)
ax.set_ylabel('Respondents number', fontsize=18)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=12, rotation=40, ha='right')
ax.set_title('Popularity for learning platforms', fontsize=20)
plt.show()

Coursera, Kaggle Learn Courses, and Udemy are the three most popular course platforms.

### reading source

In [None]:
# Bar chart: number of employed respondents who selected each Q39 option.
# The tick labels are written out by hand, one per Q39 column.
# NOTE(review): assumes the Q39 columns come in exactly this order -- confirm.
labels = [
    'Twitter', "Email newsletters", 'Reddit', 'Kaggle', 'Course Forums',
    'YouTube', 'Podcasts', 'Blogs', 'Journal Publications',
    'Slack Communities', 'None', 'Other',
]
values = groupby_multiple(df_emp, 39)['n_respondents'].tolist()
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x, values, width)
ax.set_ylabel('Respondents number', fontsize=18)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=12, rotation=45, ha='right')
ax.set_title('Popularity for reading source', fontsize=20)
plt.show()

Kaggle, YouTube, and blogs are the three most popular media sources.