![Kaggle v. Stack Overflow](https://image.ibb.co/cgK0x0/kagglevstackoverflow.jpg)
# Comparing the Kaggle and Stack Overflow communities
The purpose of this notebook is to find insightful comparisons between the Kaggle and Stack Overflow communities, based on their annual survey datasets.

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir("../input"))

In [None]:
# Kaggle survey responses. The first data row is sliced off
# (presumably the question-text row of the export -- TODO confirm).
kaggle = pd.read_csv(
    '../input/kaggle-survey-2018/multipleChoiceResponses.csv'
).iloc[1:]

# Stack Overflow developer survey responses, loaded as-is.
soflow = pd.read_csv(
    '../input/stack-overflow-2018-developer-survey/survey_results_public.csv'
)

## 1. How is **gender diversity** different?

In [None]:
# Number of Kaggle respondents per Q1 answer, plus each answer's share
# of all non-missing Q1 answers.
gender_kaggle = kaggle.groupby('Q1')['Q1'].count()
gender_kaggle_df = pd.DataFrame({'numeric': gender_kaggle})
gender_kaggle_df['perc'] = gender_kaggle_df['numeric'].div(gender_kaggle_df['numeric'].sum())
gender_kaggle_df

In [None]:
# Number of Stack Overflow respondents per 'Gender' answer, plus each
# answer's share of all non-missing 'Gender' answers.
gender_soflow = soflow.groupby('Gender')['Gender'].count()
gender_soflow_df = pd.DataFrame({'numeric': gender_soflow})
gender_soflow_df['perc'] = gender_soflow_df['numeric'].div(gender_soflow_df['numeric'].sum())
gender_soflow_df

In [None]:
# Share of each community that answered 'Male', 'Female', or anything else.
# Every answer other than the two named ones is pooled into 'Others'.
main_genders = ['Male', 'Female']
gender = pd.DataFrame(index=main_genders + ['Others'],
                      columns=['Kaggle', 'Stack Overflow'],
                      dtype=float)

for site, counts in (('Kaggle', gender_kaggle_df['numeric']),
                     ('Stack Overflow', gender_soflow_df['numeric'])):
    total = counts.sum()
    # Write with a single .loc[row, column] call. The chained form
    # frame[column].loc[row] = value may assign into a temporary object
    # and leave the frame untouched (always so under copy-on-write).
    for g in main_genders:
        gender.loc[g, site] = counts.loc[g] / total
    gender.loc['Others', site] = (total - counts.loc[main_genders].sum()) / total

gender

In [None]:
# Grouped bar chart of the gender shares: one bar per community for each row of `gender`.
plt.rcParams["figure.figsize"] = [10, 6]
width = 0.35  # bar width; the two bars of a group sit either side of the tick
ind = np.arange(len(gender))  # one tick position per row

fig, ax = plt.subplots()
rects1, rects2 = [
    ax.bar(ind + shift, gender[site], width, color=colour, label=site)
    for site, shift, colour in (('Kaggle', -width / 2, '#00C0FF'),
                                ('Stack Overflow', width / 2, '#FF7900'))
]
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend.
ax.set_title('Gender comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_ylabel('Percentage', fontsize=14)
ax.set_ylim([0, 1.05])
ax.set_yticklabels([])  # blank the y tick text
ax.set_xticks(ind)
ax.set_xticklabels(gender.index, fontsize=14)
ax.legend(fontsize=14)

def autolabel(rects, xpos='center', fontsize='medium', ax=None):
    """Write each bar's height, scaled by 100 and shown to one decimal, just above it.

    Parameters
    ----------
    rects : iterable
        Bar objects exposing ``get_height()``, ``get_x()`` and ``get_width()``
        (for example the container returned by ``ax.bar``).
    xpos : {'center', 'left', 'right'}
        Horizontal placement of the text relative to the bar; case-insensitive.
    fontsize : str or number
        Passed through to ``ax.text``.
    ax : object with a ``text`` method, optional
        Axes to draw on. When omitted, the current pyplot axes is used, so
        existing calls made right after ``plt.subplots()`` keep working.

    Raises
    ------
    KeyError
        If ``xpos`` is not one of the three supported values.
    """
    if ax is None:
        # Resolve the target explicitly instead of relying on whatever a
        # module-level variable named `ax` currently points at.
        ax = plt.gca()

    xpos = xpos.lower()  # normalize the case of the parameter
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for rect in rects:
        height = rect.get_height()
        # The text is anchored 1% above the top of the bar.
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '{0:.1f}'.format(height * 100), ha=ha[xpos], va='bottom', fontsize=fontsize)

        
# Annotate both bar groups with their values, then render the figure.
for bars in (rects1, rects2):
    autolabel(bars)

plt.show()

### Proportionally, there are over **2.5x more women** in the **Kaggle** community than in **Stack Overflow**'s!

## 2. How does **age** compare?

In [None]:
# Number of Kaggle respondents per Q2 answer, plus each answer's share
# of all non-missing Q2 answers.
age_kaggle = kaggle.groupby('Q2')['Q2'].count()
age_kaggle_df = pd.DataFrame({'numeric': age_kaggle})
age_kaggle_df['perc'] = age_kaggle_df['numeric'].div(age_kaggle_df['numeric'].sum())
age_kaggle_df

In [None]:
# Number of Stack Overflow respondents per 'Age' answer, plus each
# answer's share of all non-missing 'Age' answers.
age_soflow = soflow.groupby('Age')['Age'].count()
age_soflow_df = pd.DataFrame({'numeric': age_soflow})
age_soflow_df['perc'] = age_soflow_df['numeric'].div(age_soflow_df['numeric'].sum())
age_soflow_df

In [None]:
# Bring both surveys onto five shared age buckets and express each bucket
# as a share of the bucketed respondents of its community.
age = pd.DataFrame(columns=['kaggle_num', 'soflow_num'],
                   index=['18 - 24', '25 - 34', '35 - 44', '45 - 54', '55 or older'])

# All writes below use a single .loc[row, column] call. The chained form
# frame.loc[row][column] = value may assign into a temporary row object
# and leave the frame untouched (always so under copy-on-write).

# Stack Overflow: the labels are paired POSITIONALLY with the buckets, in the
# order of the groupby result index (groupby sorts its keys by default).
# Only the first five labels are consumed by zip; the pairing for the last
# bucket is overwritten right after with the explicit sum of two groups.
for i1, i2 in zip(age_soflow_df.index, age.index):
    age.loc[i2, 'soflow_num'] = age_soflow_df.loc[i1, 'numeric']

age.loc['55 or older', 'soflow_num'] = age_soflow_df.loc[
    ['55 - 64 years old', '65 years or older'], 'numeric'
].sum()

# Kaggle: each shared bucket is the sum of the finer Q2 brackets listed for it.
kaggle_brackets = {
    '18 - 24': ['18-21', '22-24'],
    '25 - 34': ['25-29', '30-34'],
    '35 - 44': ['35-39', '40-44'],
    '45 - 54': ['45-49', '50-54'],
    '55 or older': ['55-59', '60-69', '70-79', '80+'],
}
for bucket, brackets in kaggle_brackets.items():
    age.loc[bucket, 'kaggle_num'] = age_kaggle_df.loc[brackets, 'numeric'].sum()

# The frame was created empty (object dtype); make it numeric before dividing.
age = age.astype(float)
age['Kaggle'] = age['kaggle_num'] / age['kaggle_num'].sum()
age['Stack Overflow'] = age['soflow_num'] / age['soflow_num'].sum()
age = age.drop(columns=['kaggle_num', 'soflow_num'])
age

In [None]:
# Grouped bar chart of the age shares: one bar per community for each row of `age`.
plt.rcParams["figure.figsize"] = [10, 6]
width = 0.35  # bar width; the two bars of a group sit either side of the tick
ind = np.arange(len(age))  # one tick position per row

fig, ax = plt.subplots()
rects1, rects2 = [
    ax.bar(ind + shift, age[site], width, color=colour, label=site)
    for site, shift, colour in (('Kaggle', -width / 2, '#00C0FF'),
                                ('Stack Overflow', width / 2, '#FF7900'))
]
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend.
ax.set_title('Age comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_ylabel('Percentage', fontsize=14)
ax.set_ylim([0, 0.6])
ax.set_yticklabels([])  # blank the y tick text
ax.set_xticks(ind)
ax.set_xticklabels(age.index, fontsize=14)
ax.legend(fontsize=14)

# Value labels above every bar.
for bars in (rects1, rects2):
    autolabel(bars)

plt.show()

### Kaggle community is **younger** than Stack Overflow!

## 3. How do the **top countries** compare?

In [None]:
# Kaggle respondents per Q3 answer with their share of all answers, ordered
# from most to least frequent. The two non-country answers are removed and
# two long country names are shortened.
country_kaggle = kaggle.groupby('Q3')['Q3'].count()
country_kaggle_df = pd.DataFrame({'numeric': country_kaggle})
country_kaggle_df['perc'] = country_kaggle_df['numeric'].div(country_kaggle_df['numeric'].sum())
country_kaggle_df = (
    country_kaggle_df
    .sort_values(by='numeric', ascending=False)
    .drop(index=['I do not wish to disclose my location', 'Other'])
    .rename(index={
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    })
)
#country_kaggle_df

In [None]:
# Stack Overflow respondents per 'Country' answer with their share of all
# answers, ordered from most to least frequent; one country name is shortened.
country_soflow = soflow.groupby('Country')['Country'].count()
country_soflow_df = pd.DataFrame({'numeric': country_soflow})
country_soflow_df['perc'] = country_soflow_df['numeric'].div(country_soflow_df['numeric'].sum())
country_soflow_df = (
    country_soflow_df
    .sort_values(by='numeric', ascending=False)
    .rename(index={'Russian Federation': 'Russia'})
)
#country_soflow_df

In [None]:
# Countries to compare: the 25 most frequent in the Kaggle survey, followed by
# any of Stack Overflow's 25 most frequent that are not already listed.
top_kaggle = country_kaggle_df.index[:25].tolist()
top_soflow = country_soflow_df.index[:25].tolist()
union = top_kaggle + [name for name in top_soflow if name not in top_kaggle]

kaggle_counts = country_kaggle_df['numeric']
soflow_counts = country_soflow_df['numeric']

# Select all labels in one step instead of filling an empty (object-dtype)
# frame row by row. .loc with a list still raises KeyError when a country
# is missing from either survey, so a spelling mismatch is not hidden.
# Shares are relative to all respondents kept in each per-country table.
country = pd.DataFrame(
    {
        'Kaggle': kaggle_counts.loc[union].astype(float).to_numpy() / kaggle_counts.sum(),
        'Stack Overflow': soflow_counts.loc[union].astype(float).to_numpy() / soflow_counts.sum(),
    },
    index=union,
)

# The Kaggle top 25 come first in `union`, so the first 20 rows are the
# 20 most frequent Kaggle countries.
country = country.iloc[:20]
country

In [None]:
# Grouped bar chart of the country shares: one bar per community for each row of `country`.
plt.rcParams["figure.figsize"] = [20, 6]
width = 0.35  # bar width; the two bars of a group sit either side of the tick
ind = np.arange(len(country))  # one tick position per row

fig, ax = plt.subplots()
rects1, rects2 = [
    ax.bar(ind + shift, country[site], width, color=colour, label=site)
    for site, shift, colour in (('Kaggle', -width / 2, '#00C0FF'),
                                ('Stack Overflow', width / 2, '#FF7900'))
]
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend; the y-limits are left to matplotlib.
ax.set_title('Top countries\' comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_ylabel('Percentage', fontsize=14)
ax.set_yticklabels([])  # blank the y tick text
ax.set_xticks(ind)
ax.set_xticklabels(country.index, fontsize=14, rotation = 45, ha="right")
ax.legend(fontsize=14)

# Value labels above every bar, in a smaller font because there are many bars.
for bars in (rects1, rects2):
    autolabel(bars, fontsize='x-small')

plt.show()

### There are interesting findings here:
- Kaggle's community is proportionally **overdeveloped** vs. Stack Overflow's in **India**, **China**, **Russia**, **Japan**, **Spain**, **Turkey**, **Nigeria** and **South Korea**;
- Kaggle's community is proportionally **underdeveloped** vs. Stack Overflow's in **Germany**, **United Kingdom**, **Australia**, **Poland** and **Netherlands**.

## 4. How does **education** compare?

In [None]:
# Kaggle respondents per Q4 answer with their share of all answers, ordered
# from most to least frequent. The share is computed before the
# 'I prefer not to answer' row is removed.
education_kaggle = kaggle.groupby('Q4')['Q4'].count()
education_kaggle_df = pd.DataFrame({'numeric': education_kaggle})
education_kaggle_df['perc'] = education_kaggle_df['numeric'].div(education_kaggle_df['numeric'].sum())
education_kaggle_df = (
    education_kaggle_df
    .sort_values(by='numeric', ascending=False)
    .drop(index=['I prefer not to answer'])
)

education_kaggle_df

In [None]:
# Stack Overflow respondents per 'FormalEducation' answer with their share
# of all answers, ordered from most to least frequent.
education_soflow = soflow.groupby('FormalEducation')['FormalEducation'].count()
education_soflow_df = pd.DataFrame({'numeric': education_soflow})
education_soflow_df['perc'] = education_soflow_df['numeric'].div(education_soflow_df['numeric'].sum())

# New row labels for six of the Stack Overflow answers.
education_renames = {
    'Bachelor’s degree (BA, BS, B.Eng., etc.)': 'Bachelor’s degree',
    'Master’s degree (MA, MS, M.Eng., MBA, etc.)': 'Master’s degree',
    'Some college/university study without earning a degree': 'Some college/university study without earning a bachelor’s degree',
    'Professional degree (JD, MD, etc.)': 'Professional degree',
    'Other doctoral degree (Ph.D, Ed.D., etc.)': 'Doctoral degree',
    'I never completed any formal education': 'No formal education past high school',
}
education_soflow_df = (
    education_soflow_df
    .sort_values(by='numeric', ascending=False)
    .rename(index=education_renames)
)

# Rows that are added onto another row (both columns) and then removed.
secondary_school = 'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)'
folded_rows = {
    'Bachelor’s degree': ['Associate degree'],
    'No formal education past high school': ['Primary/elementary school', secondary_school],
}
for target, extras in folded_rows.items():
    merged = education_soflow_df.loc[target]
    for extra in extras:
        merged = merged + education_soflow_df.loc[extra]
    education_soflow_df.loc[target] = merged

education_soflow_df = education_soflow_df.drop(
    index=['Associate degree', 'Primary/elementary school', secondary_school]
)

education_soflow_df

In [None]:
# Share of each education level in both communities, using the Kaggle table's
# row labels (and order) as the reference.
education_labels = education_kaggle_df.index
kaggle_counts = education_kaggle_df['numeric']
soflow_counts = education_soflow_df['numeric']

# Select all labels in one step instead of filling an empty (object-dtype)
# frame row by row. .loc with the label index still raises KeyError when a
# Kaggle label is missing from the Stack Overflow table.
education = pd.DataFrame(
    {
        'Kaggle': kaggle_counts.loc[education_labels].astype(float).to_numpy() / kaggle_counts.sum(),
        'Stack Overflow': soflow_counts.loc[education_labels].astype(float).to_numpy() / soflow_counts.sum(),
    },
    index=education_labels,
)
education

In [None]:
# Grouped bar chart of the education shares: one bar per community for each row of `education`.
plt.rcParams["figure.figsize"] = [18, 8]
ind = np.arange(education.shape[0])  # the x locations for the groups
width = 0.35  # the width of the bars

# Line-wrapped tick text, keyed by the category it belongs to. Looking the
# text up per row keeps every bar correctly labelled even if the row order
# of `education` changes; a category without an entry keeps its own name.
wrapped_labels = {
    'Master’s degree': 'Master\'s \ndegree',
    'Bachelor’s degree': 'Bachelor\'s \ndegree',
    'Doctoral degree': 'Doctoral \ndegree',
    'Some college/university study without earning a bachelor’s degree': 'Some college/university \nstudy without earning a \nbachelor’s degree',
    'Professional degree': 'Professional \ndegree',
    'No formal education past high school': 'No formal education \npast high school',
}

fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, education['Kaggle'], width, yerr=None, color='#00C0FF', label='Kaggle')
rects2 = ax.bar(ind + width/2, education['Stack Overflow'], width, yerr=None, color='#FF7900', label='Stack Overflow')
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend.
ax.set_ylabel('Percentage', fontsize=14)
ax.set_title('Formal Education comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_xticks(ind)
ax.set_xticklabels([wrapped_labels.get(name, name) for name in education.index],
                   fontsize=14, rotation = 0, ha="center")
ax.legend(fontsize=14)
ax.set_ylim([0,0.6])
ax.set_yticklabels([])  # blank the y tick text

# Value labels above every bar.
autolabel(rects1)
autolabel(rects2)

plt.show()

### Kaggle community has significantly more **formal education** than Stack Overflow's!

## 5. How does **undergrad major** compare?

In [None]:
# Kaggle respondents per Q5 answer with their share of all answers.
undergrad_kaggle = kaggle.groupby('Q5')['Q5'].count()
undergrad_kaggle_df = pd.DataFrame({'numeric': undergrad_kaggle})
undergrad_kaggle_df['perc'] = undergrad_kaggle_df['numeric'].div(undergrad_kaggle_df['numeric'].sum())

# 'Physics or astronomy' is relabelled to the combined science category, and
# two further science rows are added onto it (both columns) and then removed.
natural_sciences = 'Medical or natural sciences (ex. biology, chemistry, physics, astronomy)'
folded_majors = ['Medical or life sciences (biology, chemistry, medicine, etc.)',
                 'Environmental science or geology']

undergrad_kaggle_df = undergrad_kaggle_df.rename(index={'Physics or astronomy': natural_sciences})
merged = undergrad_kaggle_df.loc[natural_sciences]
for major in folded_majors:
    merged = merged + undergrad_kaggle_df.loc[major]
undergrad_kaggle_df.loc[natural_sciences] = merged

# Rows are ordered from most to least frequent only after the merge.
undergrad_kaggle_df = (
    undergrad_kaggle_df
    .drop(index=folded_majors)
    .sort_values(by='numeric', ascending=False)
)

undergrad_kaggle_df

In [None]:
# Stack Overflow respondents per 'UndergradMajor' answer with their share of all answers.
undergrad_soflow = soflow.groupby('UndergradMajor')['UndergradMajor'].count()
undergrad_soflow_df = pd.DataFrame({'numeric': undergrad_soflow})
undergrad_soflow_df['perc'] = undergrad_soflow_df['numeric'].div(undergrad_soflow_df['numeric'].sum())

# New row labels for nine of the Stack Overflow answers.
natural_sciences = 'Medical or natural sciences (ex. biology, chemistry, physics, astronomy)'
health_science = 'A health science (ex. nursing, pharmacy, radiology)'
major_renames = {
    'Computer science, computer engineering, or software engineering': 'Computer science (software engineering, etc.)',
    'Another engineering discipline (ex. civil, electrical, mechanical)': 'Engineering (non-computer focused)',
    'Information systems, information technology, or system administration': 'Information technology, networking, or system administration',
    'A natural science (ex. biology, chemistry, physics)': natural_sciences,
    'A business discipline (ex. accounting, finance, marketing)': 'A business discipline (accounting, economics, finance, etc.)',
    'A humanities discipline (ex. literature, history, philosophy)': 'Humanities (history, literature, philosophy, etc.)',
    'A social science (ex. anthropology, psychology, political science)': 'Social sciences (anthropology, psychology, sociology, etc.)',
    'Fine arts or performing arts (ex. graphic design, music, studio art)': 'Fine arts or performing arts',
    'Web development or web design': 'Other',
}
undergrad_soflow_df = undergrad_soflow_df.rename(index=major_renames)

# The health-science row is added onto the science category (both columns)
# and then removed; rows are ordered from most to least frequent afterwards.
merged = undergrad_soflow_df.loc[natural_sciences] + undergrad_soflow_df.loc[health_science]
undergrad_soflow_df.loc[natural_sciences] = merged
undergrad_soflow_df = (
    undergrad_soflow_df
    .drop(index=[health_science])
    .sort_values(by='numeric', ascending=False)
)

undergrad_soflow_df

In [None]:
# Share of each undergrad major in both communities, using the Kaggle table's
# row labels (and order) as the reference.
major_labels = undergrad_kaggle_df.index
kaggle_counts = undergrad_kaggle_df['numeric']
soflow_counts = undergrad_soflow_df['numeric']

# Select all labels in one step instead of filling an empty (object-dtype)
# frame row by row. .loc with the label index still raises KeyError when a
# Kaggle label is missing from the Stack Overflow table.
undergrad = pd.DataFrame(
    {
        'Kaggle': kaggle_counts.loc[major_labels].astype(float).to_numpy() / kaggle_counts.sum(),
        'Stack Overflow': soflow_counts.loc[major_labels].astype(float).to_numpy() / soflow_counts.sum(),
    },
    index=major_labels,
)
undergrad

In [None]:
# Grouped bar chart of the undergrad-major shares: one bar per community for each row of `undergrad`.
plt.rcParams["figure.figsize"] = [20, 6]
ind = np.arange(undergrad.shape[0])  # the x locations for the groups
width = 0.35  # the width of the bars

# Line-wrapped tick text, keyed by the category it belongs to. Looking the
# text up per row keeps every bar correctly labelled even if the row order
# of `undergrad` changes; a category without an entry keeps its own name.
wrapped_labels = {
    'Computer science (software engineering, etc.)': 'Computer science \n(software engineering, \netc.)',
    'Engineering (non-computer focused)': 'Engineering \n(non-computer focused)',
    'Medical or natural sciences (ex. biology, chemistry, physics, astronomy)': 'Medical or natural sciences \n(ex. biology, chemistry, \nphysics, astronomy)',
    'A business discipline (accounting, economics, finance, etc.)': 'A business discipline \n(accounting, economics, \nfinance, etc.)',
    'Information technology, networking, or system administration': 'Information technology, \nnetworking, or system \nadministration',
    'Social sciences (anthropology, psychology, sociology, etc.)': 'Social sciences \n(anthropology, psychology, \nsociology, etc.)',
    'Humanities (history, literature, philosophy, etc.)': 'Humanities \n(history, literature, \nphilosophy, etc.)',
    'Fine arts or performing arts': 'Fine arts or \nperforming arts',
}

fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, undergrad['Kaggle'], width, yerr=None, color='#00C0FF', label='Kaggle')
rects2 = ax.bar(ind + width/2, undergrad['Stack Overflow'], width, yerr=None, color='#FF7900', label='Stack Overflow')
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend; the y-limits are left to matplotlib.
ax.set_ylabel('Percentage', fontsize=14)
ax.set_title('Top undergrad major\'s comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_xticks(ind)
ax.set_xticklabels([wrapped_labels.get(name, name) for name in undergrad.index],
                   fontsize=14, rotation = 45, ha="right")
ax.legend(fontsize=14)
ax.set_yticklabels([])  # blank the y tick text

# Value labels above every bar.
autolabel(rects1)
autolabel(rects2)

plt.show()

### There are interesting findings here:
- Kaggle's community is proportionally **overdeveloped** vs. Stack Overflow's in **Engineering (non-computer focused)**, **Math or statistics**, **Medical or natural sciences**, **Social sciences**, and in **Business related** undergrad majors;
- Kaggle's community is proportionally **underdeveloped** vs. Stack Overflow's in **Computer science**, **Information technology**, **Humanities**, and in **Fine or performing arts** undergrad majors.

## 6. How does **compensation** compare?

In [None]:
# Kaggle respondents per Q9 answer. The non-disclosure answer is removed
# first, so 'perc' is each remaining answer's share of the disclosed answers.
compensation_kaggle = kaggle.groupby('Q9')['Q9'].count()
compensation_kaggle_df = pd.DataFrame({'numeric': compensation_kaggle}).drop(
    index=['I do not wish to disclose my approximate yearly compensation']
)
compensation_kaggle_df['perc'] = compensation_kaggle_df['numeric'].div(compensation_kaggle_df['numeric'].sum())

compensation_kaggle_df

In [None]:
# Positive 'ConvertedSalary' values only, re-indexed from 0.
# Missing values need no special handling: NaN > 0 evaluates to False, so the
# comparison already excludes them. Filtering directly also leaves the raw
# `soflow` frame unmodified (an in-place fillna on a column selection is a
# chained operation that may act on a temporary copy).
compensation_soflow_df = soflow.loc[soflow['ConvertedSalary'] > 0, 'ConvertedSalary'].reset_index(drop=True)

# Quick look at the distribution before bucketing it.
plt.figure()
hist_ax = compensation_soflow_df.plot(kind='hist', bins=100)
hist_ax.set_title('Stack Overflow: distribution of positive ConvertedSalary values')
hist_ax.set_xlabel('ConvertedSalary')
plt.show()

In [None]:
# Count the Stack Overflow salaries per bracket, using the same bracket labels
# that the compensation comparison looks up in the Kaggle table.
bracket_labels = ['0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000',
                  '50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000',
                  '100-125,000', '125-150,000', '150-200,000', '200-250,000',
                  '250-300,000', '300-400,000', '400-500,000', '500,000+']

# Upper bound of every bracket except the open-ended last one.
upper_bounds = [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000,
                125000, 150000, 200000, 250000, 300000, 400000, 500000]

# pd.cut intervals are right-closed by default, i.e. lower < x <= upper, so a
# salary exactly on a boundary falls into the lower bracket. The first bracket
# has no lower bound and the last one no upper bound.
bin_edges = [-np.inf] + upper_bounds + [np.inf]
salary_bracket = pd.cut(compensation_soflow_df, bins=bin_edges, labels=bracket_labels)

# Count per label on a plain object series so the result has an ordinary
# string index; brackets with no salaries get an explicit 0.
bracket_counts = (
    salary_bracket
    .astype(object)
    .value_counts()
    .reindex(bracket_labels, fill_value=0)
)

compensation_soflow_df_2 = pd.DataFrame({'numeric': bracket_counts.to_numpy()}, index=bracket_labels)
compensation_soflow_df_2['perc'] = compensation_soflow_df_2['numeric'] / compensation_soflow_df_2['numeric'].sum()
compensation_soflow_df_2

In [None]:
# Share of each compensation bracket in both communities, using the bracket
# labels (and order) of the Stack Overflow table as the reference.
bracket_index = compensation_soflow_df_2.index
kaggle_counts = compensation_kaggle_df['numeric']
soflow_counts = compensation_soflow_df_2['numeric']

# Select all labels in one step instead of filling an empty (object-dtype)
# frame row by row. .loc with the label index still raises KeyError when a
# bracket is missing from the Kaggle table. The float cast guarantees numeric
# columns whatever the dtype of the source tables.
compensation = pd.DataFrame(
    {
        'Kaggle': kaggle_counts.loc[bracket_index].astype(float).to_numpy() / kaggle_counts.sum(),
        'Stack Overflow': soflow_counts.loc[bracket_index].astype(float).to_numpy() / soflow_counts.sum(),
    },
    index=bracket_index,
)
compensation

In [None]:
# Grouped bar chart of the compensation shares: one bar per community for each row of `compensation`.
plt.rcParams["figure.figsize"] = [20, 6]
width = 0.35  # bar width; the two bars of a group sit either side of the tick
ind = np.arange(len(compensation))  # one tick position per row

fig, ax = plt.subplots()
rects1, rects2 = [
    ax.bar(ind + shift, compensation[site], width, color=colour, label=site)
    for site, shift, colour in (('Kaggle', -width / 2, '#00C0FF'),
                                ('Stack Overflow', width / 2, '#FF7900'))
]
ax.patch.set_visible(False)  # hide the axes background patch

# Title, axis text and legend; the y-limits are left to matplotlib.
ax.set_title('Compensation comparison: Kaggle v. Stack Overflow', fontsize=20)
ax.set_ylabel('Percentage', fontsize=14)
ax.set_yticklabels([])  # blank the y tick text
ax.set_xticks(ind)
ax.set_xticklabels(compensation.index, fontsize=14, rotation = 45, ha="right")
ax.legend(fontsize=14)

# Value labels above every bar, in a smaller font because there are many bars.
for bars in (rects1, rects2):
    autolabel(bars, fontsize='x-small')

plt.show()

### Despite having higher formal education, Kaggle community's **compensation** is lower than Stack Overflow's!

# Upvote it if you like it! :)