In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Kaggle Survey Global Results

Many factors determine who we are in life, some of which are environmental/genetic and can never be changed, while others are within our power to determine for ourselves. What I've attempted to analyze in the Kaggle Survey is the relationships between gender, age, country, company, online academies, compensation, degree, and multiple data science tools to see how these factors affect each other. Some of these factors are unchangeable (country) but others are based on choice (online academies); my questions primarily focus on the relationship between gender and degree with other features.

The approach I used to analyze the Kaggle survey focused on combining up to 3 features, which helped answer more complex questions. My main goal is to prove the obvious using data and to uncover trends or relationships that are not so obvious.

My analysis is focused on: 
1. Age and gender distribution 
2. Relationship between programming languages used and degrees obtained 
3. Relationship between programming languages used and gender 
4. Distribution of countries Kagglers are located in and their gender
5. Relationship between Visualization tools used and the degree Kagglers have  obtained
6. Relationship between Kagglers assigned role at work, the degree they obtained, and their gender
7. Relationship between Online academies Kagglers attended and what ML models Kagglers use on a regular basis
8. The gender distribution across companies of different sizes
9. The gender distribution across companies of different sizes based on the degree they hold
10. Relationship between Degree obtained and the automation tool the Kaggler uses.
11. Relationship between Degree obtained, the number of years coding the Kaggler has been coding, and their compensation.
12. Gender distribution across salary ranges and the experience a Kaggler has in coding.

Thank you in advance for taking your time to read this!

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw 2020 survey responses.
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv', low_memory = False)
# Silence pandas' chained-assignment warning for the rest of the notebook
# (several later cells assign into frames that were sliced out of `df`).
pd.options.mode.chained_assignment = None
df.head()

Here are the first rows of the raw data from the survey.

The first thing I'm going to do is check out the age distribution for all Kagglers.

In [None]:
# The row labelled 0 is split off into `df_questions` (a one-row frame with the
# same columns as `df`; it is used below for axis labels) and removed from `df`.
# The guard keeps the cell re-runnable: once label 0 has been removed, a second
# run leaves both frames untouched instead of raising a KeyError or capturing a
# real response as the "questions" row.
if 0 in df.index:
    df_questions = df.loc[[0]]
    df = df.drop(index=0)

# Share of respondents in each age bracket.
df['Q1'].value_counts()/df['Q1'].value_counts().sum()

In [None]:
# Respondent count per age bracket; `order` fixes the brackets left-to-right from youngest to oldest.
plt.figure(figsize=(15,8), dpi = 100)
ax = sns.countplot(x = 'Q1', data = df, color = 'red', order = ['18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-69','70+'])
# Label the x-axis with the Q1 entry stored in df_questions.
ax.set(xlabel = df_questions['Q1'][0]);


In [None]:
#About 56% of respondents are people within the age of 18-29 indicating that younger people are the majority in this survey. The Countplot indicates the data is skewed to the right meaning that the older someone is, the less likely they are to be a Kaggler.

I'm going to check out the gender distribution based on age as well.

In [None]:
# Fraction of respondents giving each Q2 (gender) answer.
df['Q2'].value_counts()/df['Q2'].count()

In [None]:
#A majority of people who responded to the survey are men. An important note to keep in mind is that the percentage of anyone who is not male were sometimes too small for me to make an accurate conclusion.

In [None]:
# Same age-bracket countplot as before, now split by the Q2 (gender) answer via `hue`.
plt.figure(figsize=(15,10), dpi = 150)
ax = sns.countplot(x = 'Q1', data = df, color = 'red', hue = 'Q2', order = ['18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-69','70+'], palette = 'Spectral')
ax.set(xlabel = df_questions['Q1'][0]);

In [None]:
# One indicator column per gender answer next to the age bracket, so the mean
# within an age bracket is the share of that bracket giving each answer.
ag = df[['Q1', 'Q2']].rename(columns={'Q1': 'Age', 'Q2': 'Gender'})
ages = pd.get_dummies(ag['Gender'])
ag = pd.concat([ag[['Age']], ages], axis=1)

gender_share_by_age = ag.groupby('Age').mean()
gender_share_by_age.transpose().round(3)

In [None]:
#The age distribution for women is also skewed to the right, as shown by the countplot. Most female Kagglers, however, seem to be aged 22 - 24, while men are mainly aged 25 - 29. As the age of Kagglers increases, the percentage of males also increases, indicating a positive correlation between age and percentage of males. 

Here I analyze which programming languages people mainly use on Kaggle.

In [None]:
# One column per Q7 answer option (programming language used regularly).
df_program = df[['Q7_Part_1', 'Q7_Part_2', 'Q7_Part_3', 'Q7_Part_4', 'Q7_Part_5', 'Q7_Part_6', 'Q7_Part_7', 'Q7_Part_8', 'Q7_Part_9', 'Q7_Part_10', 'Q7_Part_11', 'Q7_Part_12', 'Q7_OTHER']]
# Unselected options are missing; mark them with the string '0' (converted to 0/1 flags later).
df_program = df_program.fillna('0')

plt.figure(figsize = (15,5))
# Every column is drawn onto the same axes with the same category order.
# NOTE(review): presumably each column only ever holds its own language name, so
# each pass contributes a single non-empty bar - confirm against the data.
for x in df_program.columns:
    sns.countplot(data = df_program, x=x, order=['Python', 'R', 'SQL', 'C', 'C++' ,'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other'])

In [None]:
#As expected most Kagglers use Python by far followed by SQL and R. These are primarily the programming languages that people use for data science.

Here I'm going to explore the relationship between programming languages used and the degree a Kaggler has obtained. First I'm going to break down the degrees by count.

In [None]:
def education(ed):
    """Shorten the 'Some college/university ...' answer to a compact label.

    Any answer containing 'Some' becomes "College without Bachelor's degree";
    every other answer is returned as its string form.

    Missing answers (None / NaN) are returned unchanged. Converting them with
    str() would produce the literal text 'nan', which pandas no longer treats
    as missing, so later dropna()/groupby() calls would count it as a degree.
    """
    # `ed != ed` is true only for NaN, so no pandas/numpy call is needed here.
    if ed is None or ed != ed:
        return ed
    label = str(ed)
    if 'Some' in label:
        return "College without Bachelor's degree"
    return label
# Normalise the degree answers in place with education().
df['Q4'] = df['Q4'].apply(education)


# Replace the Q7_* column codes with the language names, in the same order as the columns were selected.
df_program.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']
def numbers(stringer):
    """Return 0 when the text contains the character '0', otherwise 1."""
    return 0 if '0' in stringer else 1

# Turn every language column into a 0/1 flag (the '0' filler becomes 0, a language name becomes 1).
for x in df_program.columns:
    df_program[x] = np.vectorize(numbers)(df_program[x])
    
# Attach the (normalised) degree answer and tabulate how many respondents hold each degree.
df_program['degree'] = df['Q4'].copy()
degrees = df_program['degree'].value_counts()
# Side-by-side table: raw count and share of all counted answers.
dd = pd.concat([degrees, degrees/degrees.sum()], axis = 1)
dd.columns = ['Count', 'Percentage']
dd

In [None]:
 #Nearly 75% of Kagglers have Masters or Bachelors degrees.

In [None]:
# Number of respondents per degree who selected each language.
grouped_programs = df_program.groupby(by='degree').sum()
grouped_programs

In [None]:
#Amount of Kagglers who have used these programming languages based on their degree

In [None]:
# Mean of each 0/1 language flag within a degree group, i.e. the share of that group selecting the language.
df_program.groupby('degree').mean().round(3)

In [None]:
#Mean values of Kagglers who have used these programming languages based on their degree

In [None]:
# Heatmap of the per-degree language shares computed from the 0/1 flags.
plt.figure(figsize = (12,8), dpi = 150)
ax = sns.heatmap(df_program.groupby('degree').mean().round(3), annot=True, robust = True)
ax.set(xlabel = 'Programming Language used on a Regular Basis');

In [None]:
#The above heatmap and table of means show, for each degree, the share of respondents who use each programming language. Respondents could select several languages, so a row does not add up to 100%.

#Seems like no matter what degree a Kaggler has, Python is still the most used programming language. Nearly 70% of respondents regardless of degree have used Python 

#This heatmap also shows that the likelihood of Kagglers using R increases with a higher University degree, R was also highly in usage among those with Professional degrees

#Many Kagglers with Doctoral degrees tend to use MATLAB and Bash more than any other degree, a bit of an outlier compared to other percentages for MATLAB and Bash

So Python, SQL, and R are clearly the most famous programming languages on Kaggle, but does the use of these programming languages differ based on gender?

In [None]:
# Same 0/1 language flags, keyed by the gender answer instead of the degree.
df_genders = df_program.drop('degree', axis = 1).copy() 
df_genders['gender'] = df['Q2']

# Number of respondents per gender answer who selected each language.
df_genders.groupby('gender').sum()

In [None]:
#Mean of Kagglers who use these programming languages based on their gender

In [None]:
# Share of each gender group selecting each language (mean of the 0/1 flags).
plt.figure(figsize = (12,10), dpi = 100)
sns.heatmap(df_genders.groupby('gender').mean(), cmap = 'plasma', robust = True, annot = True);

In [None]:
#Regardless of gender, Python is still the most widely used programming language, followed by SQL and R. The breakdown between the 3 programming languages for men and women is nearly identical.

Here, I'm going to analyze the number of Kagglers by country and gender.

In [None]:
# Country (Q3) and gender (Q2) of every respondent; the gender column is used in later cells.
df_countries = df[['Q3', 'Q2']].rename(columns={'Q3': 'country', 'Q2': 'gender'})

df_countries['country'].value_counts()

In [None]:
#A significantly high amount of people from India and U.S. but also 'Other'.

In [None]:
# One indicator column per gender answer, appended to the country frame.
# NOTE(review): the name `genders` is reused for a function defined in a later cell.
genders = pd.get_dummies(df_countries['gender'])
df_countries = pd.concat([df_countries, genders], axis = 1)

# Respondents per country and gender answer.
country_genders = df_countries.drop('gender', axis = 1).groupby('country').sum().copy()

# Keep countries above the 80th percentile of country counts for men, women or nonbinary respondents.
country_genders[(country_genders['Man'] > country_genders['Man'].quantile(0.8)) | (country_genders['Woman'] > country_genders['Woman'].quantile(0.8)) | (country_genders['Nonbinary'] > country_genders['Nonbinary'].quantile(0.8))] 

In [None]:
#Most women seem to be from India, Other, and the US, same as men. Most people who are nonbinary are from the US and Other.

In [None]:
# Share of each country's respondents giving each gender answer (mean of the indicator columns).
df_countries.drop('gender', axis = 1).groupby('country').mean()

In [None]:
# Five countries with the highest share of respondents answering 'Woman'.
df_countries.drop('gender', axis = 1).groupby('country').mean()['Woman'].nlargest(5)

In [None]:
#Countries with the largest percentage of Women respondents

In [None]:
# Five countries with the lowest share of respondents answering 'Woman'.
df_countries.drop('gender', axis = 1).groupby('country').mean()['Woman'].nsmallest(5)

In [None]:
#Countries with the lowest percentage of Women respondents

In [None]:
#This is the mean of genders in every country (Adding the row will give you 100%)

#Surprisingly, many industrialized countries such as the Republic of Korea, Russia, Spain, Japan, Italy, Australia, Brazil, Belgium, France, and Germany have a far lower percentage of female Kagglers, averaging around 14%

#Countries such as the USA, United Arab Emirates, Turkey, Thailand, Taiwan, Saudi Arabia, Portugal, and India each have around 20% female Kagglers

#By far the lowest percentage of female Kagglers were in Japan, Republic of Korea, Chile, Greece, and Italy averaging about 8%

#Meanwhile emerging countries such as Malaysia, Tunisia, Iran, and Sri Lanka each have higher than 30% Kagglers that are female

In [None]:
# Respondent counts by gender answer for a hand-picked set of countries.
plt.figure(figsize=(19,14), dpi = 200)
sns.countplot(data = df_countries, x = 'country', hue = 'gender', order = ['Malaysia', 'Tunisia', 'United States of America', 'United Arab Emirates','Turkey', 'Japan', 'Republic of Korea', 'Italy']);

> Here we explore the relationship between degree obtained and the frequency of usage of visualization libraries.

In [None]:
# Degree (Q4) next to every Q14 column (visualization libraries used).
visualizes = [col for col in df.columns if 'Q14' in col]
df_visualization = pd.concat([df['Q4'], df[visualizes]], axis=1)

grouped_visualizations = df_visualization.copy()
grouped_visualizations.columns = ['Degree','Matplotlib', 'Seaborn', 'Plotly', 'Ggplot', 'Shiny', 'D3 js', 'Altair', 'Bokeh', 'Geoplotlib', 'Leaflet / Folium', 'None', 'Other']

# Non-missing answers per degree and tool, then each row scaled by its own total.
tool_counts = grouped_visualizations.groupby('Degree').count()
tool_counts.div(tool_counts.sum(axis=1), axis=0).round(3)

In [None]:
#Percentage of Kagglers that use the visualization tools based on the degree they have (rows add to 1).

In [None]:
# Label unanswered tool columns explicitly so non-users show up as their own bar.
# The previous `drop(...).fillna(..., inplace=True)` filled a temporary copy and
# left `df_visualization` untouched; assigning the filled columns back fixes that.
tool_columns = df_visualization.columns.drop('Q4')
df_visualization[tool_columns] = df_visualization[tool_columns].fillna('Not_Used')

# One count chart per tool for the first five tool columns, split by degree.
# The y-axis is capped at 5000 for every chart.
for x in df_visualization.columns[1:6]:
    ax = sns.catplot(data = df_visualization, x = x,  kind = 'count', hue = 'Q4', height = 3, aspect = 2)
    ax.set(ylim = (0,5000));


In [None]:
#Based on the above charts, usage of the various visualization libraries seems to be consistent across all respondents regardless of the degree they obtained, with the most popular being Matplotlib and Seaborn.

Now I'm going to compare how the degree Kagglers have and their gender affect the role they play at work.

In [None]:
# Degree (Q4), the Q23 work-activity columns and gender (Q2) for every respondent.
df_roles = df[['Q4', 'Q23_Part_1', 'Q23_Part_2', 'Q23_Part_3', 'Q23_Part_4', 'Q23_Part_5', 'Q23_Part_6', 'Q23_Part_7', 'Q23_OTHER', 'Q2']]

#Going to shorten gender by changing it to male, female, other so the data looks better
def genders(gender):
    """Keep answers containing 'Man' or 'Woman'; collapse every other answer to 'Other'."""
    keeps_label = ('Man' in gender) or ('Woman' in gender)
    return gender if keeps_label else 'Other'
# Collapse the gender answers to Man / Woman / Other.
df_roles['Q2'] = df_roles['Q2'].apply(genders)

# Readable column names, in the same order as the columns were selected.
df_roles.columns = ['Degree', 'Analyze and Understand data', 'Build or Run data infrastructure', 'Apply Machine learning to new areas', 'Machine learning to improve product', 'Experimentation to improve esxisting ML models', 'Research to advance ML', 'None', 'Other', 'Gender']
# Non-missing answers per (Degree, Gender) for every activity column.
questions = df_roles.groupby(by = ['Degree', 'Gender']).count()

In [None]:
# Percentage of all men who selected each activity, broken down by degree.
# The denominator is counted from the data instead of being hard-coded, so the
# table stays correct if the input file or the filtering above changes.
men_total = (df_roles['Gender'] == 'Man').sum()
round(questions.xs('Man', level = 1, axis = 0)/men_total, 4)*100

In [None]:
#Men's tasks based on degree

In [None]:
# Percentage of all women who selected each activity, broken down by degree.
# The denominator is counted from the data instead of being hard-coded, so the
# table stays correct if the input file or the filtering above changes.
women_total = (df_roles['Gender'] == 'Woman').sum()
round(questions.xs('Woman', level = 1, axis = 0)/women_total, 4)*100

In [None]:
#Women's task based on degree

In [None]:
#Tables show the percentage of all Men/Women who have been asked to perform each task, based on their degree

#Above table shows percentage of men/women who have been assigned to perform one or more types of tasks based on the degree they obtained. It’s clear that the percentage of women assigned to perform any of the tasks was constantly lower than that of men’s.

#One interesting observation is the percentage of kagglers, albeit being fairly small, who had no formal education past high school, yet were asked to work on advanced tasks such as research to advance ML.

Now I will show how the online academies people attend affect which ML models Kagglers use on a regular basis.

In [None]:
# Ten Q17 columns (ML algorithms used) followed by five selected Q37 columns (course platforms).
course_algo = df[['Q17_Part_1', 'Q17_Part_2', 'Q17_Part_3', 'Q17_Part_4', 'Q17_Part_5', 'Q17_Part_6', 'Q17_Part_7', 'Q17_Part_8', 'Q17_Part_9', 'Q17_Part_10', 'Q37_Part_1', 'Q37_Part_3', 'Q37_Part_4', 'Q37_Part_7', 'Q37_Part_10']]
# There are many different courses and algorithms, so only the more famous platforms
# (Coursera, Kaggle, etc.) are kept; everything else is later labelled 'Other'.


# Readable names, in the same order as the columns were selected.
course_algo.columns = ['Regression', 'Trees_Forests', 'Boosting_GBM', 'Bayesian', 'Evolutionary', 'DenseNeural', 'ConvolutionNeural', 'Generative_Adversarial', 'Recurrent_Neural', 'Transformer_Networks', 'Coursera', 'Kaggle', 'DataCamp', 'Udemy', 'University']

def nulls(value1, value2, value3, value4, value5):
    """Return 1.0 when all five flags are true, otherwise NaN.

    The flags are expected to be booleans (the caller passes `isnull()` results).

    Both outcomes are floats on purpose: np.vectorize infers its output dtype
    from the first result, so mixing an int 1 with a float NaN could fail with
    "cannot convert float NaN to integer" depending on which row comes first.
    `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    """
    if all((value1, value2, value3, value4, value5)):
        return 1.0
    return np.nan

# Flag respondents for whom all five selected platform columns are missing.
course_algo['Other_Course'] = np.vectorize(nulls)(course_algo['Coursera'].isnull(), course_algo['Kaggle'].isnull(), course_algo['DataCamp'].isnull(), course_algo['Udemy'].isnull(), course_algo['University'].isnull())


def others(other):
    """Translate the flag value 1 into the label 'Other'; return anything else unchanged."""
    return 'Other' if other == 1 else other

# Replace the numeric flag with the text label 'Other'; missing values stay missing.
course_algo['Other_Course'] = course_algo['Other_Course'].apply(others)

def shortener(word):
    """Return 'University Courses' for any value whose text contains 'University'; otherwise the value itself."""
    return 'University Courses' if 'University' in str(word) else word
# Shorten the long university answer text to 'University Courses'.
course_algo['University'] = course_algo['University'].apply(shortener)

In [None]:
# Zero-filled copy of the course/algorithm table; All_Courses treats 0 as "not selected".
dt = course_algo.fillna(0)
def All_Courses(value1, value2, value3, value4, value5, value6):
    """Join every argument that is not 0 into one comma-separated string.

    Arguments keep their given order and are converted with str(). When every
    argument is 0 the result is the empty string.
    """
    selected = [
        str(value)
        for value in (value1, value2, value3, value4, value5, value6)
        if value != 0
    ]
    return ', '.join(selected)

# Build one combined label per respondent from the six platform columns of the zero-filled frame.
# Note: the lambda's parameter is also called `dt`; inside the lambda it is a single row.
course_algo['Courses'] = dt[['Coursera', 'Kaggle', 'DataCamp', 'Udemy', 'University', 'Other_Course']].apply(lambda dt: All_Courses(dt['Coursera'], dt['Kaggle'], dt['DataCamp'], dt['Udemy'], dt['University'], dt['Other_Course']), axis = 1 )

# How often each combination of platforms occurs.
course_algo['Courses'].value_counts()

In [None]:
#There seems to be a lot of people that took multiple online academies such as 'Coursera and kaggle' or 'Coursera and Udemy'. 

#Coursera definitely seems to be one of the most popular platforms, being taken both on its own and together with many other courses.

In [None]:
# Non-missing answers per algorithm for every course combination; transposed so algorithms are rows.
Course_and_algo = course_algo.groupby('Courses')[['Regression', 'Trees_Forests', 'Boosting_GBM', 'Bayesian', 'Evolutionary', 'DenseNeural', 'ConvolutionNeural', 'Generative_Adversarial', 'Recurrent_Neural', 'Transformer_Networks']].count().transpose()
# Scale each course combination by its own total, then flip back so combinations are rows.
Course_and_algo.apply(lambda x: round(x/x.sum(), 3)).transpose().head()

In [None]:
#In this table, every row adds up to 1 (or 100%)

In [None]:
# Heatmap of the same normalised table: rows are course combinations, columns are algorithms.
plt.figure(dpi = 200, figsize=(9,15))
ax = sns.heatmap(Course_and_algo.apply(lambda x: round(x/x.sum(), 3)).transpose(), annot = True)
ax.set(xlabel = 'ML_Algorithms used on a regular basis');

In [None]:
#Linear Regression seems to be in the range of 20% - 30% for ML algorithms used on a regular basis which makes sense because almost every course will teach Linear Regression first and using the model is very simple

#Trees and Forests are used a bit less than Linear Regression but the range is still in between 19% to 26.3% so it is still widely used and doesn't differ much by which courses Kagglers have done

#All of these ML Algorithms barely deviate in use based on which course you took.

So now I want to see whether the size of a company affects the gender distribution of the people it hires.

In [None]:
# Gender (Q2), company size (Q20) and degree (Q4); rows with a missing value in any of the three are dropped.
company_gender = df[['Q2', 'Q20', 'Q4']].dropna()
company_gender.columns = ['Gender', 'Size_Of_Company', 'Degree']

# Swap the company-size column for one indicator column per size band.
size = pd.get_dummies(company_gender['Size_Of_Company'])
company_gender = pd.concat([company_gender[['Gender', 'Degree']], size], axis=1)

# Respondents per gender answer and company-size band.
company_gender.drop(columns='Degree').groupby('Gender').sum()

In [None]:
# Share of each gender group working in each company-size band.
# NOTE(review): the name `cg` is reassigned to a different table in a later cell.
cg = company_gender.drop('Degree', axis = 1).groupby('Gender').mean().round(3)
cg

In [None]:
# Heatmap of the per-gender company-size shares.
plt.figure(dpi = 100, figsize = (8,5))
ax = sns.heatmap(data = cg, annot = True, cmap="YlGnBu")
ax.set(xlabel = 'Amount of Employees at Company');

In [None]:
#Due to Kagglers who put themselves in "Prefer to self-describe" and "Nonbinary" having such a low amount of people who are in companies, I cannot accurately say anything about their numbers

#For "Men", "Women", and people who "Prefer not to say" it seems like companies almost equally have people based on their gender. No company size really leans more towards a specific gender

#There is an exception for 'Prefer not to say', 30% of them work with 10,000 or more employees which is significantly higher than Males or Females.

Let me now add on the degrees people have along with their gender and size of the company to see if there is any relation between the gender and degree Kagglers have compared to the size of their company

In [None]:
# Restrict to the 'Man' and 'Woman' answers so the table stays readable.
is_man_or_woman = company_gender['Gender'].isin(['Man', 'Woman'])
company_gender = company_gender[is_man_or_woman]
company_gender.groupby(['Gender', 'Degree']).sum()

In [None]:
# Share of each (Gender, Degree) group working in each company-size band.
cgd = company_gender.groupby(['Gender', 'Degree']).mean()
cgd

In [None]:
# Heatmap of the (Gender, Degree) by company-size shares.
plt.figure(dpi = 300, figsize=(14,10))
ax = sns.heatmap(cgd, annot = True, robust = True, cmap = 'viridis')
ax.set(xlabel = 'Amount of Employees at Company');

In [None]:
#Seems that Woman and Men with their respective degrees have close to the same percentage of companies hiring them based on company sizes. However, Men were hired a bit more with bigger companies while women were hired a bit more into smaller companies.

#Another interesting observation is that Kagglers with Doctoral degrees actually tend not to work at companies with 10,000 or more employees than those with Bachelors and Masters no matter the gender.

How do Kagglers degree affect which automated ML tool they use?

In [None]:
# Degree (Q4) next to the Q34_A columns (automated ML tools used).
tool_deg = df[['Q4', 'Q34_A_Part_1', 'Q34_A_Part_2','Q34_A_Part_3','Q34_A_Part_4','Q34_A_Part_5','Q34_A_Part_6','Q34_A_Part_7','Q34_A_Part_8','Q34_A_Part_9','Q34_A_Part_10','Q34_A_Part_11','Q34_A_OTHER',]]

# Readable names, in the same order as the columns were selected.
tool_deg.columns = ['Degree', 'Google Cloud', 'H20 Driverless AI', 'DataBricks AutoML', 'DataRobots AutoML', 'Tpot', 'Auto-Keras', 'Auto-Sklearn', 'Auto_ml', 'Xcessiv', 'MLbox', 'None', 'Other']

# Non-missing answers per degree and tool.
tool_deg.groupby('Degree').count()

In [None]:
# Per-degree counts scaled so that each degree's row sums to 1.
tg = tool_deg.groupby('Degree').count().transpose().apply(lambda x: round(x/x.sum(), 3)).transpose()
tg

In [None]:
# Heatmap of the per-degree automated-ML-tool shares.
plt.figure(dpi = 100, figsize = (13,10))
ax = sns.heatmap(tg, annot=True, robust=True)
ax.set(xlabel = 'Automated ML Tool');

In [None]:
#Kagglers mainly use Auto-Sklearn, Auto_Keras, or no automated ML tool at all


#Kagglers with Professional degrees and without degrees also tend to use Auto_ML

How does a Kagglers compensation compare to the amount of years they've been coding?

In [None]:
# Gender (Q2), degree (Q4), years of coding (Q6) and compensation (Q24).
comp = df[['Q2','Q4', 'Q6', 'Q24']]
comp = comp.dropna() #Going to have to remove the null values in the compensation column
comp.columns = ['Gender', 'Degree', 'Years_Coding', 'Compensation']

# How many respondents fall into each raw compensation band.
comp['Compensation'].value_counts() 

In [None]:
#Way too many values, I need to narrow them down a bit

In [None]:
def compensation(count):
    """Merge a fine-grained compensation band into a wider one.

    The band is chosen from the numeric lower bound of the label (the text
    before the first '-'), not by substring search. Substring search sent
    '15,000-19,999' and '25,000-29,999' to '5,000-9,999', because both labels
    contain the text '5,000'.

    Labels whose lower bound is below 1,000 or at/above 300,000, and labels that
    do not start with a number (e.g. '> $500,000'), are returned unchanged.
    """
    bands = (
        (1_000, 4_999, '1,000-4,999'),
        (5_000, 9_999, '5,000-9,999'),
        (10_000, 29_999, '10,000-29,999'),
        (30_000, 49_999, '30,000-49,999'),
        (50_000, 69_999, '50,000-69,999'),
        (70_000, 99_999, '70,000-99,999'),
        (100_000, 199_999, '100,000-199,999'),
        (200_000, 299_999, '200,000-299,999'),
    )
    # '$0-999' -> '0', '15,000-19,999' -> '15000', '> $500,000' -> '> 500000'
    lower_text = str(count).split('-')[0].replace('$', '').replace(',', '').strip()
    if not lower_text.isdigit():
        return count
    lower_bound = int(lower_text)
    for band_min, band_max, label in bands:
        if band_min <= lower_bound <= band_max:
            return label
    return count

# Replace every raw compensation band with its merged band.
comp['Compensation'] = np.vectorize(compensation)(comp['Compensation'])
        

In [None]:
#Also going to narrow down the genders and years of coding a bit for easier graphing capabilities
def Genders(gender):
    """Return the answer unchanged when it contains 'Man' or 'Woman'; otherwise return 'Other'."""
    if any(label in gender for label in ('Man', 'Woman')):
        return gender
    return 'Other'
# Collapse the gender answers to Man / Woman / Other.
comp['Gender'] = np.vectorize(Genders)(comp['Gender'])

def Years(year):
    """Fold any answer containing 'I' or '<' into '0-1 years'; return other answers unchanged."""
    for marker in ('I', '<'):
        if marker in year:
            return '0-1 years'
    return year
# Merge the two lowest experience answers into '0-1 years'.
comp['Years_Coding'] = np.vectorize(Years)(comp['Years_Coding'])

# Respondent count per merged compensation band, split by years of coding.
# `order` and `hue_order` fix both axes from lowest to highest.
plt.figure(figsize=(16, 12), dpi = 250)
plt.xticks(rotation = 40)
sns.countplot(data = comp, x = 'Compensation', order =['$0-999', '1,000-4,999', '5,000-9,999', '10,000-29,999', '30,000-49,999', '50,000-69,999', '70,000-99,999', '100,000-199,999', '200,000-299,999', '300,000-500,000', '> $500,000'], hue = 'Years_Coding', hue_order=['0-1 years', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']);

In [None]:
#The countplot is skewed to the right which makes sense considering a lot of people using Kaggle are still in school and therefore are not making much money because they don't have a job.

#Most data in the graph is expected such as 0-1 years of coding experience have the highest count in making $0-999, 3-5 years of coding experience has the highest count in making $5000-9999 and 20+ years of coding experience have the highest count in making $100,000 and above. 

#Other parts of the data, however, don't make as much sense, some people with barely any coding experience are making more than $100,000 (could be due to them having other jobs and just doing coding on the side) and some people with over 10+ years of coding are making almost no money at all (maybe because they're still in school?)

What is the correlation between the amount of years a Kaggler has been coding and the degree they have?

In [None]:
# Snapshot of the cleaned frame (Gender, Degree, Years_Coding, Compensation) for
# the compensation analysis further down, taken before indicator columns are
# added to `comp`. The former `comp.groupby('Degree').sum()` was removed: its
# result was never displayed or stored, and every column in `comp` is text here.
cyg = comp.copy()

In [None]:
# One indicator column per years-of-coding band.
Year = pd.get_dummies(comp['Years_Coding'])
# Drop any indicator columns left over from an earlier run before re-attaching
# them, so re-running the cell does not duplicate columns.
comp = pd.concat([comp.drop(columns=Year.columns, errors='ignore'), Year], axis = 1)

# Share of each degree group in each experience band. Only the indicator columns
# are averaged: `comp` still holds text columns, and pandas >= 2.0 raises when
# asked for the mean of text columns instead of silently skipping them.
comp.groupby('Degree')[list(Year.columns)].mean().round(3)

In [None]:
# Heatmap of the share of each degree group in each years-of-coding band.
# Only the indicator columns are averaged; `comp` also holds text columns, and
# pandas >= 2.0 raises when asked for the mean of text columns.
plt.figure(figsize = (13,10), dpi =  150)
sns.heatmap(data = comp.groupby('Degree')[list(Year.columns)].mean(), annot = True);

In [None]:
#A surprising 21% of Bachelors degrees, 16% of Masters degrees, and 23% of Professional degrees only have 0-1 Years of Coding experience. 

#Along with this, 24% of Bachelors degrees, 17% of Masters degrees, and 18% of Professional degrees only have 1-2 Years of Coding experience which is still highly unusual for the degree that they have.

#These results can be explained by Kagglers who are just doing coding on the side and have actually gotten a degree in a different field unrelated to coding.

#18.9% of people with no formal education past highschool and 13.8% of people doing college/University without earning a bachelors degree have 20+ years of coding experience which is clearly unreasonable

Lastly, I am going to compare compensation against the degree, gender, and years of coding experience a Kaggler has.

In [None]:
def compensation(count):
    """Collapse the merged compensation bands into coarser, '$'-prefixed bands.

    NOTE: this redefines the `compensation` function from an earlier cell and
    expects that function's output labels as input.

    Labels that already start with '$' are returned as they are when no coarser
    band applies, so applying the function twice gives the same result as
    applying it once (previously a second pass produced '$$1,000-4,999').
    """
    if ('30,000' in count) or ('10,000' in count):
        return '$10,000-49,999'
    if ('50,000' in count) or ('70,000' in count):
        return '$50,000-99,999'
    if ('100,000' in count) or ('200,000' in count):
        return '$100,000-299,999'
    if ('300,000' in count) or ('>' in count):
        return '> $300,000'
    if ('0-999' in count) or count.startswith('$'):
        return count
    return '$' + count

# Apply the coarser banding to the snapshot `cyg` only; `comp` keeps its existing bands.
cyg['Compensation'] = np.vectorize(compensation)(cyg['Compensation'])
        

In [None]:
def Degrees(degree):
    """Collapse every "no completed degree / no answer" response into 'No_Degree'.

    An answer is collapsed when it contains 'Some', 'I' or 'No', or when it is
    the shortened label "College without Bachelor's degree" that `education`
    substitutes for the 'Some college/university ...' answer. Without the last
    marker the 'Some' test never matches the already-shortened label, and those
    respondents stay in a separate group.

    All other answers are returned unchanged.
    """
    no_degree_markers = ('Some', 'College without', 'I', 'No')
    if any(marker in degree for marker in no_degree_markers):
        return 'No_Degree'
    return degree
# Collapse the non-degree answers in the snapshot frame.
cyg['Degree'] = np.vectorize(Degrees)(cyg['Degree']) 

# Swap the compensation column for one indicator column per compensation band.
compensations = pd.get_dummies(cyg['Compensation'])
cyg = pd.concat([cyg.drop('Compensation', axis = 1), compensations], axis = 1)

# Respondents per (Degree, Years_Coding) in each compensation band. Gender is
# still a text column here; it is dropped so sum() only adds the indicators
# (pandas >= 2.0 would otherwise concatenate the gender strings per group).
cyg.drop(columns='Gender').groupby(['Degree', 'Years_Coding']).sum()

In [None]:
# Share of each (Degree, Years_Coding) group in each compensation band.
# The text column Gender is dropped first: pandas >= 2.0 raises when asked for
# the mean of a text column instead of silently skipping it.
dc = cyg.drop(columns='Gender').groupby(['Degree', 'Years_Coding']).mean()
dc

In [None]:
# Heatmap of `dc`: rows are (Degree, Years_Coding) groups, columns are the
# compensation bands, so the x-axis is labelled 'Compensation' (it was
# previously labelled 'Years of Coding', which is the row dimension).
plt.figure(figsize = (12,14), dpi = 200)
ax = sns.heatmap(dc, annot = True, robust=True)
ax.set(xlabel = 'Compensation');
# Move the column labels to the top edge and hide the tick marks.
plt.tick_params(axis='both', which='major', labelsize=8, labelbottom = False, bottom=False, top = False, labeltop=True);

In [None]:
#This heatmap shows the degree Kagglers have along with how much money they are making and how many years they have been coding
#for. Adding up the rows gives you 100%

#The heatmap clearly shows that the higher degree and more years of experience with coding, the more money a Kaggler will earn. 

#Some observations I have come up with are:

#Kagglers with Doctoral degrees have barely anyone with 0-1 or 1-2 years of coding experience making over $300,000, however, Kagglers with Masters and Bachelors still have multiple people making $300,000 with 0-1 or 1-2 years of coding experience

#Nearly 57% of Kagglers with Bachelors degrees and 0-1 years of coding experience earn less than $5000, 46% of Kagglers with a Masters degree and 0-1 years of coding experience earn less than $5000, and 52% of Kagglers with a Doctoral degree and 0-1 years of coding experience earn less than $5000. These are fairly surprising numbers, which could be due to a fairly low cost of living in some countries, but it's still concerning that people with such high degrees, in a field that is in such high demand, would earn that little money. In fact, Kagglers with Professional degrees and 0-1 years of coding experience have the lowest percentage of all the degrees, with 45% earning between $0 and $5000.


In [None]:
# Respondents per (Gender, Years_Coding) in each compensation band. The text
# column Degree is dropped so sum() only adds the indicator columns
# (pandas >= 2.0 would otherwise concatenate the degree strings per group).
cyg.drop(columns='Degree').groupby(['Gender', 'Years_Coding']).sum()

In [None]:
# Share of each (Gender, Years_Coding) group in each compensation band.
# The text column Degree is dropped first: pandas >= 2.0 raises when asked for
# the mean of a text column instead of silently skipping it.
# Note: this reassigns `cg`, which held the company-size table in an earlier cell.
cg = cyg.drop(columns='Degree').groupby(['Gender', 'Years_Coding']).mean()
cg

In [None]:
# Heatmap of `cg`: rows are (Gender, Years_Coding) groups, columns are compensation bands.
plt.figure(figsize=(10,12), dpi = 150)
ax = sns.heatmap(cg, annot = True, robust = True, cmap = 'viridis')
ax.set(xlabel = 'Compensation');
# Move the column labels to the top edge and hide the tick marks.
plt.tick_params(axis='both', which='major', labelsize=7, labelbottom = False, bottom=False, top = False, labeltop=True);

In [None]:
#Again adding every row will lead you to 100%

#In all genders, however, there are still some people who have only coded for a few years but are earning 6-figure salaries, so I am assuming (as with the degrees) that they actually have other jobs and are just doing coding on the side, or that they did not answer truthfully

#For Woman with 20+ years of coding experience, 31% of them make less than $5000 where on the other hand only 14% of Men with more than 20+ years of experience make less than $5000

#As the salary range increases especially for those between 50k - 100k and those between 100k - 300k, the percentage of woman making these compensations is 1/3 less than that of men
