In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import os

In [None]:
# Print the full path of every file available under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Allow up to 500 rows to be displayed before pandas truncates the output
pd.options.display.max_rows = 500

In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type DtypeWarning on wide survey files.
df = pd.read_csv(
    '/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    low_memory=False,
)

In [None]:
# Drop the first row and keep every column
# (presumably that row holds the question text rather than a response - verify against the CSV).
# NOTE: this cell is not idempotent; re-running it drops one more row each time.
df = df.iloc[1:]

In [None]:
# we're interested in age, degree, experience, role, country, and annual salary
columns = ['Q1', 'Q3', 'Q4', 'Q5', 'Q6', 'Q25']
# Take an explicit copy: the frame is derived from a slice and is mutated in later cells
# (df.Age = ..., df.Degree = ...), which would otherwise risk SettingWithCopyWarning.
df = df[columns].copy()

# rename the columns to something easier to read
df.columns = ['Age', 'Country', 'Degree', 'Role', 'Experience', 'Salary']

In [None]:
# let's create some  functions that creates some plots for categorical vars
def plot_hist(data, cat_var, title, ordinal=False, order=None, orientation='v', xtick_angle = None):
    """Show a percentage histogram of a categorical column with plotly express.

    Parameters
    ----------
    data : DataFrame passed to ``px.histogram``.
    cat_var : name of the categorical column to count.
    title : figure title.
    ordinal : when True (and ``order`` is given), categories are laid out in ``order``.
    order : sequence of category labels defining the display order.
    orientation : 'h' puts the categories on the y axis (horizontal bars);
        any other value puts them on the x axis. Also used as the legend orientation.
    xtick_angle : optional tick label angle for the x axis (0 is honoured).

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    horizontal = orientation == 'h'
    if horizontal:
        fig = px.histogram(data, y=cat_var, histnorm='percent')
    else:
        fig = px.histogram(data, x=cat_var, histnorm='percent')

    fig.update_layout(
        # `titlefont` is a deprecated layout attribute; set the size through title.font
        title=dict(text=title, font=dict(size=20)),
        # "San Serif" is not a valid font family; use the generic CSS family instead
        font_family="sans-serif",
        showlegend=True,
        legend=dict(
            orientation=orientation,
            y=1.0,
            yanchor="top",
            x=1.0,
            xanchor="right",
        ),
    )

    if ordinal and order is not None:
        category_layout = dict(categoryorder='array', categoryarray=list(order))
        # The categories live on the y axis for horizontal bars, so order that axis
        if horizontal:
            fig.update_yaxes(**category_layout)
        else:
            fig.update_xaxes(**category_layout)

    # Compare against None so an explicit angle of 0 is not ignored
    if xtick_angle is not None:
        fig.update_xaxes(tickangle=xtick_angle)

    fig.update_traces(marker_color=None, marker_line_color='white',
                      marker_line_width=1.5, opacity=0.99)
    fig.show()

def plot_pie(data, cat_var, title):
    """Show a donut chart of the value shares of a categorical column.

    Parameters
    ----------
    data : DataFrame passed to ``px.pie``.
    cat_var : name of the column whose values become the pie sectors.
    title : figure title.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # No plt.figure() here: the chart is a plotly figure, so a matplotlib figure
    # would only produce an empty, never-closed canvas on every call.
    fig = px.pie(data, names=cat_var, title=title, hole=0.6)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
    fig.show()

In [None]:
# Merge every age bracket from 40 upwards into a single '40+' bucket
to_replace = ['40-44', '45-49', '50-54', '55-59', '60-69', '70+']
df['Age'] = df['Age'].replace(to_replace, '40+')
df['Age'].unique()

In [None]:
# Age distribution, with the brackets shown in ascending order
age_order = ['18-21', '22-24', '25-29', '30-34', '35-39', '40+']
plot_hist(
    data=df,
    cat_var='Age',
    title='Age Distribution',
    ordinal=True,
    order=age_order,
)

In [None]:
# It turns out that a lot of young people (18-21) are interested in DS, and they could become real competition,
# since younger people may be more eager to learn than those who have already studied and been working.
# It means that DS is trending among young people and they might compete with older practitioners.

In [None]:
# Peek at the raw Degree answers
df['Degree']

In [None]:
# let's look at degree dist
# first group the degrees into (below bachelor, bachelor, master, more than master)
df['Degree'] = df['Degree'].replace(['Doctoral degree', 'Professional doctorate'], 'After Master')
df['Degree'] = df['Degree'].replace(
    ['No formal education past high school',
     'Some college/university study without earning a bachelor’s degree'],
    'Before Bachelor',
)
# 'I prefer not to answer' tells us nothing about the education level, so treat it as
# missing instead of counting it as 'Before Bachelor' (which would inflate that group).
df['Degree'] = df['Degree'].replace('I prefer not to answer', np.nan)

df['Degree'].unique()

In [None]:
degrees_order = ['Before Bachelor', 'Bachelor’s degree', 'Master’s degree', 'After Master']
# Draw both charts from the same rows (missing degrees removed) so their percentages agree
degree_known = df.dropna(subset=['Degree'])
plot_hist(degree_known, 'Degree', 'Degree Distribution', ordinal=True, order=degrees_order)
plot_pie(degree_known, 'Degree', 'Degree Distribution')

#### - About 75% of respondents have a Bachelor's or Master's degree (or intend to earn one soon).
#### - Not many respondents are without a university or school education.
#### - I know there are some people who didn't attend university and became very successful,
#### - but this is not the case for the majority of them.

In [None]:
# Distinct answers in the Experience column
df['Experience'].unique()

In [None]:
# Make the column easier to work with: respondents who have never written code
# are excluded by turning their answer into a missing value
df['Experience'] = df['Experience'].replace('I have never written code', np.nan)

In [None]:
# Distinct Experience answers after the cleanup
df['Experience'].unique()

In [None]:
# Experience brackets in ascending order; rows without an Experience value are left out
ex_order = ['< 1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']
plot_hist(
    data=df.dropna(subset=['Experience']),
    cat_var='Experience',
    title='Experience distribution',
    ordinal=True,
    order=ex_order,
)

In [None]:
# As we can see, around 50% are juniors or mid-level (at most 5 years of coding).
# Only around 7% have 20+ years of experience (likely 40+ years old).
# It seems that people younger than 40 use Kaggle more, perhaps to gain expertise.
df['Salary'].dtype

In [None]:
# Annual salary: mark '$0 ($USD)' as missing, then compute the share of rows without a salary
df['Salary'] = df['Salary'].replace('$0 ($USD)', np.nan)
df['Salary'].isna().sum() / len(df)

In [None]:
# It seems like 66% of respondents did not state their salary.
# Does that have something to do with culture or country?
def s(group):
    """Return the number of missing values in ``group``."""
    return group.isna().sum()

# Number of missing salaries per country, largest first
pivot = pd.pivot_table(data=df, index=df['Country'], values=['Salary'], aggfunc=s)
pivot = pivot.sort_values('Salary', ascending=False)

plt.rcParams["figure.figsize"] = [20, 6]
pivot.plot(kind='bar')
plt.show()

# It seems India has a huge number of respondents who did not disclose how much they earn

In [None]:
# Simplify the analysis by collapsing the annual salary brackets into four ranges:
# <1000, 1000-9,999, 10,000-99,999, 100,000<=
salary_ranges = (
    # <1000
    ('< $1000', ['$0-999']),
    # 1000 - 9,999
    ('1,000-9,999', ['1,000-1,999', '2,000-2,999', '3,000-3,999', '4,000-4,999',
                     '5,000-7,499', '7,500-9,999']),
    # 10,000 - 99,999
    ('10,000-99,999', ['10,000-14,999', '15,000-19,999', '20,000-24,999', '25,000-29,999',
                       '30,000-39,999', '40,000-49,999', '50,000-59,999', '60,000-69,999',
                       '70,000-79,999', '80,000-89,999', '90,000-99,999']),
    # > 100,000
    ('> $100,000', ['100,000-124,999', '125,000-149,999', '150,000-199,999', '200,000-249,999',
                    '250,000-299,999', '300,000-499,999', '$500,000-999,999', '>$1,000,000']),
)
# Apply the groups one after another, in the order listed above
for range_label, brackets in salary_ranges:
    df['Salary'] = df['Salary'].replace(brackets, range_label)

In [None]:
# Number of respondents in each of the four salary ranges
df.Salary.value_counts()

In [None]:
# Salary ranges in ascending order; both charts only use rows with a known salary
s_order = ['< $1000', '1,000-9,999', '10,000-99,999', '> $100,000']
plot_hist(
    data=df.dropna(subset=['Salary']),
    cat_var='Salary',
    title='Annual Salary distribution',
    ordinal=True,
    order=s_order,
    orientation='h',
)
plot_pie(
    data=df.dropna(subset=['Salary']),
    cat_var='Salary',
    title='Annual Salary Distribution',
)

In [None]:
# Around 55% of the Kagglers earn more than $10,000, and 42% of them fall in the $10,000 - 99,999 range

### Some insights
- Data science is getting more popular among young people
- Most data scientists have a degree below a Master's degree
- Most of the Kagglers have 3-5 years of coding experience
- It seems like Kagglers earn a lot of money (it could vary by country, experience, and education level)

## Look at education level by age

In [None]:
# cascatter implementation - reused from https://github.com/myrthings/catscatter/blob/master/catscatter.py
# (c) Myr Barnés, 2020
# More info about this function is available at
# - https://towardsdatascience.com/visualize-categorical-relationships-with-catscatter-e60cdb164395
# - https://github.com/myrthings/catscatter/blob/master/README.md
def catscatter(df, colx, coly, colx_order, coly_order, color=['blue','gray', 'green'], ratio=180, xlabel=None, ylabel=None):
    """Draw a categorical scatter plot ("catscatter") of ``colx`` against ``coly``.

    One point is drawn per (colx, coly) combination present in ``df``. Its size is the
    percentage of rows with that ``colx`` value that also have that ``coly`` value
    (rows with a missing value in either column are not counted by the groupby).
    Every combination of the two orders is annotated with its percentage;
    combinations absent from the data are annotated as 0.0%.

    Parameters
    ----------
    df : DataFrame containing the two categorical columns.
    colx, coly : names of the columns shown on the x and y axes.
    colx_order : category labels of ``colx`` in display order (left to right).
    coly_order : category labels of ``coly`` in display order (top to bottom).
    color : list of colors; the first two alternate for the dashed background
        lines, the last one is used for the points and the tick labels.
    ratio : multiplier applied to the percentage to get the marker size.
    xlabel, ylabel : optional axis labels.

    Raises
    ------
    KeyError
        If the data contains a category that is not listed in the matching order.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    counts = df.groupby([colx, coly]).size().reset_index(name='count')

    # Percentage of each coly value within its colx value, rounded to 2 decimals
    totals_per_x = counts.groupby(colx)['count'].transform('sum')
    counts['ratio'] = (counts['count'] / totals_per_x * 100).round(2)

    # Encode the categories as axis positions. The codes come from the orders
    # themselves (not from the values present in the data), so a category that is
    # absent from the data cannot shift or drop the codes of the others.
    colx_codes = {category: position for position, category in enumerate(colx_order)}
    coly_codes = {category: position for position, category in enumerate(list(coly_order)[::-1])}

    unknown = (set(counts[colx]) - set(colx_codes)) | (set(counts[coly]) - set(coly_codes))
    if unknown:
        raise KeyError(
            f"categories missing from colx_order/coly_order: {sorted(unknown, key=str)}"
        )

    x_positions = counts[colx].map(colx_codes)
    y_positions = counts[coly].map(coly_codes)

    # Percentage lookup used for the annotations
    shares = dict(zip(zip(counts[colx], counts[coly]), counts['ratio']))

    # Apply the tick styling only while this figure is built, instead of
    # permanently changing the global rcParams for every later matplotlib plot.
    tick_style = {
        'xtick.bottom': False,
        'xtick.labelbottom': False,
        'xtick.top': True,
        'xtick.labeltop': True,
        'xtick.color': color[-1],
        'ytick.color': color[-1],
    }
    with plt.rc_context(tick_style):
        plt.figure(figsize=(50, 20))
        plt.box(False)

        # Dashed background grid, alternating between the first two colors
        for num in range(len(coly_codes)):
            plt.hlines(num, -1, len(colx_codes) + 1, linestyle='dashed', linewidth=1,
                       color=color[num % 2], alpha=0.5)
        for num in range(len(colx_codes)):
            plt.vlines(num, -1, len(coly_codes) + 1, linestyle='dashed', linewidth=1,
                       color=color[num % 2], alpha=0.5)

        # The points, sized by percentage
        plt.scatter(x_positions,
                    y_positions,
                    s=counts['ratio'] * ratio,
                    zorder=2,
                    color=color[-1])

        # Annotate every combination; missing ones default to 0.0 instead of raising
        for x_category, x_place in colx_codes.items():
            for y_category, y_place in coly_codes.items():
                percentage = shares.get((x_category, y_category), 0.0)
                plt.annotate(f'{percentage}%', (x_place + .1, y_place + .1), size=30)

        # Replace the numeric ticks with the category labels and limit the axes
        plt.xticks(ticks=list(colx_codes.values()), labels=list(colx_codes.keys()),
                   rotation=90, fontsize=30)
        plt.yticks(ticks=list(coly_codes.values()), labels=list(coly_codes.keys()),
                   fontsize=30)
        plt.xlim(-1, len(colx_codes))
        plt.ylim(-1, len(coly_codes))

        if xlabel:
            plt.xlabel(xlabel, size=50, color='gray')

        if ylabel:
            plt.ylabel(ylabel, size=50, color='gray')
        plt.show()

In [None]:
# Keep only the two columns compared in this section
df_age_degree = df.loc[:, ['Age', 'Degree']]

# Share of each degree level within every age bracket
catscatter(
    df_age_degree,
    colx='Age',
    coly='Degree',
    colx_order=age_order,
    coly_order=degrees_order,
    ratio=180,
)

In [None]:
# We have some older Kagglers (40+) with less than a bachelor's degree! Surprising.
# Some young Kagglers (18-21) have a master's degree or above, which is really interesting as well (maybe later I'll figure out where those people are from)
# In general, the older you are, the higher the chance of holding a higher degree

## Look at Coding Experience by age

In [None]:
# Keep only the age and coding-experience columns
df_age_ex = df.loc[:, ['Age', 'Experience']]

# Share of each experience bracket within every age bracket
catscatter(
    df_age_ex,
    colx='Age',
    coly='Experience',
    colx_order=age_order,
    coly_order=ex_order,
    ratio=180,
)


In [None]:
# Here we have some amazing things to point out:
# We have some (18-21) Kagglers with (5-10 years) of experience, which means they might have started coding at 12 or 13.
# Most of the (22-24) group have (1-3 years), which makes a lot of sense, since they would typically be at university at that age

## Look at Annual Salary by age

In [None]:
# Keep only the age and annual-salary columns
df_age_salary = df.loc[:, ['Age', 'Salary']]

# Share of each salary range within every age bracket
catscatter(
    df_age_salary,
    colx='Age',
    coly='Salary',
    colx_order=age_order,
    coly_order=s_order,
    ratio=180,
)


In [None]:
# It makes sense that most of the Kagglers who make 100K+ are (40+)
# It seems like the older you get, the more money you make
# But more interesting is that we find some young Kagglers (18-21) who earn more than 100K,
# maybe they are the ones who started coding much earlier (at 12 or 13 years old)
# One more interesting observation is that most of the 40+ Kagglers earn less than 100K

## What is the relation between salary and years of experience?

In [None]:
# Keep only the coding-experience and annual-salary columns
df_ex_salary = df.loc[:, ['Experience', 'Salary']]

# Share of each salary range within every experience bracket
catscatter(
    df_ex_salary,
    colx='Experience',
    coly='Salary',
    colx_order=ex_order,
    coly_order=s_order,
    ratio=180,
)

In [None]:
# As expected, the more years of experience you have, the more money you make.

# We have some brilliant Kagglers (33%) with less than 1 year of experience who earn more than 10K,
# which means that even if you're a beginner, you could make good money.

# For those who earn (1000 - 9999), it seems that experience doesn't have that much effect.

## What is the relation between salary and education level?

In [None]:
# Keep only the degree and annual-salary columns
df_degree_salary = df.loc[:, ['Degree', 'Salary']]

# Share of each salary range within every degree level
catscatter(
    df_degree_salary,
    colx='Degree',
    coly='Salary',
    colx_order=degrees_order,
    coly_order=s_order,
    ratio=180,
)

In [None]:
# Here the share earning 100K+ starts at 7% and increases to 20% once you hold a degree beyond a master's.
# Compared with the experience plot above, where the share earning 100K+ goes from 3% to 34%.

# If you have a degree below a bachelor's,
# you and someone with a Master's have almost the same chance of getting a job in the salary range (1000 - 9999)

# it seems like education level or at least (having Master's degree) doesn't matter a lot

# That could mean education level matters, but not as much as experience level:
# experience level has a much more significant effect on how much you earn