# IT Salary Survey EU 2020

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides any pandas / seaborn warnings raised by later cells.
import warnings
warnings.filterwarnings("ignore")

# Print the full path of every file found under the Kaggle input directory.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Importing libaries for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In this notebook I will work only with the 2020 dataset (IT Salary Survey EU 2020). It contains many compelling features that can be explored in detail, with some feature engineering, to gain better insight into the data.

In [None]:
# Load the 2020 survey responses and preview the first rows.
# The file name is written with a double space before "2020".
df_2020 = pd.read_csv('/kaggle/input/2020-it-salary-survey-for-eu-region/IT Salary Survey EU  2020.csv')
df_2020.head()

In [None]:
# Dimensions of the raw survey data as (rows, columns)
df_2020.shape # author recorded 1253 rows and 23 columns

In [None]:
# Number of missing (NaN) values in each column
df_2020.isna().sum()

# Background Information about participants in the survey
In this section, I will go over basic features such as Age and Gender of the participants. I will be going into a more detailed analysis later on in the notebook.

### Age of survey participants

In [None]:
# Age distribution: box plot on top, histogram with KDE below, sharing one x axis
fig, (ax_box, ax_hist) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 8))

sns.boxplot(x='Age', data=df_2020, ax=ax_box)
ax_box.set(xlabel="")
sns.histplot(x='Age', data=df_2020, kde=True, ax=ax_hist)

# The pyplot calls below act on the current axes (the histogram panel)
plt.xlabel("Age of survey participants", fontsize=14)
plt.ylabel("Frequencies", fontsize=14)
plt.xticks(fontsize=13, rotation=90)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Same information as a count plot: one bar per distinct age
plt.figure(figsize=(15, 8))
ax = sns.countplot(x='Age', data=df_2020)

plt.xlabel("Age of survey participants", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.xticks(fontsize=13, rotation=90)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of respondent age
df_2020['Age'].describe()

### City

In [None]:
# Ten most common respondent cities, drawn as a horizontal bar chart.
# Both columns are named explicitly: the default labels produced by
# value_counts().reset_index() differ between pandas 1.x ('index' / 'City')
# and pandas 2.x ('City' / 'count'), so relying on them breaks across versions.
top_cities = (
    df_2020['City']
    .value_counts()
    .head(10)
    .rename_axis('City')
    .reset_index(name='Respondents')
)

plt.figure(figsize=(15,8))
sns.barplot(data=top_cities, y='City', x='Respondents')
plt.xticks(fontsize=13)
plt.xlabel("Frequency",fontsize=14)
plt.yticks(fontsize=13)
plt.ylabel("City",fontsize=14)  # the bars are cities, not countries
plt.show()

In [None]:
# Share of the five most common cities, with a legend instead of wedge labels
top5_cities = df_2020['City'].value_counts().iloc[:5]

plt.figure(figsize=(15,8))
top5_cities.plot.pie(
    autopct="%1.2f%%",
    fontsize=13,
    startangle=90,
    labels=[''] * 5,        # blank wedge labels; the names go in the legend
    cmap='Set2',
    explode=[0.05] * 5,
    pctdistance=1.2,
)
plt.ylabel("")
plt.legend(loc='upper left', labels=top5_cities.index)
plt.show()

In [None]:
# Gender split of all respondents as a pie chart.
gender_counts = df_2020['Gender'].value_counts()

plt.figure(figsize=(15,8))

# `explode` must have exactly one entry per wedge; derive its length from the
# data instead of hard-coding 3 so the cell still runs if the number of
# gender categories changes.
gender_counts.plot.pie(autopct="%1.2f%%",fontsize=12,startangle=90,
                       cmap='crest',explode=[0.05] * len(gender_counts),pctdistance=1.1,
                       labeldistance=1.3,textprops={'fontsize': 15})
plt.ylabel("")
plt.show()

# Yearly brutto salary (without bonus and stocks) in EUR
In this section, I will analyze the "Yearly brutto salary (without bonus and stocks) in EUR" column to get a clear understanding of the base salary of IT employees as their main source of income. Additionally, I will use the interquartile range (IQR) method to remove outliers (based on the yearly brutto salary) from the dataset.

In [None]:
# Explore the raw yearly salary: box plot of the original values next to a
# box plot of the log10-transformed values.

interested_df = df_2020['Yearly brutto salary (without bonus and stocks) in EUR']

sns.set_style('whitegrid')
plt.figure(figsize=(12,6))

# Original data.
# The Series is passed as `x=` explicitly: a bare positional argument is
# treated as `x` (with a FutureWarning) by older seaborn releases and as
# `data` by newer ones, which changes the orientation of the plot.
plt.subplot(1,2,1)
sns.boxplot(x=interested_df)
plt.title("Original Unscaled Salary")

# After scaling down using logarithms
plt.subplot(1,2,2)
sns.boxplot(x=np.log10(interested_df))
plt.title("Scaled Salary (log10)")

plt.tight_layout()
plt.show()

The boxplots above show that there are potential outliers; I will therefore handle them later in this notebook.

In [None]:
# Remove salary outliers with the IQR rule (1.5 * IQR beyond the quartiles).

feature = 'Yearly brutto salary (without bonus and stocks) in EUR'

p25 = df_2020[feature].quantile(0.25) # 58800.0 (value recorded by the author)
p75 = df_2020[feature].quantile(0.75) # 80000.0 (value recorded by the author)
iqr = p75 - p25 # 21200.0

upper_limit = p75 + 1.5 * iqr # 111800.0
lower_limit = p25 - 1.5 * iqr # 27000.0

# New DataFrame after removing the outliers (rows exactly on a fence are excluded).
# `.copy()` makes new_df an independent frame: later cells fill and rename its
# columns, which would otherwise be writes into a slice of df_2020.
within_fences = (df_2020[feature] > lower_limit) & (df_2020[feature] < upper_limit)
new_df = df_2020[within_fences].copy()

# Number of rows dropped by the filter
diff = df_2020.shape[0] - new_df.shape[0] # author recorded 32 outliers removed

diff

In [None]:
# Salary distribution after outlier removal: histogram (left) and box plot (right).
# NOTE: relies on `feature` still holding the salary column name set by the IQR cell.

fig, (ax1,ax2) = plt.subplots(1,2,figsize=(15,6))

# Pass the Series as `x=` so both plots are laid out along the x axis on every
# seaborn version (a bare positional Series is `x` in old releases, `data` in new ones).
sns.histplot(x=new_df[feature], ax=ax1)
sns.boxplot(x=new_df[feature], ax=ax2)

plt.tight_layout()
plt.show()

# Relationship between Gender and other Features
I will be exploring the relationship between Gender and other features like Age, Number of vacation days, Yearly brutto salary (without bonus and stocks) in EUR, etc.

### Data Cleaning for Gender Column

In [None]:
# Distinct values (including NaN) in the Gender column of the filtered data.
# `feature` is rebound here and reused by the following cell.
feature = 'Gender'
new_df[feature].unique()

After removing outliers (based on Yearly brutto salary (without bonus and stocks) in EUR) from the dataset, the Diverse gender (count = 2) is no longer represented. There are still some NaN values (no response) left in the dataset. I will replace the NaN values with the string "Not Specified".

In [None]:
# Replace missing Gender answers with the string "Not Specified".
# fillna with a {column: value} mapping returns a new frame, so nothing is
# written into a slice of df_2020 and the cell can be re-run safely.
new_df = new_df.fillna({feature: 'Not Specified'})
new_df[feature].unique()

### Gender and Age

In [None]:
# Distinct values present in the Age column of the filtered data
new_df['Age'].unique()

In [None]:
# Age distribution per gender as box plots
plt.figure(figsize=(12, 8))
sns.boxplot(data=new_df, x='Gender', y='Age', palette='Pastel2')

plt.xlabel("")
plt.ylabel("Age", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Gender and Yearly Salary

In [None]:
# Salary per gender: box plot with the individual observations overlaid as a swarm
salary = 'Yearly brutto salary (without bonus and stocks) in EUR'

plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=new_df, x='Gender', y=salary, palette='Set3', linewidth=2.5)
ax = sns.swarmplot(data=new_df, x='Gender', y=salary)

plt.xlabel("")
plt.ylabel(salary, fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Stacked salary histogram (20 bins, with KDE) coloured by gender
plt.figure(figsize=(15, 8))
sns.histplot(
    data=new_df,
    x='Yearly brutto salary (without bonus and stocks) in EUR',
    hue='Gender',
    multiple="stack",
    bins=20,
    kde=True,
    palette='magma',
)
plt.xlabel("Yearly brutto salary (without bonus and stocks) in EUR", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Gender and Number of Vacation Days

In [None]:
# Distinct raw answers in the "Number of vacation days" column.
# `feature` is rebound here and reused by the cleaning cell that follows.

feature = 'Number of vacation days'
new_df[feature].unique()

In [None]:
# Keep only rows whose vacation-day answer is a plain non-negative integer
# string: NaN and free-text answers such as 'unlimited' or '(no idea)' are dropped.
vacation_sal = new_df[new_df[feature].notna()]  # remove NaN values
# `.copy()` gives an independent frame, so the dtype conversion below is not an
# assignment into a slice of new_df.
vacation_sal = vacation_sal[vacation_sal[feature].str.isnumeric()].copy()
vacation_sal[feature] = pd.to_numeric(vacation_sal[feature])

# Remove potential outliers in the number of vacation days with the same IQR
# rule used for the salary. The column name is spelled out instead of using
# `feature` as a reminder of what is being filtered.
p25 = vacation_sal['Number of vacation days'].quantile(0.25)
p75 = vacation_sal['Number of vacation days'].quantile(0.75)
iqr = p75 - p25

upper_limit = p75 + 1.5 * iqr
lower_limit = p25 - 1.5 * iqr

vacation_sal = vacation_sal[(vacation_sal['Number of vacation days'] > lower_limit) &
                               (vacation_sal['Number of vacation days'] < upper_limit)]

# Number of rows dropped relative to new_df
diff = new_df.shape[0] - vacation_sal.shape[0] # author recorded 74 rows removed
diff

In [None]:
# Stacked histogram (10 bins, with KDE) of vacation days, coloured by gender
plt.figure(figsize=(15, 10))

sns.histplot(
    data=vacation_sal,
    x='Number of vacation days',
    hue='Gender',
    multiple="stack",
    bins=10,
    kde=True,
    palette='viridis',
)
plt.xlabel("Number of vacation days", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Quick: Salary and Number of vacation days
Since we are already on the topic of number of vacation days, lets explore its relationship with employee salaries

In [None]:
# Swarm plot of yearly salary against number of vacation days, coloured by gender
plt.figure(figsize=(15, 8))
sns.swarmplot(
    data=vacation_sal,
    x='Number of vacation days',
    y='Yearly brutto salary (without bonus and stocks) in EUR',
    hue='Gender',
)

plt.xlabel("Number of vacation days", fontsize=14)
plt.ylabel("Yearly brutto salary (without bonus and stocks) in EUR", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Bivariate histogram: vacation days on x, yearly salary on y
plt.figure(figsize=(15, 10))
sns.histplot(
    data=vacation_sal,
    x='Number of vacation days',
    y='Yearly brutto salary (without bonus and stocks) in EUR',
)
plt.xlabel("Number of vacation days", fontsize=14)
plt.ylabel("Yearly brutto salary (without bonus and stocks) in EUR", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

According to the distribution, a large portion of respondents gets 30 vacation days. The second highest concentration seems to be around 28 vacation days

# Deep Dive into Position
Now, I will explore the different positions in the IT workforce.

### Data Cleaning for Position column

In [None]:
# The raw column header is 'Position ' (with a trailing space); give it a clean name,
# then show how many respondents hold each position.
new_df = new_df.rename(columns={'Position ': 'Position'})
new_df['Position'].value_counts()

### Salary by Position

In [None]:
# Respondent count for the ten most common positions
top10_positions = new_df['Position'].value_counts().iloc[:10].index

plt.figure(figsize=(15, 8))
sns.countplot(x='Position', data=new_df, order=top10_positions, palette='Pastel2')
plt.xlabel("Position", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(rotation=70, fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Total yearly salary paid per position, largest first
position_earning = (
    new_df
    .groupby('Position')['Yearly brutto salary (without bonus and stocks) in EUR']
    .sum()
    .sort_values(ascending=False)
)
position_earning.head(10)

Given that far more people work as Software Engineers than in any other position, it is expected that the total earnings for Software Engineers would be the highest. Now, let's look at the average earnings of the top positions to get a clearer picture.

In [None]:
# Average salary per position = total salary / number of respondents in that position,
# printed for the ten positions with the highest total salary.
top_position = position_earning.iloc[:10].index.to_list()

respondents_per_position = new_df['Position'].value_counts()
avr_earning = (position_earning / respondents_per_position).sort_values(ascending=False)

for position in top_position:
    rounded_average = round(avr_earning[position], 2)
    print(position, ": ", rounded_average)

In [None]:
# Top 10 highest total earning by position, as a horizontal bar chart.
# The plotting frame gets its own name: rebinding `position_earning` from a
# Series to a DataFrame made this cell (and the average-earning cell) fail when
# re-run, because a DataFrame has no `.to_frame()` and divides differently.
salary_col = 'Yearly brutto salary (without bonus and stocks) in EUR'
if isinstance(position_earning, pd.DataFrame):
    # tolerate a kernel where an earlier run already converted it to a frame
    top10_earning = position_earning[salary_col].head(10)
else:
    top10_earning = position_earning.head(10)
top10_earning = top10_earning.rename_axis('Position').reset_index(name=salary_col)

plt.figure(figsize=(12,8))

sns.barplot(data=top10_earning, y='Position', x=salary_col, palette='Pastel1')

# Show plain numbers instead of scientific notation on the salary axis
plt.ticklabel_format(style='plain', axis='x')
plt.xticks(fontsize=13)
plt.xlabel("Yearly brutto salary (without bonus and stocks) in EUR",fontsize=14)
plt.yticks(fontsize=13)
plt.ylabel("Position",fontsize=14)
plt.show()

### Quick Glimpse at the relationship between Programming Language / Technology and Yearly salary

In [None]:
# The 15 technologies with the highest total salary, after normalising the
# free-text answers to lower case with surrounding whitespace removed.

modi_new_df = new_df.copy()
modi_new_df['Your main technology / programming language'] = (
    modi_new_df['Your main technology / programming language'].str.lower().str.strip()
)

salary_by_tech = (
    modi_new_df
    .groupby('Your main technology / programming language')
    ['Yearly brutto salary (without bonus and stocks) in EUR']
    .sum()
    .sort_values(ascending=False)
)
top_tech_sal = salary_by_tech.index[:15].to_list()
top_tech_sal

In [None]:
# Salary distribution for each of the top technologies (one box per technology).

data = modi_new_df[modi_new_df['Your main technology / programming language'].isin(top_tech_sal)]

plt.figure(figsize=(15,10))
sns.boxplot(data=data,y='Yearly brutto salary (without bonus and stocks) in EUR',
           x='Your main technology / programming language')
# No plt.legend() here: the plot has no labelled artists, so the call only
# produced a "no artists with labels" warning and an empty legend.
plt.xticks(rotation=45,fontsize=13)
plt.xlabel('Technology/ Programming Language',fontsize=14)  # fixed 'Teachnology' typo
plt.yticks(fontsize=13)
plt.ylabel("Yearly brutto salary (without bonus and stocks) in EUR",fontsize=14)
plt.show()

### Programming Language In-Depth

In [None]:
# Ten most frequent raw (uncleaned) technology answers
top10_raw_tech = new_df['Your main technology / programming language'].value_counts().iloc[:10].index

plt.figure(figsize=(15, 8))
sns.countplot(
    data=new_df,
    x='Your main technology / programming language',
    order=top10_raw_tech,
    palette='inferno_r',
)
plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Distinct answers within a random sample of 100 rows of the technology column.
# A fixed random_state keeps the sample (and the discussion that follows it)
# reproducible across kernel restarts.
new_df['Your main technology / programming language'].sample(100, random_state=42).unique()

After sampling the unique values of the 'Your main technology / programming language' column, we can see that the input style varies a lot. Some respondents entered more than one programming language, using different separators such as a slash or a comma (and even the spacing around the separator varies). The same programming language can also be written in several different ways (different capitalization, or an abbreviation instead of the full name). Even in the bar graph above, two JavaScript bars and two Python bars appear where there should be just one bar per language. This happens because the inputs are written differently (a lower-case "s" and "p" respectively). In the following cells, I will try to capture the real count for each programming language as best as I can.

In [None]:
# Data Modification

# Collect every non-missing "main technology" answer as a plain list of raw strings.
# NOTE(review): this reads from df_2020 (the unfiltered survey) while the surrounding
# cells use new_df — confirm that including the salary outliers here is intended.
programming_raw_list = df_2020['Your main technology / programming language'].dropna().to_list()

# Preview the first ten raw answers
programming_raw_list[0:10]

In [None]:
# Break each raw answer into individual lower-case technology tokens.
#
# All spaces are removed, "/" is treated like ",", and the answer is split on
# ",". Because every space is dropped, this produces the same tokens as first
# rewriting ", " and " / " to ",".
# Empty tokens (from a trailing or doubled separator such as "Java, ") are
# skipped so that the empty string is not counted as a language.

programming_list = list()
for raw_answer in programming_raw_list:
    tokens = raw_answer.strip().lower().replace(" ", "").replace("/", ",").split(",")
    programming_list.extend(token for token in tokens if token)
programming_list[0:10]

In [None]:
from collections import Counter

# Tally every token, then turn the tally into a two-column table sorted by frequency
programming_count = Counter(programming_list)

main_programming_df = pd.DataFrame(
    list(programming_count.items()),
    columns=['Language', 'Count'],
).sort_values(by='Count', ascending=False)
main_programming_df.head(10)

After this modification, the order and counts differ from before. Even so, this method may not be the best one: I still could not handle abbreviations such as 'js', which stands for JavaScript. Therefore, the counts may be off for languages that are often written as an abbreviation.

In [None]:
# Bar chart of the ten most frequent technology tokens, each bar annotated with its count
top10_languages = main_programming_df.head(10)

plt.figure(figsize=(12, 8))
ax = sns.barplot(x='Language', y='Count', data=top10_languages, palette='pastel')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.0f}', (bar.get_x() + 0.2, bar_height + 5), fontsize=12)

plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Positions and their preferred Programming Language

In [None]:
# Software Engineer and main programming language.
# Each answer is lower-cased and split on ",", "/" and spaces; empty tokens are
# skipped while the list is built. This replaces the later
# `drop(index="")`, which raised KeyError whenever no empty token was present,
# and removes a `.replace(" / ", ",")` that could never match because every
# "/" had already been replaced.
se_raw_pro = new_df[new_df['Position'] == 'Software Engineer'][
                    'Your main technology / programming language'].dropna().to_list()
se_pro_list = list()

for langs in se_raw_pro:
    langs = langs.strip().lower().replace("/", ",").replace(" ", ",").split(",")
    se_pro_list.extend(lang for lang in langs if lang)

# One row per token, indexed by the token, with its frequency in 'Count'
se_pro_df = pd.DataFrame.from_dict(Counter(se_pro_list), orient='index', columns=['Count'])
se_pro_df = se_pro_df.sort_values(by='Count', ascending=False)
se_pro_df.head(11)

In [None]:
# Top 11 technology tokens for Software Engineers, each bar annotated with its count
se_top11 = se_pro_df.head(11)

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=se_top11, x=se_top11.index, y=se_top11['Count'], palette='Pastel2')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.0f}', (bar.get_x() + 0.3, bar_height + 0.3), fontsize=12)

plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13, rotation=65)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Backend Developer and main programming language.
# Fix: the loop previously iterated over `se_raw_pro` and appended to / counted
# `se_pro_list`, so this table showed the Software Engineer answers (counted a
# second time) instead of the Backend Developer answers.

bd_raw_pro = new_df[new_df['Position'] == 'Backend Developer'][
                    'Your main technology / programming language'].dropna().to_list()
bd_pro_list = list()

for langs in bd_raw_pro:
    # lower-case, then split on ",", "/" and spaces; skip empty tokens
    langs = langs.strip().lower().replace("/", ",").replace(" ", ",").split(",")
    bd_pro_list.extend(lang for lang in langs if lang)

# One row per token, indexed by the token, with its frequency in 'Count'
bd_pro_df = pd.DataFrame.from_dict(Counter(bd_pro_list), orient='index', columns=['Count'])
bd_pro_df = bd_pro_df.sort_values(by='Count', ascending=False)
bd_pro_df.head(11)

In [None]:
# Top 11 technology tokens for Backend Developers, each bar annotated with its count
bd_top11 = bd_pro_df.head(11)

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=bd_top11, x=bd_top11.index, y=bd_top11['Count'], palette='Pastel1')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.0f}', (bar.get_x() + 0.25, bar_height + 0.3), fontsize=12)

plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13, rotation=65)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Data Scientist and main programming language.
# Answers are lower-cased and split on ",", "/" and " + ". Each token is
# stripped and empty tokens are skipped: without the strip, an answer such as
# "Python / R" produced the keys "python " and " r", which were counted
# separately from "python" and "r".

ds_raw_pro = new_df[new_df['Position'] == 'Data Scientist'][
    'Your main technology / programming language'].dropna().to_list()

ds_pro_list = list()

for langs in ds_raw_pro:
    # only " + " (with spaces) is a separator, so names like "c++" stay intact
    langs = langs.lower().replace(" + ", ",").replace("/", ",").split(",")
    ds_pro_list.extend(lang.strip() for lang in langs if lang.strip())


# One row per token, indexed by the token, with its frequency in 'Count'
ds_pro_df = pd.DataFrame.from_dict(Counter(ds_pro_list), orient='index', columns=['Count'])
ds_pro_df = ds_pro_df.sort_values(by='Count', ascending=False)
ds_pro_df.head(3)

In [None]:
# Top 3 technology tokens for Data Scientists, each bar annotated with its count
ds_top3 = ds_pro_df.head(3)

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=ds_top3, x=ds_top3.index, y=ds_top3['Count'], palette='Pastel1')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.0f}', (bar.get_x() + 0.35, bar_height + 0.4), fontsize=12)

plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

In [None]:
# Other technologies/programming languages you use often.
# Answers are lower-cased and split on "," and "/". Each token is stripped and
# empty tokens are skipped, so "a ,b" or a trailing comma no longer creates
# separate keys such as "a " or "". Spaces inside a token are kept.

other_raw_pro = new_df['Other technologies/programming languages you use often'].dropna().to_list()

other_pro_list = list()

for langs in other_raw_pro:
    langs = langs.lower().replace("/", ",").split(",")
    other_pro_list.extend(lang.strip() for lang in langs if lang.strip())

# One row per token, indexed by the token, with its frequency in 'Count'
other_pro_df = pd.DataFrame.from_dict(Counter(other_pro_list), orient='index', columns=['Count'])
other_pro_df = other_pro_df.sort_values(by='Count', ascending=False)
other_pro_df.head(10)

In [None]:
# Top 15 "other" technology tokens, each bar annotated with its count
other_top15 = other_pro_df.head(15)

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=other_top15, x=other_top15.index, y=other_top15['Count'], palette='Pastel2')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.0f}', (bar.get_x() + 0.2, bar_height + 0.5), fontsize=12)

plt.xlabel("Technology/ Programming Language", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13, rotation=65)
plt.yticks(fontsize=13)
plt.show()

# Affected by COVID-19 Pandemic?

### Losing a job due to COVID-19?

In [None]:
# Frequency of each distinct answer to the job-loss question.
# `feature` is rebound here and reused by the next two cells.
feature = 'Have you lost your job due to the coronavirus outbreak?'
new_df[feature].value_counts()

In [None]:
# The two most frequent answers (noted by the author as Yes / No) are kept as
# they are; every other answer is combined into a single 'Other' row.
# The 'Other' total is computed from the data rather than hard-coded as 7, so it
# stays correct if the filtering upstream changes.
response_counts = new_df[feature].value_counts()

effected_se = response_counts.iloc[0:2]
other_responses = pd.Series([response_counts.iloc[2:].sum()],index=['Other'])
effected_se = pd.concat([effected_se,other_responses])
effected_se

In [None]:
# Pie chart of the three combined answer groups.
# `explode` has three entries, one per row of effected_se.
effected_se.plot.pie(y=feature, autopct='%1.1f%%', figsize=(8,6), fontsize=13,cmap='Set2',
                     startangle=45, explode =[0.1]*3, pctdistance=1.1,labeldistance=1.3,
                    textprops={'fontsize': 15})
plt.ylabel("")  # hide the default y-axis label
plt.tight_layout()
plt.show()

### Having shorter working week (Kurzarbeit)?

In [None]:
# Distinct answers to the Kurzarbeit (shorter working week) question.
# `feature` is rebound here and reused by the following cells.
feature = 'Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week'
new_df[feature].unique()

In [None]:
# Percentage of rows with no answer (NaN) to the Kurzarbeit question
new_df[feature].isna().mean() * 100

Around 70.28 percent of the responses to the question of whether employees were forced into a shorter working week are NaN. Given how the question is phrased, this suggests that around 70 percent of the workers did not have a shorter working week, and hence that around 30 percent of respondents had a shorter working week during the COVID-19 outbreak.

In [None]:
# Frequency of each non-missing answer (hours per week), most frequent first
adjusted_wh = new_df[feature].value_counts()
adjusted_wh

In [None]:
# Share of answers equal to zero hours and share above ten hours, in percent.
# The zero-hour count is looked up by its value instead of `iloc[0]`, which only
# gave the right number while 0 happened to be the most frequent answer.
total_adjusted = adjusted_wh.sum()
zero_hours = adjusted_wh[adjusted_wh.index == 0].sum()
over_ten_hours = adjusted_wh[adjusted_wh.index > 10].sum()

print("Zero working hours per week: ", zero_hours / total_adjusted * 100 )
print("Greater than 10 hours per week: ", over_ten_hours / total_adjusted * 100)

For those with adjusted working hours, approximately 51.4 percent of the respondents (179) reported zero working hours per week, while around 43.3 percent of the employees still worked more than 10 hours per week.

In [None]:
# Count of respondents for each reported number of hours
plt.figure(figsize=(15, 8))
sns.countplot(x=feature, data=new_df)
plt.xlabel(feature, fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=13, rotation=90)
plt.yticks(fontsize=13)
plt.show()

### Monetary Support from Employer during COVID-19?

In [None]:
# Ten most frequent answers to the work-from-home monetary support question.
# `feature` is rebound here and reused by the following cells.
feature = 'Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR'
new_df[feature].value_counts().head(10)

In [None]:
# Every distinct raw answer (including NaN) to the monetary support question
new_df[feature].unique()

In [None]:
# Number of respondents who gave no answer (NaN) to the monetary support question
new_df[feature].isna().sum()

In [None]:
# Number of respondents who gave any (non-NaN) answer
new_df[feature].count()

There are a lot of missing values for this feature. Although it seems reasonable to assume that a NaN response means no monetary support, I will not rely on that assumption because it can be misleading. I will therefore produce two visualizations: one that excludes NaN values (427) and one that counts NaN values as "No" (744).


In [None]:
# Split the non-missing answers into "no support" and "some support".
# An answer counts as "no support" when, ignoring case and surrounding
# whitespace, it is '0' or 'no'. Normalising the text first implements the
# intended case-insensitive match and avoids the KeyError that the three
# hard-coded lookups ("0", "No", "no") raised if any one of them was absent.
support_res = new_df[feature].value_counts()

answers = new_df[feature].dropna().astype(str).str.strip().str.lower()
num_no_support = int(answers.isin(['0', 'no']).sum())
num_support = int(len(answers) - num_no_support)

num_support, num_no_support

In [None]:
# Two pies side by side: support vs. no support, first ignoring NaN answers,
# then counting NaN answers as "no support".
labels = ['Support', 'No Support']
values = [num_support, num_no_support]
values_nan_as_no = [num_support, num_no_support + new_df[feature].isna().sum()]

plt.figure(figsize=(12, 6))

# Left: NaN (no response) excluded
plt.subplot(1, 2, 1)
plt.title("Ignoring NaN value", fontsize=15, fontweight='bold')
plt.pie(
    values,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    explode=[0.05] * 2,
    colors=['lightgreen', 'lightblue'],
    textprops={'fontsize': 14},
    shadow=True,
)

# Right: NaN treated as "No monetary support"
plt.subplot(1, 2, 2)
plt.title("Counting NaN value as No", fontsize=15, fontweight='bold')
plt.pie(
    values_nan_as_no,
    labels=labels,
    autopct='%1.1f%%',
    startangle=180,
    explode=[0.05] * 2,
    colors=['lightpink', 'violet'],
    textprops={'fontsize': 14},
    shadow=True,
)

plt.tight_layout()
plt.show()

# Other Features
In this section, I will explore other features that I haven't covered yet in the prior visualization. 

### Total years of experience and Seniority Level

In [None]:
# Years of experience per seniority level (Junior / Middle / Senior only).
# Only answers that are plain digit strings are kept and converted to int.
answered_experience = new_df.copy().dropna(subset=['Total years of experience'])
digits_only = answered_experience['Total years of experience'].str.isnumeric()
total_experience = answered_experience[digits_only].assign(
    **{'Total years of experience':
       lambda frame: frame['Total years of experience'].astype(int)}
)

main_levels = total_experience[total_experience['Seniority level'].isin(['Senior', 'Junior', 'Middle'])]

plt.figure(figsize=(15, 8))
sns.boxplot(data=main_levels, x='Seniority level', y='Total years of experience',
            palette='viridis_r')
plt.xlabel("")
plt.ylabel("Total years of experience", fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Total years of experience and Yearly Salary

In [None]:
# Yearly salary per seniority level (Junior / Middle / Senior only), horizontal boxes
seniority_subset = total_experience[
    total_experience['Seniority level'].isin(['Senior', 'Junior', 'Middle'])
]

plt.figure(figsize=(15, 8))
sns.boxplot(
    data=seniority_subset,
    x='Yearly brutto salary (without bonus and stocks) in EUR',
    y='Seniority level',
    palette='viridis_r',
)
plt.xlabel("Yearly brutto salary (without bonus and stocks) in EUR", fontsize=14)
plt.ylabel("")
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.show()

### Main Language at work

In [None]:
# Number of respondents for each main language used at work
new_df['Main language at work'].value_counts()

In [None]:
# Total yearly salary per main language at work, largest first
language_sal = (
    new_df
    .groupby('Main language at work')['Yearly brutto salary (without bonus and stocks) in EUR']
    .sum()
    .sort_values(ascending=False)
)

language_sal

In [None]:
# Mean yearly salary per main language at work, largest first
avr_language_sal = (
    new_df
    .groupby('Main language at work')['Yearly brutto salary (without bonus and stocks) in EUR']
    .mean()
    .sort_values(ascending=False)
)

avr_language_sal

### Company Size and Salary

In [None]:
# Number of respondents in each company-size bracket
new_df['Company size'].value_counts()

In [None]:
# Mean yearly salary for each company-size bracket
avr_earning_size = (
    new_df
    .groupby('Company size')['Yearly brutto salary (without bonus and stocks) in EUR']
    .mean()
)
avr_earning_size

### Company Type and Salary

In [None]:
# Ten most frequent company types among respondents
new_df['Company type'].value_counts().head(10)

In [None]:
# The three most common company types
top3_com_type = new_df['Company type'].value_counts().iloc[:3].index.to_list() # author recorded ['Product', 'Startup', 'Consulting / Agency']

# Mean yearly brutto salary for those three types, largest first.
# The salary column is selected before aggregating: calling .mean() on the whole
# grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0.
top3_com_salary = (
    new_df[new_df['Company type'].isin(top3_com_type)]
    .groupby('Company type')['Yearly brutto salary (without bonus and stocks) in EUR']
    .mean()
    .sort_values(ascending=False)
)
top3_com_salary

In [None]:
# Pie chart of company types: the three most common plus everything else as 'Other'
company_type_counts = new_df['Company type'].value_counts()
data = list(company_type_counts.iloc[:3].values) + [company_type_counts.iloc[3:].sum()]
labels = list(top3_com_type) + ['Other']

plt.figure(figsize=(15, 8))
plt.pie(
    data,
    labels=labels,
    autopct="%1.2f%%",
    explode=[0.05] * 4,
    startangle=15,
    colors=['thistle', 'cornflowerblue', 'lightseagreen', 'khaki'],
    textprops={'fontsize': 14},
    shadow=True,
)
plt.title("Distribution of Company Type", fontsize=15, fontweight='bold')
plt.show()

In [None]:
# List all column names of the filtered data.
# NOTE(review): the original comment here was garbled; it appears to introduce the
# 'Years of experience in Germany' column explored in the next cells — confirm.
new_df.columns

In [None]:
# Frequency of each raw answer for years of experience in Germany
new_df['Years of experience in Germany'].value_counts()

In [None]:
# Years of experience in Germany, restricted to answers that are plain digit strings.
in_germany = new_df.copy().dropna(subset=['Years of experience in Germany'])
in_germany = in_germany[in_germany['Years of experience in Germany'].str.isnumeric()]

# Bars are ordered from the most to the least frequent answer
plt.figure(figsize=(15,8))
sns.countplot(data=in_germany,x='Years of experience in Germany',
             order=in_germany['Years of experience in Germany'].value_counts().index)
plt.xticks(fontsize=13)
plt.xlabel("Years of experience in Germany",fontsize=14)
plt.yticks(fontsize=13)
plt.ylabel("Count",fontsize=14)
# Finish with plt.show() like every other plotting cell, so the cell renders the
# figure instead of echoing the repr of the last label object.
plt.show()

# Thank you for checking out my notebook!