In [None]:
# One-time setup: install the libraries used to speed up the EDA.
# Uncomment the lines below and run this cell once.

# !pip install -U pandas-profiling 
# !pip install -U seaborn 
# !pip install autoviz
# !pip install sweetviz 

In [None]:
# Library Imports
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.express as px
from pandas_profiling import ProfileReport
import sweetviz as sv 
import warnings 
from scipy.stats import trim_mean
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

# Global plot appearance: seaborn grid style first, then the matplotlib
# style sheet, then the notebook-specific rcParams on top of both.
sns.set_style('darkgrid')
plt.style.use('fivethirtyeight')
mpl.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex colour whose alpha byte is 00
})

In [None]:
# Read data
import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for found_path in (os.path.join(dirname, name) for name in filenames):
        print(found_path)

# One CSV per survey year. The file names are reproduced exactly as written
# (the 2019 name starts with "T", the 2020 name contains a double space).
# NOTE(review): confirm them against the listing printed above.
survey_csv_paths = (
    "../input/2020-it-salary-survey-for-eu-region/IT Salary Survey EU 2018.csv",
    "../input/2020-it-salary-survey-for-eu-region/T Salary Survey EU 2019.csv",
    "../input/2020-it-salary-survey-for-eu-region/IT Salary Survey EU  2020.csv",
)
df_2018, df_2019, df_2020 = map(pd.read_csv, survey_csv_paths)

In [None]:
# Accelerate EDA with an automated profile report. Only the 2018 survey is
# profiled here; the commented-out lines show the same call for 2019 and 2020.
profile_2018 = ProfileReport(
    df_2018,
    title="IT Salary Survey for Year 2018",
    explorative=True,
)
# profile_2019 = ProfileReport(df_2019, title="IT Salary Survey for Year 2019", explorative=True)
# profile_2020 = ProfileReport(df_2020, title="IT Salary Survey for Year 2020", explorative=True)

# Render the report inside the notebook; uncomment the last line to also
# write it to an HTML file.
profile_2018.to_notebook_iframe()
# profile_2018.to_file('it_salary_survey_2018.html')

The schema differs slightly between the three survey years, so the columns have to be harmonised before the data can be combined.

## Data Preparation

In [None]:
# Rename the 2018 columns (presumably to the names shared with the other
# survey years) and strip the trailing space from 'Position '.
column_names_2018 = {
    'Your level': 'Seniority level',
    'Are you getting any Stock Options?': 'Stock options?',
    'Main language at work': 'Working language',
    'Position ': 'Position',
}
df_2018 = df_2018.rename(columns=column_names_2018)

# Spell out the one-letter gender codes.
gender_labels = {"M": 'Male', 'F': 'Female'}
df_2018["Gender"] = df_2018["Gender"].replace(gender_labels)

# Parse the timestamps, then store them back as 'YYYY-MM-DD HH:MM:SS' strings.
df_2018['Timestamp'] = (
    pd.to_datetime(df_2018['Timestamp'])
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

In [None]:
# Rename the 2019 columns (including the German 'Zeitstempel' timestamp header).
# NOTE: the 'Сontract duration' key is copied verbatim from the original code;
# its first letter may be a non-Latin look-alike, so do not retype it.
df_2019 = df_2019.rename(columns={
    'Zeitstempel': 'Timestamp',
    'Position (without seniority)': 'Position',
    'Your main technology / programming language': 'Technical Language',
    'Number of vacation days': 'Holidays available',
    'Number of home office days per month': 'WFH days',
    'Сontract duration': 'Contract Duration',
})

# Parse the timestamps, then store them back as 'YYYY-MM-DD HH:MM:SS' strings.
df_2019['Timestamp'] = (
    pd.to_datetime(df_2019['Timestamp'])
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

# Total compensation = base salary + bonus + stocks. A missing component is
# counted as 0, so a row with all three missing ends up with a total of 0.
df_2019['Current Salary'] = (
    df_2019['Yearly brutto salary (without bonus and stocks)'].fillna(0)
    + df_2019['Yearly bonus'].fillna(0)
    + df_2019['Yearly stocks'].fillna(0)
)

# Same total for the previous year's figures.
df_2019['Salary one year ago'] = (
    df_2019['Yearly brutto salary (without bonus and stocks) one year ago. Only answer if staying in same country'].fillna(0)
    + df_2019['Yearly bonus one year ago. Only answer if staying in same country'].fillna(0)
    + df_2019['Yearly stocks one year ago. Only answer if staying in same country'].fillna(0)
)

# Remove the component columns now folded into the totals, plus '0' and
# 'Company name '. errors='ignore' skips labels that are not present (so the
# cell can be re-run) without hiding unrelated exceptions the way the previous
# bare `except` did.
df_2019 = df_2019.drop(
    columns=[
        '0',
        'Company name ',
        'Yearly brutto salary (without bonus and stocks)',
        'Yearly bonus',
        'Yearly stocks',
        'Yearly brutto salary (without bonus and stocks) one year ago. Only answer if staying in same country',
        'Yearly bonus one year ago. Only answer if staying in same country',
        'Yearly stocks one year ago. Only answer if staying in same country',
    ],
    errors='ignore',
)

In [None]:
# Rename the 2020 columns to shorter labels.
# NOTE: the 'Сontract duration' key is copied verbatim from the original code;
# its first letter may be a non-Latin look-alike, so do not retype it.
df_2020 = df_2020.rename(columns={
    'Position ': 'Position',
    'Total years of experience': 'Years of experience',
    'Your main technology / programming language': 'Technical Language',
    'Other technologies/programming languages you use often': 'Other technical skills',
    'Сontract duration': 'Contract Duration',
    'Have you lost your job due to the coronavirus outbreak?': 'Covid job loss',
    'Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week': 'Shorter work week',
    'Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR': 'Covid WFH allowance',
})

# Parse the timestamps, then store them back as 'YYYY-MM-DD HH:MM:SS' strings.
df_2020['Timestamp'] = (
    pd.to_datetime(df_2020['Timestamp'])
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

# Map the free-text answers in the bonus columns to numbers so the columns
# can be cast to float below.
df_2020["Yearly bonus + stocks in EUR"] = df_2020["Yearly bonus + stocks in EUR"].replace(
    {"bvg only": 0, 'depends': 0, '15000+-': 15000, 'Na': 0, 'Not sure': 0, "-": 0})

df_2020["Annual bonus+stocks one year ago. Only answer if staying in same country"] = df_2020["Annual bonus+stocks one year ago. Only answer if staying in same country"].replace(
    {"learning budget, bvg, gym, food": 0, "-": 0})

# Total compensation = base salary + (bonus + stocks). A missing component is
# counted as 0, so a row with both missing ends up with a total of 0.
df_2020['Current Salary'] = (
    df_2020['Yearly brutto salary (without bonus and stocks) in EUR'].astype(float).fillna(0)
    + df_2020['Yearly bonus + stocks in EUR'].astype(float).fillna(0)
)

# Same total for the previous year's figures.
df_2020['Salary one year ago'] = (
    df_2020['Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country'].astype(float).fillna(0)
    + df_2020['Annual bonus+stocks one year ago. Only answer if staying in same country'].astype(float).fillna(0)
)

# Remove the component columns now folded into the totals. errors='ignore'
# skips labels that are not present without hiding unrelated exceptions the
# way the previous bare `except` did.
df_2020 = df_2020.drop(
    columns=[
        'Yearly brutto salary (without bonus and stocks) in EUR',
        'Yearly bonus + stocks in EUR',
        'Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country',
        'Annual bonus+stocks one year ago. Only answer if staying in same country',
    ],
    errors='ignore',
)

In [None]:
# Stack the three yearly frames row-wise into one frame. ignore_index=True
# gives the result a fresh 0..n-1 index; without it every frame keeps its own
# row labels, so the same label can occur once per year and label-based
# selection on the combined frame becomes ambiguous.
df_final = pd.concat([df_2018, df_2019, df_2020], axis=0, ignore_index=True)

In [None]:
# Exact-match replacements that collapse spelling variants of a city name
# into a single label.
# NOTE(review): 'Fr' -> 'Berlin' and 'France' -> 'Paris' are kept exactly as
# written; confirm that these mappings are intended.
city_aliases = {
    "München": 'Munich',
    "Krakau": "Krakow",
    'Cracow': 'Krakow',
    'warsaw': 'Warsaw',
    'Warsaw, Poland': 'Warsaw',
    'Tampere (Finland)': 'Tampere',
    'Fr': 'Berlin',
    'France': 'Paris',
    'NJ, USA': 'New Jersey',
}
df_final["City"] = df_final["City"].replace(city_aliases)

In [None]:
# Display the column labels of the combined frame.
df_final.columns

In [None]:
# Profile the combined frame and render the report inside the notebook.
profile_final = ProfileReport(
    df_final,
    title="IT Salary Survey for Year 2018-2020",
    explorative=True,
)
profile_final.to_notebook_iframe()

## Missing Values Treatment

In [None]:
def missing_value_imputation(df, col_name, nbins=47, proportiontocut=0.1):
    """Visualise a numeric column and compare candidate fill values for its gaps.

    Shows a histogram (with a marginal box plot) of ``df[col_name]``, prints
    the mean, trimmed mean and median of the column, and overlays the density
    of the column after its missing values are filled with each of those three
    statistics. ``df`` is only read; nothing is modified or returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col_name : str
        Name of the column to inspect.
    nbins : int, default 47
        Number of bins requested for the histogram.
    proportiontocut : float, default 0.1
        Fraction passed to ``scipy.stats.trim_mean`` as the proportion to cut
        from each end before averaging.

    Raises
    ------
    ValueError
        If the column contains no non-missing values.
    """
    column = df[col_name]
    observed = column.dropna()
    if observed.empty:
        raise ValueError(
            "Column {!r} has no non-missing values to summarise.".format(col_name))

    # Plot the frame that was passed in (previously this always plotted the
    # global df_2018, whatever `df` was).
    fig = px.histogram(df,
                       x=col_name,
                       marginal='box',
                       nbins=nbins,
                       title=col_name)
    fig.update_layout(bargap=0.1)
    fig.show()

    # Compute each candidate fill value once, from the observed values only.
    # The median in particular must not see NaN: np.median returns NaN for
    # input containing NaN, which would turn the "median imputed" fill into a
    # no-op.
    fill_values = {
        'Mean Imputed': column.mean(),
        'Trimmed Mean Imputed': trim_mean(observed, proportiontocut=proportiontocut),
        'Median Imputed': np.median(observed.values),
    }

    print("Mean: ", fill_values['Mean Imputed'])
    print("Trimmed Mean: ", fill_values['Trimmed Mean Imputed'])
    print("Median: ", fill_values['Median Imputed'])

    # Draw one density curve per imputation strategy. kdeplot is seaborn's
    # replacement for the deprecated distplot(hist=False, kde=True).
    for label, fill_value in fill_values.items():
        sns.kdeplot(column.fillna(fill_value), label=label, linewidth=3)

    # Plot formatting
    plt.legend(prop={'size': 16})
    plt.title('Density Plot with Different Methods of Missing values imputation')
    plt.xlabel(col_name)
    plt.ylabel('Density')
    plt.show()


In [None]:
# Plot the Age distribution and compare mean, trimmed-mean and median imputation.
missing_value_imputation(df_final, "Age")

In [None]:
# Fill the missing ages with the trimmed mean of the observed ages
# (proportiontocut=0.1 is passed to scipy's trim_mean).
observed_ages = df_final["Age"].dropna()
age_fill_value = trim_mean(observed_ages, proportiontocut=0.1)
df_final['Age'] = df_final['Age'].fillna(age_fill_value)

Trimmed-mean imputation is used to fill the missing values because it is robust to outliers.