# Exploration of the factors that lead to a high salary

In [None]:
import datetime
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
import missingno
%matplotlib inline

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn import tree

from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import graphviz
from IPython.display import Image
#import pydotplus
import collections

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw survey responses; the cells below read its question columns ('Q1', 'Q2', ...)
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

In [None]:
# Pick out relevant columns
# Survey question id -> readable column name, in the order the columns are created
selected_questions = {
    'Q1': 'age',
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'highest_education',
    'Q5': 'job_title',
    'Q6': 'experience',
    'Q8': 'recommended_language',
    'Q11': 'computing_platform',
    'Q13': 'use_of_tpu',
    'Q15': 'ml_exp_years',
    'Q20': 'size_of_company',
    'Q21': 'emp_in_ds_dept',
    'Q22': 'ml_in_company',
    'Q24': 'salary',
    'Q25': 'ml_cc_spend',
    'Q30': 'big_data_products',
    'Q32': 'bi_tools',
    'Q38': 'analysis_tools',
}

d1 = pd.DataFrame()
for question_id, column_name in selected_questions.items():
    # [1:] drops the first row of each column
    # (presumably the question-text row of the survey export - confirm against the CSV)
    d1[column_name] = df[question_id][1:]

In [None]:
# Preview the first rows of the selected columns
d1.head()

In [None]:
## Salary
# Missing salary answers get their own explicit 'No answer' bucket
d1['salary'] = d1['salary'].fillna('No answer')

# Salary bands in ascending order; the list position is the ordinal code
# (0 = 'No answer', 25 = '> $500,000')
salary_bands = [
    'No answer',
    '$0-999',
    '1,000-1,999',
    '2,000-2,999',
    '3,000-3,999',
    '4,000-4,999',
    '5,000-7,499',
    '7,500-9,999',
    '10,000-14,999',
    '15,000-19,999',
    '20,000-24,999',
    '25,000-29,999',
    '30,000-39,999',
    '40,000-49,999',
    '50,000-59,999',
    '60,000-69,999',
    '70,000-79,999',
    '80,000-89,999',
    '90,000-99,999',
    '100,000-124,999',
    '125,000-149,999',
    '150,000-199,999',
    '200,000-249,999',
    '250,000-299,999',
    '300,000-500,000',
    '> $500,000',
]
salary_mapping = {band: code for code, band in enumerate(salary_bands)}

# Values absent from the mapping become NaN
d1['salary_ordinal'] = d1['salary'].map(salary_mapping)

In [None]:
# High Salary Indicator
# 1 when the salary code is 19 or above, 0 otherwise (a NaN code also yields 0)
is_high_salary = d1['salary_ordinal'] >= 19
d1['salary_high'] = is_high_salary.astype('int64')

In [None]:
# Age
# Age brackets in ascending order; the list position is the ordinal code
age_brackets = [
    '18-21', '22-24', '25-29', '30-34', '35-39', '40-44',
    '45-49', '50-54', '55-59', '60-69', '70+',
]
age_mapping = dict(zip(age_brackets, range(len(age_brackets))))

# Missing or unmapped ages stay NaN
d1['age_ordinal'] = d1['age'].map(age_mapping)

# Reorder the rows by ascending age code
d1 = d1.sort_values(by=['age_ordinal'])

In [None]:
## Experience
# Missing answers get their own explicit 'No answer' bucket
d1['experience'] = d1['experience'].fillna('No answer')

# Coding-experience levels in ascending order; the position is the ordinal code
experience_levels = (
    'No answer',
    'I have never written code',
    '< 1 years',
    '1-2 years',
    '3-5 years',
    '5-10 years',
    '10-20 years',
    '20+ years',
)
experience_mapping = {level: code for code, level in enumerate(experience_levels)}

d1['experience_ordinal'] = d1['experience'].map(experience_mapping)

In [None]:
# Highest Education
# Missing answers get their own explicit 'No answer' bucket
d1['highest_education'] = d1['highest_education'].fillna('No answer')

# Ordinal code assigned to each education answer
highest_education_mapping = {
    'No answer': 0,
    'I prefer not to answer': 1,
    'No formal education past high school': 2,
    'Some college/university study without earning a bachelor’s degree': 3,
    'Bachelor’s degree': 4,
    'Master’s degree': 5,
    'Doctoral degree': 6,
    'Professional degree': 7,
}

education_answers = d1['highest_education']
d1['highest_education_ordinal'] = education_answers.map(highest_education_mapping)

In [None]:
## Machine Learning Experience
# Missing answers get their own explicit 'No answer' bucket
d1['ml_exp_years'] = d1['ml_exp_years'].fillna('No answer')

# Machine-learning experience levels in ascending order; the position is the ordinal code
ml_experience_levels = [
    'No answer',
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]
ml_exp_years_mapping = {level: code for code, level in enumerate(ml_experience_levels)}

d1['ml_exp_years_ordinal'] = d1['ml_exp_years'].map(ml_exp_years_mapping)

In [None]:
## Size of company
# Missing answers get their own explicit 'No answer' bucket
d1['size_of_company'] = d1['size_of_company'].fillna('No answer')

# Ordinal code per company-size band, smallest to largest
size_of_company_mapping = {
    'No answer': 0,
    '0-49 employees': 1,
    '50-249 employees': 2,
    '250-999 employees': 3,
    '1000-9,999 employees': 4,
    '10,000 or more employees': 5,
}

company_sizes = d1['size_of_company']
d1['size_of_company_ordinal'] = company_sizes.map(size_of_company_mapping)

In [None]:
## ML Spend
# Missing answers get their own explicit 'No answer' bucket
d1['ml_cc_spend'] = d1['ml_cc_spend'].fillna('No answer')

# Spend bands in ascending order; the position is the ordinal code
ml_spend_bands = [
    'No answer',
    '$0 ($USD)',
    '$1-$99',
    '$100-$999',
    '$1000-$9,999',
    '$10,000-$99,999',
    '$100,000 or more ($USD)',
]
ml_cc_spend_mapping = dict(zip(ml_spend_bands, range(len(ml_spend_bands))))

d1['ml_cc_spend_ordinal'] = d1['ml_cc_spend'].map(ml_cc_spend_mapping)

# Model - what leads to a high salary?

In [None]:
def evaluate(truth, pred):
    """Print classification diagnostics and return error metrics.

    Prints the confusion matrix and the classification report for the
    given labels, then computes the error metrics on the same labels.

    Args:
        truth: True class labels.
        pred: Predicted class labels, aligned with ``truth``.

    Returns:
        A ``(mae, mse, rmse)`` tuple: mean absolute error, mean squared
        error and the square root of the mean squared error.
    """
    print('Confusion Matrix')
    print(confusion_matrix(truth, pred))
    print('Classification Report')
    print(classification_report(truth, pred))
    # The accuracy value that used to be computed here was never used;
    # the classification report above already includes accuracy.
    mae  = mean_absolute_error(truth, pred)
    mse  = mean_squared_error(truth, pred)
    rmse = math.sqrt(mse)
    print()
    return (mae, mse, rmse)
# Empty results table; generate_results adds one row per technique
result_columns = [
    'Technique',
    'MAE',
    'MSE',
    'RMSE',
    'log_loss_metric',
    'Features',
    'TrainingTime',
]
resultsDF = pd.DataFrame(columns=result_columns)

In [None]:
# Generate results
def generate_results(technique,truth,pred,X_train,traintime,proba):
    """Evaluate one model run, print its metrics and record them in ``resultsDF``.

    Args:
        technique: Label for the model run; also used as the row key in
            ``resultsDF``.
        truth: True class labels for the evaluated rows.
        pred: Predicted class labels, aligned with ``truth``.
        X_train: Training feature matrix; only ``X_train.shape[1]`` is read,
            to record the feature count.
        traintime: Training duration to store alongside the metrics.
        proba: Predicted class probabilities, aligned with ``truth``.
    """
    # Score against the labels passed in by the caller rather than a global
    # ``y_test``, so the function works for any evaluation set.
    mae, mse, rmse = evaluate(truth, pred)
    log_loss_metric = log_loss(truth, proba)
    print('MAE  : {:.2f}'.format(mae))
    print('RMSE : {:.2f}'.format(rmse))
    print('Log loss : {:.2f}'.format(log_loss_metric))
    # Writing to .loc[technique] overwrites the row when a technique is re-run
    resultsDF.loc[technique] = [technique, mae, mse, rmse, log_loss_metric, X_train.shape[1],traintime]

In [None]:
# Text-valued features handled by the categorical pipeline
categorical_columns = [
    'gender',
    'country',
    'job_title',
    'recommended_language',
    'computing_platform',
    'use_of_tpu',
    'emp_in_ds_dept',
    'big_data_products',
    'bi_tools',
    'analysis_tools',
]

# No plain numerical features are used
numerical_columns = list()

# Columns that already hold the ordinal codes derived in the mapping cells
ord_columns = [
    'age_ordinal',
    'experience_ordinal',
    'highest_education_ordinal',
    'ml_exp_years_ordinal',
    'size_of_company_ordinal',
    'ml_cc_spend_ordinal',
]

In [None]:
# Categorical features: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen at fit time are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipe = Pipeline(steps=categorical_steps)

# Numerical features: median imputation only
numerical_steps = [
    #('scaler',  StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
]
numerical_pipe = Pipeline(steps=numerical_steps)

# Ordinal features: passed through an OrdinalEncoder
ordinal_steps = [('le', OrdinalEncoder())]
ordinal_pipe = Pipeline(steps=ordinal_steps)

# Route each column group to its pipeline; the output columns follow this order
column_transformers = [
    ('cat', categorical_pipe, categorical_columns),
    ('num', numerical_pipe, numerical_columns),
    ('le', ordinal_pipe, ord_columns),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Target: the binary high-salary indicator
y = d1['salary_high'].copy()

# Features: drop the salary-derived columns, then keep only the modelled columns
# in pipeline order (categorical, numerical, ordinal)
target_derived_columns = ['salary_ordinal', 'salary_high']
model_columns = categorical_columns + numerical_columns + ord_columns
X = d1.drop(columns=target_derived_columns).copy()[model_columns]

In [None]:
# Single-step pipeline around the column preprocessor
preprocessing_steps = [('preprocess', preprocessor)]
pipe = Pipeline(steps=preprocessing_steps)

# Fit the preprocessing on the feature frame and keep the encoded matrix
X_train_proc = pipe.fit_transform(X, y)

In [None]:
# Baseline Random Forest Model
# This cell originally used X_test_proc, y_test and X_train without defining
# them anywhere. Create a stratified hold-out split here so the cell runs on
# its own and the metrics are computed on rows the model was not fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the preprocessing on the training rows only, then apply it to both splits
X_train_proc = pipe.fit_transform(X_train, y_train)
X_test_proc = pipe.transform(X_test)

rf = RandomForestClassifier(n_estimators=200,
                               criterion='gini', # default is gini, entropy is Information Gain = Log Loss which leads to more balanced trees
                               min_samples_split=5,
                               min_samples_leaf=2,
                               max_features='sqrt',  # same as the old 'auto' for classifiers; 'auto' is rejected by newer scikit-learn
                               bootstrap=True, n_jobs=-1, random_state=42)
# Fit
startDefault = datetime.datetime.now()
rf.fit(X_train_proc, y_train)
endDefault = datetime.datetime.now()
defaultTime = (endDefault - startDefault).seconds
print(f'Default took {defaultTime} seconds')
# Predict
proba = rf.predict_proba(X_test_proc)
# Map the most probable column back to its class label
pred  = rf.classes_[np.argmax(proba, axis=1)]
# Log Results
generate_results('Baseline Random Forest',y_test,pred,X_train,defaultTime,proba)

In [None]:
# Recover the column names produced by the fitted preprocessor so each
# importance value can be labelled.
ohe = (pipe.named_steps['preprocess']
         .named_transformers_['cat']
         .named_steps['onehot'])
# Newer scikit-learn only provides get_feature_names_out; older releases
# only have get_feature_names. Support both.
if hasattr(ohe, 'get_feature_names_out'):
    feature_names = ohe.get_feature_names_out(categorical_columns)
else:
    feature_names = ohe.get_feature_names(input_features=categorical_columns)
# Same order as the ColumnTransformer output: one-hot, numerical, ordinal.
# Building from a flat list avoids mixing an empty float array with strings.
feature_names = np.array([*feature_names, *numerical_columns, *ord_columns])

tree_feature_importances = (
    rf.feature_importances_)
# Ascending sort, so the last ten indices are the ten most important features
sorted_idx = tree_feature_importances.argsort()

y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots(figsize=(10,6))
ax.barh(y_ticks[-10:], tree_feature_importances[sorted_idx][-10:])
# Fix the tick positions first, then label them, so labels and bars line up
ax.set_yticks(y_ticks[-10:])
ax.set_yticklabels(feature_names[sorted_idx][-10:])
#ax.set_title("Random Forest Feature Importances (MDI)")

fig.suptitle('Random Forest Feature Importance', fontsize=20)
plt.xlabel('Importance', fontsize=20)
plt.ylabel('Feature', fontsize=20)

# Set general font size
plt.rcParams['font.size'] = '16'

# Set tick font size
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(16)

plt.show()

# **So the most important factor that leads to a high salary is living in the USA!**

Followed by:
1. the money you (or your team) have spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)
2. the number of years you have used machine learning methods
3. your age
4. your experience


