# Who will move to a new job?

In [None]:
import numpy as np
import pandas as pd

### train data

In [None]:
# Read aug_train.csv into `train_data`; the bare name on the last line makes
# the notebook render the frame.
path = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
train_data = pd.read_csv(path)
train_data

### test data

In [None]:
# Read aug_test.csv into `test_data` (`path` is reused from the previous cell).
path = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
test_data = pd.read_csv(path)
test_data

### sample_submission.csv

In [None]:
# Read sample_submission.csv for reference.
# NOTE(review): the variable name `gender_submission_data` does not describe
# this file, and the variable is not used by any other cell visible here.
path = '../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv'
gender_submission_data = pd.read_csv(path)
gender_submission_data

# Data overview, cleaning and preprocessing

### Concatenate train and test data for joint processing

In [None]:
# Stack train rows first, then test rows, under a fresh 0..n-1 index.
# Presumably the test rows carry no `target`, leaving NaN there — verify
# against the value_counts output of the next cell.
df = pd.concat([train_data, test_data], ignore_index=True)
df

In [None]:
# Recode the numeric label (0 / 1) as the strings 'no' / 'yes' so the column
# has object dtype; missing labels stay NaN.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the `df['target']` selection: under pandas Copy-on-Write an in-place change
# made on such a selection no longer reaches `df`.
df['target'] = df['target'].replace({0: 'no', 1: 'yes'}).astype('object')
df['target'].value_counts(dropna=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Draw the boolean missing-value mask of the whole frame as a heatmap
# (no colour bar) on a tall 20x30 figure.
plt.figure(figsize=(20, 30))
sns.heatmap(df.isnull(), cbar=False)

In [None]:
def NaN_info(df):
    global null_view
    try:
        null_view = df[[col for col in df.columns if df[col].isna().sum() > 0]].isna().sum().sort_values(ascending = True)
        null_view = pd.DataFrame(null_view, columns=['NANs'])
        null_view[['PERCENT']] = null_view.NANs.apply(lambda x: round((x/len(df))*100, 2))
        null_view[['TYPE']] = df.dtypes
    except:
        return null_view
    return null_view

# Show the missing-value summary for the combined frame.
NaN_info(df)

# Pre-Processing and Feature Engineering

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Print, under a banner per column, how often each value occurs (NaN included).
for el in df.columns:
    counts = df[el].value_counts(dropna=False)
    print(f'======================= {el} =======================')
    print(counts)

In [None]:
# Derive the integer `city_num` from `city` by taking the part after the
# first underscore. One vectorised string operation replaces the previous
# Python loop that wrote each row individually through `df.loc`.
df['city_num'] = df['city'].str.split('_').str[1].astype('int64')

In [None]:
# Turn `experience` into a float column: '<1' becomes 0.5 and '>20' becomes NaN.
# NOTE(review): because '>20' becomes NaN it is later imputed like any other
# missing value — confirm this is intended rather than mapping it above 20.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and
# the result is assigned back because replace(..., inplace=True) on the
# `df['experience']` selection does not update `df` under pandas Copy-on-Write.
df['experience'] = (
    df['experience']
    .replace({'>20': np.nan, '<1': 0.5})
    .astype('float64')
)

# Create feature with experience title

In [None]:
def change(x):
    """Map years of experience to a seniority label.

    Parameters
    ----------
    x : float
        Years of experience; may be NaN.

    Returns
    -------
    str or float
        'trainee' (< 1), 'junior' (< 2), 'middle' (< 5), 'senior' (< 8),
        'master' (< 15) or 'grandmaster' (>= 15); NaN when ``x`` is missing.
    """
    # Missing input is handled explicitly. The previous ``else: np.nan``
    # branch was a bare expression that did nothing; NaN only came back
    # because the untouched argument was returned.
    if pd.isna(x):
        return np.nan
    if x < 1:
        return 'trainee'
    if x < 2:
        return 'junior'
    if x < 5:
        return 'middle'
    if x < 8:
        return 'senior'
    if x < 15:
        return 'master'
    return 'grandmaster'

# Label every row's experience with `change`, then preview the first ten rows
# of the source column next to the new one.
experience_labels = df['experience'].apply(change)
df['experience_cat'] = experience_labels

df[['experience', 'experience_cat']].head(10)

In [None]:
# Map each company-size bucket to a number (the upper bound of the bucket;
# 100000 for '10000+'). Values absent from the mapping become NaN.
# The dict has its own name: previously it was called `change`, which
# overwrote the `change()` function and broke re-running the experience cell.
# NOTE(review): '10/49' is written with a slash — presumably matching the raw
# data; verify against the value_counts output.
company_size_map = {
    '<10': 10,
    '10/49': 50,
    '50-99': 100,
    '100-500': 500,
    '500-999': 1000,
    '1000-4999': 5000,
    '5000-9999': 10000,
    '10000+': 100000,
}
df['company_size_num'] = df['company_size'].map(company_size_map)
df[['company_size', 'company_size_num']].head(10)

In [None]:
# List the distinct raw values of `last_new_job` before mapping them to numbers.
df['last_new_job'].unique()

In [None]:
# Map `last_new_job` to integers: 'never' -> 0, '1'..'4' -> 1..4, '>4' -> 5.
# Values absent from the mapping become NaN.
# The dict has its own name so it does not overwrite the `change()` function
# defined earlier in the notebook.
last_new_job_map = {
    'never': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '>4': 5,
}
df['last_new_job_num'] = df['last_new_job'].map(last_new_job_map)
df[['last_new_job', 'last_new_job_num']].head(10)

In [None]:
# Show the frame with the engineered columns added.
df

# NaN prediction and imputation

In [None]:
def NaN_info(df):
    global null_view
    try:
        null_view = df[[col for col in df.columns if df[col].isna().sum() > 0]].isna().sum().sort_values(ascending = True)
        null_view = pd.DataFrame(null_view, columns=['NANs'])
        null_view[['PERCENT']] = null_view.NANs.apply(lambda x: round((x/len(df))*100, 2))
        null_view[['TYPE']] = df.dtypes
    except:
        return null_view
    return null_view

# Show the missing-value summary again, now including the engineered columns.
NaN_info(df)

In [None]:
def nan_predict(df,
                skip_features_from_prediction_where_percent_missing_data_more_than = 100,
                include_features_as_predictors_where_perc_missing_data_less_than = 50,
                apply_fast_predictor_where_missing_data_less_than_percent = 100,
                use_n_rows_for_train_not_more_than = 1000000000,    #  If your dataframe is large
                randomizedSearchCV_iter_plus_perc_missing_data = 10,
                n_estimators_parameter_for_LightGBM = 2000,
                target_feature = None,   # For prediction at the end
                ):
    """Fill the missing values of ``df`` in place with LightGBM predictions.

    Columns that contain NaN are processed from the fewest to the most
    missing values; ``target_feature``, when it is among them, is moved to the
    end.  For each such column a model is trained on the rows where the column
    is present, using the remaining columns as predictors, and its predictions
    replace the NaNs.  ``object`` / ``bool`` columns use ``LGBMClassifier``,
    ``int64`` / ``float64`` columns use ``LGBMRegressor``; columns of any
    other dtype are reported and left untouched.  A missing-value heatmap is
    drawn before the run and after each column, and progress is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.  It is modified in place (its index is temporarily
        reset and restored before returning).
    skip_features_from_prediction_where_percent_missing_data_more_than : float
        Columns with a larger share of NaN (in percent) are not imputed.
    include_features_as_predictors_where_perc_missing_data_less_than : float
        Columns whose current share of NaN exceeds this are not used as
        predictors.
    apply_fast_predictor_where_missing_data_less_than_percent : float
        Below this share of NaN a LightGBM model with default parameters is
        used; otherwise hyper-parameters are tuned with RandomizedSearchCV.
    use_n_rows_for_train_not_more_than : int
        Upper bound on the number of training rows (a random sample is drawn
        when more are available).
    randomizedSearchCV_iter_plus_perc_missing_data : int
        Base number of search iterations; the column's NaN percentage is
        added to it.
    n_estimators_parameter_for_LightGBM : int
        ``n_estimators`` value used in the tuning grid.
    target_feature : str, optional
        Column to impute last.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, imputed.
    """
    import random
    import pandas as pd
    import numpy as np

    # Disabling warnings
    import sys
    import warnings
    if not sys.warnoptions:
        warnings.simplefilter("ignore")

    from lightgbm import LGBMClassifier
    from lightgbm import LGBMRegressor
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.preprocessing import LabelEncoder

    import matplotlib.pyplot as plt
    import seaborn as sns
    # The IPython-only ``%matplotlib inline`` line magic that used to sit here
    # is not valid Python inside a function body; the notebook already
    # enables it in an earlier top-level cell.

    # Module-level names that the original implementation published; they are
    # still set so cells inspecting them keep working.
    global counter_all_predicted_values
    counter_all_predicted_values = 0

    global numeric_features
    numeric_features = []

    global best_params

    PARAMS  =  {'num_leaves': [12, 50, 120, 200, 300, 400, 500],   #np.arange(200, 600, step=100),
                'max_depth': [4, 8, 12, 16],
                'learning_rate': [0.001, 0.01, 0.1],
                'n_estimators': [n_estimators_parameter_for_LightGBM],
                'subsample': [0.1, 0.3, 0.5],
                'feature_fraction': [0.1, 0.3, 0.5],
                'bagging_fraction': [0.1, 0.3, 0.5],
                'bagging_seed': np.arange(1, 3, step=1),
                'lambda_l1': [0.2],
                'lambda_l2': [0.1],
                'min_child_samples': np.arange(2, 6, step=2),
                'min_split_gain': [0.0001, 0.001]
               }

    CV = ShuffleSplit(n_splits=2, test_size=0.25, random_state=0)

    def NaN_info(df):
        """Return NaN count, NaN percentage and dtype per column with NaNs."""
        global null_view
        nan_counts = df.isna().sum()
        nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=True)
        null_view = pd.DataFrame(nan_counts, columns=['NANs'])
        # Plain string keys: ``frame[['PERCENT']] = <Series>`` fails on recent
        # pandas and the old bare ``except`` hid that, dropping the column
        # this function's caller relies on.
        null_view['PERCENT'] = (null_view['NANs'] / len(df) * 100).round(2)
        null_view['TYPE'] = df.dtypes
        return null_view

    def numeric_features(df):
        """Return the names of the int64 / float64 columns (currently unused)."""
        num_features = [feature for feature in df.columns if df[feature].dtype in ['int64', 'float64']]
        return num_features

    def integer_features(df):
        """Return (and publish as ``int_features``) the int64 column names."""
        global int_features
        int_features = [feature for feature in df.columns if df[feature].dtype in ['int64']]
        return int_features

    def encoding(work_predictors, df):
        """Make the predictor columns of ``df`` numeric, in place."""
        feature_power = 0.5          # Skew handling
        for j in work_predictors:
            if df[j].dtype == 'object':
                # NaN becomes its own category.  Assigning through ``df[j]``
                # replaces the column, so it really ends up with an integer
                # dtype; ``df.loc[:, j] = ...`` writes into the existing
                # object column on recent pandas and LightGBM rejects that.
                df[j] = LabelEncoder().fit_transform(df[j].fillna('NoNoNo'))
            else:
                df[j] = df[j]**feature_power
        return df, work_predictors

    def hyperparms_tuning(CV, X_train, X_test, y_train, y_test, n_iter_for_RandomizedSearchCV, PARAMS, alg, scoring):
        """Tune ``alg`` with RandomizedSearchCV; return best params and test predictions."""
        global best_params
        global pred_test_lgb

        lgbm = alg(random_state = 0)
        lgbm_randomized = RandomizedSearchCV(estimator=lgbm,
                                            param_distributions=PARAMS,
                                            n_iter=n_iter_for_RandomizedSearchCV,
                                            scoring=scoring,
                                            cv=CV,
                                            verbose=0,
                                            n_jobs = -1)

        lgbm_randomized.fit(X_train, y_train)

        best_params = lgbm_randomized.best_params_
        pred_test_lgb = lgbm_randomized.predict(X_test)
        return best_params, pred_test_lgb

    def predict_regressor(best_params, X, y, miss_df):
        """Fit an LGBMRegressor on all known rows and predict the missing ones."""
        print('Best parameters:')
        print(best_params)
        print('')
        global pred_miss
        lgbm = LGBMRegressor(**best_params, n_jobs=-1, random_state=0)
        lgbm.fit(X, y)
        pred_miss = list(lgbm.predict(miss_df))
        print('-------------------------------')
        print(f"The first 100 predicted missing values: \n{pred_miss[:100]}")
        return pred_miss

    def predict_classifier(best_params, X, y, miss_df):
        """Fit an LGBMClassifier on all known rows and predict the missing ones."""
        print('Best parameters:')
        print(best_params)
        print('')
        global pred_miss
        lgbm = LGBMClassifier(**best_params, n_jobs=-1, random_state=0)
        lgbm.fit(X, y)
        pred_miss = list(lgbm.predict(miss_df))
        print('-------------------------------')
        print(f"The first 100 predicted missing values: \n{pred_miss[:100]}")
        return pred_miss

    def imput_missing_value_to_main_df(df, miss_indeces, pred_miss, el):
        """Write the predictions into column ``el`` at the given rows."""
        # One vectorised assignment instead of one ``.loc`` write per row.
        df.loc[miss_indeces, el] = pred_miss
        return df

    # Go)

    plt.figure(figsize=(20, 5))
    sns.heatmap(df.isnull(), cbar=False)

    null_view = NaN_info(df)
    print(null_view)
    print('\n\n\n')

    all_features = list(df.columns)
    df_indeces = list(df.index)      # restored just before returning
    df.reset_index(drop=True, inplace=True)

    int_columns = integer_features(df)

    delete_miss_features = list(
        (null_view.loc[null_view['PERCENT'] > skip_features_from_prediction_where_percent_missing_data_more_than]).index)
    print(f'Exclude from the prediction, because missing data more than \
    {skip_features_from_prediction_where_percent_missing_data_more_than}% :\n{delete_miss_features}')
    print('')
    all_miss_features = [feature for feature in null_view.index
                         if feature not in delete_miss_features]

    if target_feature in all_miss_features:  # moving target_feature to end of the prediction
        all_miss_features.remove(target_feature)
        all_miss_features.append(target_feature)

    for el in all_miss_features:
        print('\n\n\n\n')

        # select features as predictors (in the frame's column order, so the
        # feature order seen by LightGBM is the same on every run)
        null_view = NaN_info(df)
        lot_of_miss_features = list(
            (null_view.loc[null_view['PERCENT'] > include_features_as_predictors_where_perc_missing_data_less_than]).index)
        work_predictors = [feature for feature in all_features
                           if feature not in lot_of_miss_features and feature != el]

        # Encode the predictors ONCE for the whole frame and split afterwards.
        # Fitting separate label encoders on the training rows and on the
        # rows to impute gives the two parts different category codes.
        encoded = df[work_predictors].copy()
        encoding(work_predictors, df=encoded)

        # missing data (data for prediction)
        miss_indeces = list(df.index[df[el].isna()])
        miss_df = encoded.loc[miss_indeces]

        # data without NaN rows (X data for train, evaluation of model);
        # taken from the reset index, not from the caller's original labels
        work_indeces = list(df.index[df[el].notna()])
        if len(work_indeces) > use_n_rows_for_train_not_more_than:
            work_indeces = random.sample(work_indeces, use_n_rows_for_train_not_more_than)

        X = encoded.loc[work_indeces]
        y = df.loc[work_indeces, el]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

        # Info
        feature_type = df[el].dtypes
        percent_missing_data = null_view['PERCENT'][el]
        print(f'Feature: {el},   type: {feature_type},   missing values: {percent_missing_data}%\n')
        print(f'Shape for train dataframe: {(X.shape)}')
        print(f'Unused features as predictors, because missing data more than {include_features_as_predictors_where_perc_missing_data_less_than}% :')
        print(lot_of_miss_features)
        print('')

        # PREDICTIONS
        use_fast = percent_missing_data < apply_fast_predictor_where_missing_data_less_than_percent
        is_categorical = feature_type == 'object' or feature_type == 'bool'
        is_numeric = feature_type == 'float64' or feature_type == 'int64'

        if use_fast:
            # FAST Predictions without tuning hyperparameters
            # (the message now reports the configured threshold, not a fixed 1%)
            print('FAST prediction without tuning hyperparameters, because missing data less than '
                  f'{apply_fast_predictor_where_missing_data_less_than_percent}%\n')
            best_params = {}
        else:
            # ADVANCED Predictions with tuning hyperparameters
            n_iter_for_RandomizedSearchCV = int(randomizedSearchCV_iter_plus_perc_missing_data + percent_missing_data * 1)
            print(f'Iteration for RandomizedSearchCV: {n_iter_for_RandomizedSearchCV}\n')

        if is_categorical:
            print('FAST CLASSIFIER:' if use_fast else 'ADVANCED CLASSIFIER:')
            # Fit the label encoder once on all known labels so that the
            # train and test parts share the same codes (re-fitting it on
            # y_test could assign different codes to the same class).
            labelencoder = LabelEncoder().fit(y)
            y_train = labelencoder.transform(y_train)
            y_test = labelencoder.transform(y_test)
            if use_fast:
                lgbm = LGBMClassifier(n_jobs=-1, random_state=0)
                lgbm.fit(X_train, y_train)
                pred_test = lgbm.predict(X_test)
            else:
                best_params, pred_test = hyperparms_tuning(
                    CV, X_train, X_test, y_train, y_test, n_iter_for_RandomizedSearchCV,
                    PARAMS, alg=LGBMClassifier, scoring='f1_weighted')
            accuracy = accuracy_score(y_test, pred_test)
            print('Evaluations:')
            print(f'first 10 y_test: {y_test[:10]}')
            print(f'first 10 y_pred: {pred_test[:10]}\n')
            f1 = f1_score(y_test, pred_test, average='weighted')
            print(f'accuracy_score:      {accuracy}')
            print(f'f1_score (weighted): {f1}')

            pred_miss = predict_classifier(best_params, X, y, miss_df)

        elif is_numeric:
            print('FAST REGRESSOR:' if use_fast else 'ADVANCED REGRESSOR:')
            if use_fast:
                lgbm = LGBMRegressor(n_jobs=-1, random_state=0)
                lgbm.fit(X_train, y_train)
                pred_test = lgbm.predict(X_test)
            else:
                best_params, pred_test = hyperparms_tuning(
                    CV, X_train, X_test, y_train, y_test, n_iter_for_RandomizedSearchCV,
                    PARAMS, alg=LGBMRegressor, scoring='neg_mean_squared_error')
            MAE = mean_absolute_error(y_test, pred_test)
            y_te = list(round(y_test[:10], 1))
            y_pred = list(np.round(pred_test[:10], 1))
            print('Evaluations:')
            print(f'first 10 y_test: {y_te}')
            print(f'first 10 y_pred: {y_pred}\n')
            print(f'mean_absolute_error: {MAE}')
            print(f'mean for {el}: {df[el].mean()}')

            pred_miss = predict_regressor(best_params, X, y, miss_df)

        if is_categorical or is_numeric:
            counter_all_predicted_values += len(miss_indeces)
            imput_missing_value_to_main_df(df, miss_indeces, pred_miss, el)
        else:
            print(f"unprocessed feature: {el} - {feature_type} type")

        plt.figure(figsize=(20, 5))
        sns.heatmap(df.isnull(), cbar=False)

    # Columns that were int64 on entry are cast back to int64.
    for feature in int_columns:
        df[feature] = df[feature].astype('int64')

    df.index = df_indeces

    print('\n\n\n')
    print(f'These features have not been processed, because missing data more than {skip_features_from_prediction_where_percent_missing_data_more_than}%')
    print(NaN_info(df))
    print('\n\n\n')
    print(f'{counter_all_predicted_values} values have been predicted and replaced')
    print('\n')

    return df

In [None]:
# Impute every column with NaNs. The return value is only displayed, not
# assigned: later cells rely on `nan_predict` having modified `df` in place.
nan_predict(df,
            target_feature = 'target')     # For prediction at the end


In [None]:
# Columns that are cast to whole numbers (int64).
int_features = ['training_hours', 'city_num', 'company_size_num',
                'last_new_job_num', 'experience']
# Cast them all in one assignment.
df[int_features] = df[int_features].astype('int64')

# EDA

In [None]:
# Every column except the target, in the frame's own column order.
# (Building the list through `set` made the order depend on string hashing,
# so it could differ from one session to the next.)
target_column = ['target']
predictors = [col for col in df.columns if col not in target_column]

In [None]:
sns.set(font_scale=1.5)

# One pairplot per predictor, coloured by target, with the legend moved to the
# top centre of the figure.
for el in predictors:
    plot_data = df[['target', el]]
    try:
        g = sns.pairplot(plot_data, hue='target', palette='Set1', height=10, aspect=2)

        handles = g._legend_data.values()
        labels = g._legend_data.keys()
        g.fig.legend(handles=handles, labels=labels, loc='upper center', ncol=1)
    except Exception as exc:
        # Plotting stays best-effort, but a skipped feature is now reported
        # instead of being swallowed by a bare `except: pass`.
        print(f'pairplot skipped for {el}: {exc!r}')

In [None]:
# List the column names available for the plots below.
df.columns

In [None]:
# Row counts per `company_size` value, split by target.
plot_data = df[['company_size', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='company_size', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per `major_discipline` value, split by target (20x20 figure).
plot_data = df[['major_discipline', 'target']]
plt.figure(figsize=(20,20))
sns.countplot(x='major_discipline', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per `education_level` value, split by target.
plot_data = df[['education_level', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='education_level', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per `enrolled_university` value, split by target.
plot_data = df[['enrolled_university', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='enrolled_university', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per `relevent_experience` value, split by target.
plot_data = df[['relevent_experience', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='relevent_experience', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per `last_new_job` value, split by target.
plot_data = df[['last_new_job', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='last_new_job', hue='target', data=plot_data, palette='Set1')

In [None]:
# Row counts per engineered `experience_cat` label, split by target.
plot_data = df[['experience_cat', 'target']]
plt.figure(figsize=(20,10))
sns.countplot(x='experience_cat', hue='target', data=plot_data, palette='Set1')

# Permutation Importance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Work on a copy so the encoding below leaves `df` untouched.
df_permutation = df.copy()

target = ['target']
# Predictors in the frame's own column order. Going through `set` made the
# feature order handed to the models depend on string hashing, so it could
# change between sessions.
predictors = [col for col in df.columns if col not in target]


def encoding(df, columns):
    """Make the given columns of ``df`` numeric, in place.

    ``object`` columns are label-encoded, each with its own ``LabelEncoder``;
    every other column is raised to the power 0.5.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; it is modified in place.
    columns : iterable of str
        Names of the columns to transform.

    Returns
    -------
    tuple
        ``(df, columns)`` — the same objects that were passed in.
    """
    feature_power = 0.5          # Skew handling
    for j in columns:
        if df[j].dtype == 'object':
            # Assign through ``df[j]`` so the column is replaced and takes
            # the integer dtype of the codes.  ``df.loc[:, j] = ...`` writes
            # into the existing column on recent pandas, which leaves it as
            # ``object`` and makes the tree models downstream reject it.
            df[j] = LabelEncoder().fit_transform(df[j])
        else:
            df[j] = df[j] ** feature_power
    return df, columns

# Encode every column of the copy, the target included.
encoding(df_permutation, df_permutation.columns)


# `target` is a list, so `y` is a one-column DataFrame rather than a Series.
X = df_permutation[predictors]
y = df_permutation[target]
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Permutation function

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

def permutation(X_train, X_test, y_train, y_test, alg):
    """Fit ``alg`` on the training split and show its permutation importances.

    ``alg`` is an estimator class; it is instantiated with ``n_jobs=-1`` and
    ``random_state=0``.  Importances are computed on the test split and
    returned as the eli5 weights display, labelled with the test columns.
    """
    estimator = alg(n_jobs=-1, random_state=0)
    fitted = estimator.fit(X_train, y_train)

    importance = PermutationImportance(fitted, random_state=0)
    importance = importance.fit(X_test, y_test)

    column_names = X_test.columns.tolist()
    return eli5.show_weights(importance, feature_names=column_names)

### Permutation Importance LGBMClassifier

In [None]:
from lightgbm import LGBMClassifier
# Permutation importance with an LGBMClassifier.
permutation(X_train, X_test, y_train, y_test, LGBMClassifier)

### Permutation Importance LGBMRegressor

In [None]:
from lightgbm import LGBMRegressor
# Permutation importance with an LGBMRegressor fitted on the encoded target.
permutation(X_train, X_test, y_train, y_test, LGBMRegressor)

### Permutation Importance RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Permutation importance with a RandomForestClassifier.
permutation(X_train, X_test, y_train, y_test, RandomForestClassifier)

### Permutation Importance RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Permutation importance with a RandomForestRegressor fitted on the encoded target.
permutation(X_train, X_test, y_train, y_test, RandomForestRegressor)

In [None]:
# List the column names again before the partial-dependence plots.
df.columns

# Partial Plots

In [None]:
from matplotlib import pyplot as plt
from pdpbox import pdp
from lightgbm import LGBMClassifier




# Fit the classifier whose partial dependence is plotted below.
model = LGBMClassifier(random_state=0)
model.fit(X_train, y_train)

# One isolated partial-dependence plot per training feature, computed on the
# test split.
for feature in X_train.columns:
    pdp_dist = pdp.pdp_isolate(
        model=model,
        dataset=X_test,
        model_features=X_test.columns,
        feature=feature,
    )
    pdp.pdp_plot(pdp_dist, feature)
    plt.show()

In [None]:
# Two-feature partial-dependence (contour) plot on the encoded frame.
# NOTE(review): 'target' is requested as one of the two features, but
# `model_features` is X_test.columns, which was built from `predictors`
# (every column except 'target') — confirm that plotting dependence on the
# label itself is intended.
features_to_plot = ['target', 'city_development_index']
inter1  =  pdp.pdp_interact(model=model, 
                            dataset=df_permutation, 
                            model_features=X_test.columns, 
                            features=features_to_plot)

pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')
plt.show()

# SHAP

In [None]:
import shap
from sklearn.ensemble import RandomForestClassifier



# Random forest explained by the force plots below.
my_model = RandomForestClassifier(n_estimators=30, random_state=1)
my_model.fit(X_train, y_train)


def shap_force_plot(X_test, model, row):
    """Return the SHAP force plot for row ``row`` of ``X_test``.

    NOTE(review): the ``model`` argument is not used — the explainer is built
    from the module-level ``my_model``.  Behaviour is kept as is because the
    existing calls pass a different model; confirm which one should be
    explained.
    """
    sample = X_test.iloc[row, :]
    tree_explainer = shap.TreeExplainer(my_model)
    contributions = tree_explainer.shap_values(sample)
    shap.initjs()
    # Index 0 selects the first entry of both the expected values and the
    # SHAP values.
    base_value = tree_explainer.expected_value[0]
    return shap.force_plot(base_value, contributions[0], sample)

In [None]:
# Explain test row 0. `my_model` is passed because it is the model
# `shap_force_plot` builds its explainer from; the previously passed `model`
# was a different estimator and was ignored.
shap_force_plot(X_test, my_model, 0)

In [None]:
# Explain test row 100. `my_model` is passed because it is the model
# `shap_force_plot` builds its explainer from; the previously passed `model`
# was a different estimator and was ignored.
shap_force_plot(X_test, my_model, 100)

In [None]:
# Explain test row 1000. `my_model` is passed because it is the model
# `shap_force_plot` builds its explainer from; the previously passed `model`
# was a different estimator and was ignored.
shap_force_plot(X_test, my_model, 1000)

In [None]:
# Explain test row 2000. `my_model` is passed because it is the model
# `shap_force_plot` builds its explainer from; the previously passed `model`
# was a different estimator and was ignored.
shap_force_plot(X_test, my_model, 2000)

# Summary Plot

In [None]:
from xgboost import XGBClassifier


# Fit an XGBoost classifier; this rebinds the notebook-level `model` name
# that previously held the LightGBM classifier.
model = XGBClassifier(random_state=0).fit(X_train, y_train)

# SHAP values for the whole test split, shown as a summary plot.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test)

# Dependence Contribution Plot

In [None]:
from sklearn.ensemble import RandomForestClassifier



# Refit the random forest and build a tree explainer for it.
my_model = RandomForestClassifier(n_estimators=30, random_state=1).fit(X_train, y_train)

explainer = shap.TreeExplainer(my_model)
# First 100 rows of predictors with the target appended as an extra column,
# so that "target" can be named as the interaction (colour) feature below.
# NOTE(review): `my_model` was fitted on X_train, which has no target column,
# yet SHAP values are computed on a frame that includes it — confirm the
# explainer handles the extra column as intended.
data_1 = pd.concat([X, y], axis=1)
data_1 = data_1.iloc[0:100,:]
shap_values = explainer.shap_values(data_1)
shap.dependence_plot('city_development_index', shap_values[1], data_1, interaction_index="target")

In [None]:
# Dependence plot for `city`, coloured by target (SHAP values are recomputed
# on the same `data_1`).
shap_values = explainer.shap_values(data_1)
shap.dependence_plot('city', shap_values[1], data_1, interaction_index="target")

In [None]:
# Dependence plot for `experience_cat`, coloured by target.
shap_values = explainer.shap_values(data_1)
shap.dependence_plot('experience_cat', shap_values[1], data_1, interaction_index="target")

In [None]:
# Dependence plot for `experience`, coloured by target.
shap_values = explainer.shap_values(data_1)
shap.dependence_plot('experience', shap_values[1], data_1, interaction_index="target")

In [None]:
# Dependence plot for `company_size`, coloured by target.
shap_values = explainer.shap_values(data_1)
shap.dependence_plot('company_size', shap_values[1], data_1, interaction_index="target")

# Submission

In [None]:
# Turn the 'no' / 'yes' labels back into 0.0 / 1.0 and store them as float64.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the `df['target']` selection, which does not update `df` under pandas
# Copy-on-Write.
df['target'] = df['target'].replace({'no': 0.0, 'yes': 1.0}).astype('float64')
df['target'].value_counts(dropna=False)


In [None]:
# The test rows were appended after the train rows, so they start at position
# len(train_data). Deriving the offset from `train_data` replaces the
# hard-coded row number 19158, which is only right for one particular file.
result = df['target'].iloc[len(train_data):]
predictions = np.array(result)
predictions

In [None]:
# Pair each test `enrollee_id` with its imputed target value.
submission = pd.DataFrame({'enrollee_id':test_data['enrollee_id'],'target':predictions})
submission

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)
print('Finish')