In [1]:
# Import dependencies
import pandas as pd
from pprint import pprint

# HR ANALYSIS

In [2]:
# Load data
path = '../resources/train.csv'

hr_df = pd.read_csv(path)

In [3]:
# View data
hr_df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


### Features

1. `enrollee_id :` Unique ID for candidate

2. `city :` City code

3. `city_development_index :` Developement index of the city (scaled)

4. `gender :` Gender of candidate

5. `relevent_experience :` Relevant experience of candidate

6. `enrolled_university :` Type of University course enrolled if any

7. `education_level :` Education level of candidate

8. `major_discipline :` Education major discipline of candidate

9. `experience :` Candidate total experience in years

10. `company_size :` No of employees in current employer's company

11. `company_type :` Type of current employer

12. `last_new_job :` Difference in years between previous job and current job

13. `training_hours :` training hours completed

14. `target :` 0 – Not looking for job change, 1 – Looking for a job change

In [4]:
# View data information
print(hr_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
enrollee_id               19158 non-null int64
city                      19158 non-null object
city_development_index    19158 non-null float64
gender                    14650 non-null object
relevent_experience       19158 non-null object
enrolled_university       18772 non-null object
education_level           18698 non-null object
major_discipline          16345 non-null object
experience                19093 non-null object
company_size              13220 non-null object
company_type              13018 non-null object
last_new_job              18735 non-null object
training_hours            19158 non-null int64
target                    19158 non-null float64
dtypes: float64(2), int64(2), object(10)
memory usage: 2.0+ MB
None


In [5]:
# Print null value counts for each column
print(hr_df.isnull().sum())
print(hr_df.shape)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64
(19158, 14)


In [6]:
# Drop rows with all nan values
dropped_nan_rows = hr_df.dropna(how='all')

In [7]:
# View dropped rows
print(dropped_nan_rows.isnull().sum())
print(dropped_nan_rows.shape)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64
(19158, 14)


In [8]:
# Drop rows with nan values from selected columns
dropped_rows = hr_df.dropna(subset=['enrolled_university', 
                                     'education_level',
                                     'experience',
                                     'last_new_job'
                                    ])

In [9]:
# View dropped rows
print(dropped_rows.isnull().sum())
print(dropped_rows.shape)

enrollee_id                  0
city                         0
city_development_index       0
gender                    3863
relevent_experience          0
enrolled_university          0
education_level              0
major_discipline          2222
experience                   0
company_size              5310
company_type              5476
last_new_job                 0
training_hours               0
target                       0
dtype: int64
(18014, 14)


In [10]:
# Create a copy of the dropped_rows
dropped_df = dropped_rows.copy()

## Impute missing values with Datawig
* https://github.com/awslabs/datawig
* https://towardsdatascience.com/imputation-of-missing-data-in-tables-with-datawig-2d7ab327ece2

In [11]:
# Import dependencies
import numpy as np
import datawig
from datawig.utils import random_split
from datawig import SimpleImputer

# Perform train-test split (Default is 80/20 split)
df_train, df_test = random_split(dropped_df, split_ratios=[0.8, 0.2])

# Randomly hide 25% of cells in test dataframe
hide_proportion = 0.25
df_test_missing = df_test.mask(np.random.rand(*df_test.shape) > (1 - hide_proportion))

In [12]:
# Define columns with useful info for to-be-imputed column
input_cols = ['city',
              'city_development_index',
              'relevent_experience',
              'enrolled_university', 
              'education_level', 
              'experience',
              'last_new_job',
              'training_hours'
             ]

# Define columns to be imputed
output_col_gen = 'gender'
output_col_major = 'major_discipline'
output_col_comp_size = 'company_size'
output_col_comp_type = 'company_type'

# Initialize a SimpleImputer model for categorical imputation
imputer_gen = datawig.SimpleImputer(
    input_columns = input_cols, # column(s) containing information about the column we want to impute
    output_column = output_col_gen, # the column we'd like to impute values for
    output_path =  'imputer_gen' # stores model data and metrics
    )

imputer_major = datawig.SimpleImputer(
    input_columns = input_cols,
    output_column = output_col_major,
    output_path =  'imputer_major'
    )

imputer_comp_size = datawig.SimpleImputer(
    input_columns = input_cols, 
    output_column = output_col_comp_size, 
    output_path =  'imputer_comp_size'
    )

imputer_comp_type = datawig.SimpleImputer(
    input_columns = input_cols, 
    output_column = output_col_comp_type, 
    output_path =  'imputer_comp_type' 
    )

In [13]:
#Fit an imputer model on the train data
imputer_gen.fit(train_df=df_train, num_epochs=50)
imputer_major.fit(train_df=df_train, num_epochs=50)
imputer_comp_size.fit(train_df=df_train, num_epochs=50)
imputer_comp_type.fit(train_df=df_train, num_epochs=50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[15:13:50] src/operator/random/../../common/utils.h:450: 
Storage type fallback detected:
operator = Concat
input storage types = [csr, default, ]
output storage types = [default, ]
params = {"num_args" : 2, "dim" : 1, }
context.dev_mask = cpu
2023-06-07 15:13:50,357 [INFO]  
2023-06-07 15:13:52,513 [INFO]  Epoch[0] Batch [0-320]	Speed: 2389.24 samples/sec	cross-entropy=0.351402	gender-accuracy=0.909073
2023-06-07 15:13:54,726 [INFO]  Epoch[0] Train-cross-entropy=0.346246
2023-06-07 15:13:54,727 [INFO]  Epoch[0] Train-gender-accuracy=0.903854
2023-06-07 15:13:54,727 [INFO]  Epoch[0] Time cost=4.367
2023-06-07 15:13:54,734 [INFO]  Saved checkpoint to "imputer_gen/model-0000.params"
2023-06-07 15:13:55,147 [

2023-06-07 15:14:54,794 [INFO]  Epoch[12] Train-gender-accuracy=0.903951
2023-06-07 15:14:54,794 [INFO]  Epoch[12] Time cost=4.549
2023-06-07 15:14:54,800 [INFO]  Saved checkpoint to "imputer_gen/model-0012.params"
2023-06-07 15:14:55,201 [INFO]  Epoch[12] Validation-cross-entropy=0.335364
2023-06-07 15:14:55,202 [INFO]  Epoch[12] Validation-gender-accuracy=0.894643
2023-06-07 15:14:57,447 [INFO]  Epoch[13] Batch [0-320]	Speed: 2288.51 samples/sec	cross-entropy=0.305563	gender-accuracy=0.909073
2023-06-07 15:14:59,652 [INFO]  Epoch[13] Train-cross-entropy=0.312286
2023-06-07 15:14:59,653 [INFO]  Epoch[13] Train-gender-accuracy=0.903854
2023-06-07 15:14:59,653 [INFO]  Epoch[13] Time cost=4.451
2023-06-07 15:14:59,659 [INFO]  Saved checkpoint to "imputer_gen/model-0013.params"
2023-06-07 15:15:00,069 [INFO]  Epoch[13] Validation-cross-entropy=0.335387
2023-06-07 15:15:00,070 [INFO]  Epoch[13] Validation-gender-accuracy=0.894643
2023-06-07 15:15:02,372 [INFO]  Epoch[14] Batch [0-320]	Spee

<datawig.simple_imputer.SimpleImputer at 0x7fadea6f5f90>

## Test set

In [14]:
# Impute missing values and return original dataframe with predictions
pred_gen_test = imputer_gen.predict(df_test_missing)
pred_major_test = imputer_major.predict(df_test_missing)
pred_comp_size_test = imputer_comp_size.predict(df_test_missing)
pred_comp_type_test = imputer_comp_type.predict(df_test_missing)

In [15]:
pred_gen_test.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target', 'gender_imputed',
       'gender_imputed_proba'],
      dtype='object')

### 'gender' predictions

In [16]:
# Drop unneccessary columns
pred_gen_test_df = pred_gen_test.drop(columns=['city', 
                                               'city_development_index',
                                               'relevent_experience', 
                                               'enrolled_university', 
                                               'education_level',
                                               'major_discipline', 
                                               'experience', 
                                               'company_size', 
                                               'company_type',
                                               'last_new_job', 
                                               'training_hours', 
                                               'target'
                                              ])

In [17]:
acc_gen_test = pred_gen_test_df.loc[pred_gen_test_df['gender_imputed_proba'] < 0.5]

In [18]:
len(acc_gen_test)

2

In [19]:
acc_gen_test

Unnamed: 0,enrollee_id,gender,gender_imputed,gender_imputed_proba
3043,15026.0,,Male,0.496757
18475,29661.0,Male,Female,0.479182


In [20]:
# 'gender' accuracy
gender_test_acc = pred_gen_test_df['gender_imputed_proba'].mean()
print(f"Accuracy result: {gender_test_acc}")

Accuracy result: 0.879860559522874


### 'major_discipline' predictions

In [21]:
# Drop unneccessary columns
pred_major_test_df = pred_major_test.drop(columns=['city', 
                                                   'city_development_index',
                                                   'gender',
                                                   'relevent_experience', 
                                                   'enrolled_university', 
                                                   'education_level',
                                                   'experience', 
                                                   'company_size', 
                                                   'company_type',
                                                   'last_new_job', 
                                                   'training_hours', 
                                                   'target'
                                                  ])

In [22]:
# 'major_discipline' accuracy
major_test_acc = pred_major_test_df['major_discipline_imputed_proba'].mean()
print(f"Accuracy result: {major_test_acc}")

Accuracy result: 0.8930370881698451


### 'company_size' predictions

In [23]:
# Drop unneccessary columns
pred_comp_size_test_df = pred_comp_size_test.drop(columns=['city', 
                                                           'city_development_index',
                                                           'gender',
                                                           'relevent_experience', 
                                                           'enrolled_university', 
                                                           'education_level',
                                                           'major_discipline', 
                                                           'experience',  
                                                           'company_type',
                                                           'last_new_job', 
                                                           'training_hours', 
                                                           'target'
                                                          ])

In [24]:
# 'company_size' accuracy
comp_size_test_acc = pred_comp_size_test_df['company_size_imputed_proba'].mean()
print(f"Accuracy result: {comp_size_test_acc}")

Accuracy result: 0.20031740645082177


### 'company_type' predictions

In [25]:
# Drop unneccessary columns
pred_comp_type_test_df = pred_comp_type_test.drop(columns=['city', 
                                                           'city_development_index',
                                                           'gender',
                                                           'relevent_experience', 
                                                           'enrolled_university', 
                                                           'education_level',
                                                           'major_discipline', 
                                                           'experience', 
                                                           'company_size', 
                                                           'last_new_job', 
                                                           'training_hours', 
                                                           'target'
                                                          ])

In [26]:
# 'company_type' accuracy
comp_type_test_acc = pred_comp_type_test_df['company_type_imputed_proba'].mean()
print(f"Accuracy result: {comp_type_test_acc}")

Accuracy result: 0.7402715356829165


## Make predictions on 'dropped_df'

In [27]:
pred_gen = imputer_gen.predict(dropped_df)
pred_major = imputer_major.predict(dropped_df)
pred_comp_size = imputer_comp_size.predict(dropped_df)
pred_comp_type = imputer_comp_type.predict(dropped_df)

### 'gender' predictions

In [28]:
pred_gen_df = pred_gen.drop(columns=['city', 
                                     'city_development_index',
                                     'relevent_experience', 
                                     'enrolled_university', 
                                     'education_level',
                                     'major_discipline', 
                                     'experience', 
                                     'company_size', 
                                     'company_type',
                                     'last_new_job', 
                                     'training_hours', 
                                     'target'
                                    ])

In [29]:
# 'gender' accuracy
gender_acc = pred_gen_df['gender_imputed_proba'].mean()
print(f"Accuracy result: {gender_acc}")

Accuracy result: 0.8919209057219725


### 'major_discipline' predictions

In [30]:
pred_major_df = pred_major.drop(columns=['city', 
                                         'city_development_index',
                                         'gender',
                                         'relevent_experience', 
                                         'enrolled_university', 
                                         'education_level', 
                                         'experience', 
                                         'company_size', 
                                         'company_type',
                                         'last_new_job', 
                                         'training_hours', 
                                         'target'
                                        ])

In [31]:
# 'major_discipline' accuracy
major_acc = pred_major_df['major_discipline_imputed_proba'].mean()
print(f"Accuracy result: {major_acc}")

Accuracy result: 0.9035371218145098


### 'company_size' predictions

In [32]:
pred_comp_size_df = pred_comp_size.drop(columns=['city', 
                                                 'city_development_index',
                                                 'gender',
                                                 'relevent_experience', 
                                                 'enrolled_university', 
                                                 'education_level',
                                                 'major_discipline', 
                                                 'experience',  
                                                 'company_type',
                                                 'last_new_job', 
                                                 'training_hours', 
                                                 'target'
                                                ])

In [33]:
# 'company_size' accuracy
comp_size_acc = pred_comp_size_df['company_size_imputed_proba'].mean()
print(f"Accuracy result: {comp_size_acc}")

Accuracy result: 0.20323455377716698


### 'company_type' predictions

In [34]:
pred_comp_type_df = pred_comp_type.drop(columns=['city', 
                                                 'city_development_index',
                                                 'gender',
                                                 'relevent_experience', 
                                                 'enrolled_university', 
                                                 'education_level',
                                                 'major_discipline', 
                                                 'experience', 
                                                 'company_size', 
                                                 'last_new_job', 
                                                 'training_hours', 
                                                 'target'
                                                ])

In [35]:
# 'company_type' accuracy
comp_type_acc = pred_comp_type_df['company_type_imputed_proba'].mean()
print(f"Accuracy result: {comp_type_acc}")

Accuracy result: 0.7580764058332564


In [36]:
len(pred_comp_type_df)

18014

In [37]:
df1 = pred_comp_type_df.loc[pred_comp_type_df['company_type_imputed_proba'] < 0.8]
df1

Unnamed: 0,enrollee_id,company_type,company_type_imputed,company_type_imputed_proba
0,8949,,Pvt Ltd,0.726640
2,11561,,Pvt Ltd,0.775364
6,28806,Funded Startup,Pvt Ltd,0.744974
8,27107,Pvt Ltd,Pvt Ltd,0.677420
10,29452,,Pvt Ltd,0.759327
...,...,...,...,...
19151,11385,,Pvt Ltd,0.385932
19152,29754,Funded Startup,Pvt Ltd,0.678418
19153,7386,,Pvt Ltd,0.728412
19154,31398,,Pvt Ltd,0.786314


In [38]:
# Merge 'dropped_df' with predicted dataframes pred_gen, pred_major, pred_comp_size, pred_comp_type 
merge_df = dropped_df.merge(pred_gen_df, on='enrollee_id')
merge_2_df = merge_df.merge(pred_major_df, on='enrollee_id')
merge_3_df = merge_2_df.merge(pred_comp_size_df, on='enrollee_id')
merge_4_df = merge_3_df.merge(pred_comp_type_df, on='enrollee_id')

In [39]:
merge_4_df.columns.tolist()

['enrollee_id',
 'city',
 'city_development_index',
 'gender_x',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline_x',
 'experience',
 'company_size_x',
 'company_type_x',
 'last_new_job',
 'training_hours',
 'target',
 'gender_y',
 'gender_imputed',
 'gender_imputed_proba',
 'major_discipline_y',
 'major_discipline_imputed',
 'major_discipline_imputed_proba',
 'company_size_y',
 'company_size_imputed',
 'company_size_imputed_proba',
 'company_type_y',
 'company_type_imputed',
 'company_type_imputed_proba']

In [40]:
dropped_merge_4_df = merge_4_df.drop(columns=['gender_y',
                                              'gender_imputed_proba',
                                              'major_discipline_y',
                                              'major_discipline_imputed_proba',
                                              'company_size_y',
                                              'company_size_imputed_proba',
                                              'company_type_y',
                                              'company_type_imputed_proba'
                                             ])

In [41]:
dropped_merge_4_df.columns.tolist()

['enrollee_id',
 'city',
 'city_development_index',
 'gender_x',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline_x',
 'experience',
 'company_size_x',
 'company_type_x',
 'last_new_job',
 'training_hours',
 'target',
 'gender_imputed',
 'major_discipline_imputed',
 'company_size_imputed',
 'company_type_imputed']

In [42]:
rename_cols = {'gender_x' : 'gender',
               'major_discipline_x' : 'major_discipline',
               'company_size_x' : 'company_size',
               'company_type_x' : 'company_type'
              }

renamed_merge_4_df = dropped_merge_4_df.rename(columns=rename_cols)

In [43]:
renamed_merge_4_df.columns.tolist()

['enrollee_id',
 'city',
 'city_development_index',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type',
 'last_new_job',
 'training_hours',
 'target',
 'gender_imputed',
 'major_discipline_imputed',
 'company_size_imputed',
 'company_type_imputed']

In [44]:
print(renamed_merge_4_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18014 entries, 0 to 18013
Data columns (total 18 columns):
enrollee_id                 18014 non-null int64
city                        18014 non-null object
city_development_index      18014 non-null float64
gender                      14151 non-null object
relevent_experience         18014 non-null object
enrolled_university         18014 non-null object
education_level             18014 non-null object
major_discipline            15792 non-null object
experience                  18014 non-null object
company_size                12704 non-null object
company_type                12538 non-null object
last_new_job                18014 non-null object
training_hours              18014 non-null int64
target                      18014 non-null float64
gender_imputed              18014 non-null object
major_discipline_imputed    18014 non-null object
company_size_imputed        18014 non-null object
company_type_imputed        18014 non-nul

In [45]:
# Fill nan values with predicted from imputed columns
renamed_merge_4_df['gender'] = renamed_merge_4_df['gender'].fillna(renamed_merge_4_df['gender_imputed'])
renamed_merge_4_df['major_discipline'] = renamed_merge_4_df['major_discipline'].fillna(renamed_merge_4_df['major_discipline_imputed'])
renamed_merge_4_df['company_size'] = renamed_merge_4_df['company_size'].fillna(renamed_merge_4_df['company_size_imputed'])
renamed_merge_4_df['company_type'] = renamed_merge_4_df['company_type'].fillna(renamed_merge_4_df['company_type_imputed'])

In [46]:
# Check nan values are filled
print(renamed_merge_4_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18014 entries, 0 to 18013
Data columns (total 18 columns):
enrollee_id                 18014 non-null int64
city                        18014 non-null object
city_development_index      18014 non-null float64
gender                      18014 non-null object
relevent_experience         18014 non-null object
enrolled_university         18014 non-null object
education_level             18014 non-null object
major_discipline            18014 non-null object
experience                  18014 non-null object
company_size                18014 non-null object
company_type                18014 non-null object
last_new_job                18014 non-null object
training_hours              18014 non-null int64
target                      18014 non-null float64
gender_imputed              18014 non-null object
major_discipline_imputed    18014 non-null object
company_size_imputed        18014 non-null object
company_type_imputed        18014 non-nul

In [47]:
# Drop imputed columns
cleaned_df = renamed_merge_4_df.drop(columns=[
    'gender_imputed',
    'major_discipline_imputed',
    'company_size_imputed',
    'company_type_imputed'
])

In [48]:
# Check columns have been dropped
cleaned_df.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [49]:
print(cleaned_df.isnull().sum())

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64


In [None]:
# Export the cleaned_df as a CSV file
# cleaned_df.to_csv('../resources/imputed.csv', index=False)