In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
import datawig
from datawig import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training set from the resources directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../resources/train.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [3]:
# View data information.
# `DataFrame.info()` writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in `print()` appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
enrollee_id               19158 non-null int64
city                      19158 non-null object
city_development_index    19158 non-null float64
gender                    14650 non-null object
relevent_experience       19158 non-null object
enrolled_university       18772 non-null object
education_level           18698 non-null object
major_discipline          16345 non-null object
experience                19093 non-null object
company_size              13220 non-null object
company_type              13018 non-null object
last_new_job              18735 non-null object
training_hours            19158 non-null int64
target                    19158 non-null float64
dtypes: float64(2), int64(2), object(10)
memory usage: 2.0+ MB
None


In [4]:
# Count the missing entries in every column.
df.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [5]:
# Discard every row that is missing a value in any of the columns listed below.
# Each of these columns has fewer than 500 missing values; once those rows are removed
# the columns are complete and can be used as inputs when imputing the other features.
df = df.dropna(
    axis='index',
    how='any',
    subset=['enrolled_university', 'education_level', 'experience', 'last_new_job'],
)

In [6]:
# Re-check the per-column count of missing values after the row drop.
df.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    3863
relevent_experience          0
enrolled_university          0
education_level              0
major_discipline          2222
experience                   0
company_size              5310
company_type              5476
last_new_job                 0
training_hours               0
target                       0
dtype: int64

In [7]:
# Take the columns that contain no missing values, remove `enrollee_id` and `target`,
# and collect the remaining column names in a list.
input_cols = (
    df.dropna(axis='columns')
    .columns
    .drop(['enrollee_id', 'target'])
    .tolist()
)
input_cols

['city',
 'city_development_index',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'experience',
 'last_new_job',
 'training_hours']

In [8]:
# Collect, in column order, the name of every column that still contains at least one
# missing value; these are the columns to be imputed.
output_cols = [column for column in df.columns if df[column].isna().any()]

In [9]:
# Work on an independent (deep) copy so that `df` itself is left untouched.
df_cleaned = df.copy(deep=True)

## Impute missing values with Datawig

In [10]:
# For each remaining feature that has missing values, train an imputer on the rows where
# the feature is present and use its predictions to fill the gaps in `df_cleaned`.
for feature in output_cols:
    # Restrict the data to the columns without missing values plus the column to impute.
    feature_included = df[input_cols + [feature]]

    # Rows without missing values are used for training.
    feature_df = feature_included.dropna(how='any')

    # Rows with a missing value are the ones that need an imputed value.
    imputation_df = feature_included[feature_included.isnull().any(axis=1)]

    # Compile the model.
    model = datawig.SimpleImputer(
        input_columns=input_cols,  # column(s) containing information about the column we want to impute
        output_column=feature,  # column we'd like to impute values for
    )

    # Train the model.
    model.fit(feature_df)

    # Make predictions using the trained model; the imputed values are read from the
    # automatically generated `<feature>_imputed` column.
    imputation_df = model.predict(imputation_df)
    imputed_values = imputation_df[f'{feature}_imputed']

    # Fill the gaps by index alignment (this relies on the predictions keeping the
    # original row indices). The result is assigned back to the column instead of calling
    # `fillna(..., inplace=True)` on `df_cleaned[feature]`: that chained call operates on
    # an intermediate Series and, under pandas Copy-on-Write, never updates `df_cleaned`.
    df_cleaned[feature] = df_cleaned[feature].fillna(imputed_values)

2023-06-07 23:07:27,561 [INFO]  
2023-06-07 23:07:31,734 [INFO]  Epoch[0] Batch [0-398]	Speed: 1533.10 samples/sec	cross-entropy=0.356133	gender-accuracy=0.903979
2023-06-07 23:07:35,896 [INFO]  Epoch[0] Train-cross-entropy=0.343402
2023-06-07 23:07:35,896 [INFO]  Epoch[0] Train-gender-accuracy=0.903737
2023-06-07 23:07:35,897 [INFO]  Epoch[0] Time cost=8.334
2023-06-07 23:07:35,899 [INFO]  Saved checkpoint to "gender\model-0000.params"
2023-06-07 23:07:36,581 [INFO]  Epoch[0] Validation-cross-entropy=0.348817
2023-06-07 23:07:36,581 [INFO]  Epoch[0] Validation-gender-accuracy=0.892556
2023-06-07 23:07:40,700 [INFO]  Epoch[1] Batch [0-398]	Speed: 1550.77 samples/sec	cross-entropy=0.328298	gender-accuracy=0.903979
2023-06-07 23:07:44,850 [INFO]  Epoch[1] Train-cross-entropy=0.324988
2023-06-07 23:07:44,851 [INFO]  Epoch[1] Train-gender-accuracy=0.903737
2023-06-07 23:07:44,851 [INFO]  Epoch[1] Time cost=8.269
2023-06-07 23:07:44,855 [INFO]  Saved checkpoint to "gender\model-0001.params"

In [11]:
# # The accuracy of imputations for each feature can be obtained by splitting `feature_df` into a training set and a testing set, 
# # fitting `SimpleImputer` on the training set, and making predictions for the testing set to compare with the target values.

# # Perform train-test split while ensuring that strata's relative proportions are represented.
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(feature_df[input_cols], 
#                                                     feature_df[feature], 
#                                                     random_state=42, 
#                                                     stratify=feature_df[feature])

# # Concatenate X and y since features and label (in this case, the column with missing values to be imputed) are combined for the
# # Datawig model.
# train = pd.concat([X_train, y_train], axis=1)
# test = pd.concat([X_test, y_test], axis=1)

# # Compile the model.
# model = datawig.SimpleImputer(
#     input_columns = input_cols, # column(s) containing information about the column we want to impute
#     output_column = feature # the column we'd like to impute values for
# )

# # Train the model.
# model.fit(train);

# # Make imputations using the trained model.
# imputations = model.predict(test)

# # Compute the accuracy score.
# from sklearn.metrics import accuracy_score
# accuracy_score(imputations[feature], imputations[f'{feature}_imputed'])

In [12]:
# Write the imputed DataFrame to a CSV file, leaving out the row index.
df_cleaned.to_csv(path_or_buf='../resources/imputed_loop.csv', index=False)