In [1]:
import pandas as pd
X = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')

#dropping any row without target
X.dropna(axis=0, subset=['Promoted_or_Not'], inplace=True)


print(f"Training data contains {X.shape[0]} employees and {X.shape[1]} features.")
X.head()

Training data contains 38312 employees and 19 features.


Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [2]:
#getting the columns with missing values
# missing_values = pd.concat([X.isnull().sum(), X_test.isnull().sum()], keys=['X', 'X_test'], axis=1, sort=True)
# print(missing_values)
X.isnull().sum()

EmployeeNo                                0
Division                                  0
Qualification                          1679
Gender                                    0
Channel_of_Recruitment                    0
Trainings_Attended                        0
Year_of_birth                             0
Last_performance_score                    0
Year_of_recruitment                       0
Targets_met                               0
Previous_Award                            0
Training_score_average                    0
State_Of_Origin                           0
Foreign_schooled                          0
Marital_Status                            0
Past_Disciplinary_Action                  0
Previous_IntraDepartmental_Movement       0
No_of_previous_employers                  0
Promoted_or_Not                           0
dtype: int64

In [3]:
#dropping 'EmployeeNo' column, since it is of no importance to our training set
X.drop('EmployeeNo', axis=1, inplace=True)

index_no = X_test['EmployeeNo']
X_test.drop('EmployeeNo', axis=1, inplace=True)

In [4]:
X['Qualification'].value_counts()

First Degree or HND         25578
MSc, MBA and PhD            10469
Non-University Education      586
Name: Qualification, dtype: int64

In [5]:
map_dict = {'Qualification': {
    'Non-University Education': 1, 
    'First Degree or HND': 2,
    'MSc, MBA and PhD': 3
}}

X = X.replace(map_dict)
X_test = X_test.replace(map_dict)
# Filling in the missing values with the most frequent value.
# X['Qualification'].fillna('First Degree or HND', inplace=True)
# X_test['Qualification'].fillna('First Degree or HND', inplace=True)

In [6]:
X_test['State_Of_Origin'].unique()

array(['FCT', 'OGUN', 'KANO', 'RIVERS', 'BENUE', 'LAGOS', 'OYO', 'BORNO',
       'KATSINA', 'CROSS RIVER', 'KADUNA', 'DELTA', 'IMO', 'ONDO', 'OSUN',
       'TARABA', 'ANAMBRA', 'NASSARAWA', 'SOKOTO', 'ENUGU', 'EDO', 'ABIA',
       'BAUCHI', 'KEBBI', 'ZAMFARA', 'KWARA', 'NIGER', 'KOGI', 'GOMBE',
       'ADAMAWA', 'PLATEAU', 'EKITI', 'BAYELSA', 'YOBE', 'AKWA IBOM',
       'JIGAWA', 'EBONYI'], dtype=object)

In [7]:
#Grouping the states into geopolitical zones, excluding Lagos and FCT
zones = dict(
    SOUTH_WEST = ['OYO', 'ONDO', 
                  'EKITI', 'OGUN', 'OSUN'],
    SOUTH_EAST = ['ANAMBRA', 'IMO', 
                  'ENUGU', 'ABIA', 'EBONYI'],
    NORTH_WEST = ['JIGAWA', 'KANO', 'KATSINA', 
                  'KADUNA', 'KEBBI', 'ZAMFARA', 'SOKOTO'],
    NORTH_EAST = ['GOMBE', 'BAUCHI', 'YOBE', 
                  'BORNO', 'ADAMAWA', 'TARABA'],
    NORTH_CENTRAL = ['NIGER', 'BENUE', 'NASSARAWA', 
                     'PLATEAU', 'KOGI', 'KWARA'],
    SOUTH_SOUTH = ['AKWA IBOM', 'EDO', 'RIVERS', 
                   'CROSS RIVER', 'DELTA', 'BAYELSA']
)

X.loc[X.State_Of_Origin.isin(zones['SOUTH_WEST']), 'State_Of_Origin'] = 'SOUTH_WEST'
X.loc[X.State_Of_Origin.isin(zones['SOUTH_EAST']), 'State_Of_Origin'] = 'SOUTH_EAST'
X.loc[X.State_Of_Origin.isin(zones['SOUTH_SOUTH']), 'State_Of_Origin'] = 'SOUTH_SOUTH'
X.loc[X.State_Of_Origin.isin(zones['NORTH_WEST']), 'State_Of_Origin'] = 'NORTH_WEST'
X.loc[X.State_Of_Origin.isin(zones['NORTH_EAST']), 'State_Of_Origin'] = 'NORTH_EAST'
X.loc[X.State_Of_Origin.isin(zones['NORTH_CENTRAL']), 'State_Of_Origin'] = 'NORTH_CENTRAL'


In [8]:
X['State_Of_Origin'].value_counts()

SOUTH_SOUTH      6567
LAGOS            6204
SOUTH_WEST       6069
SOUTH_EAST       5305
NORTH_WEST       5107
NORTH_CENTRAL    4114
NORTH_EAST       2557
FCT              2389
Name: State_Of_Origin, dtype: int64

In [9]:
pd.crosstab(index=X['State_Of_Origin'], columns=X['Promoted_or_Not'])

Promoted_or_Not,0,1
State_Of_Origin,Unnamed: 1_level_1,Unnamed: 2_level_1
FCT,2202,187
LAGOS,5670,534
NORTH_CENTRAL,3794,320
NORTH_EAST,2334,223
NORTH_WEST,4674,433
SOUTH_EAST,4847,458
SOUTH_SOUTH,6014,553
SOUTH_WEST,5536,533


In [10]:
X.dtypes

Division                                object
Qualification                          float64
Gender                                  object
Channel_of_Recruitment                  object
Trainings_Attended                       int64
Year_of_birth                            int64
Last_performance_score                 float64
Year_of_recruitment                      int64
Targets_met                              int64
Previous_Award                           int64
Training_score_average                   int64
State_Of_Origin                         object
Foreign_schooled                        object
Marital_Status                          object
Past_Disciplinary_Action                object
Previous_IntraDepartmental_Movement     object
No_of_previous_employers                object
Promoted_or_Not                          int64
dtype: object

In [11]:

#gathering the columns with categorical data
object_cols = [col for col in X.columns if X[col].dtype == 'object']

#getting the number of unique entries in each categorical column
num_of_uniques = list(map(lambda col: X[col].nunique(), object_cols))
d = dict(zip(object_cols, num_of_uniques))
sorted(d.items(), key = lambda x: x[1])


[('Gender', 2),
 ('Foreign_schooled', 2),
 ('Past_Disciplinary_Action', 2),
 ('Previous_IntraDepartmental_Movement', 2),
 ('Channel_of_Recruitment', 3),
 ('Marital_Status', 3),
 ('No_of_previous_employers', 7),
 ('State_Of_Origin', 8),
 ('Division', 9)]

In [12]:
from sklearn.preprocessing import OneHotEncoder

#using OneHotEncoder to transform the categorical data into numerics
my_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_X = pd.DataFrame(my_encoder.fit_transform(X[object_cols]))
OH_X_test = pd.DataFrame(my_encoder.transform(X_test[object_cols]))

#OH_encoder removes indexes, putting them back
OH_X.index, OH_X_test.index = X.index, X_test.index

#removinig categorical columns (will replace with onehot encoding)
num_X = X.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

#adding onehot encoded columns to numerical features
X = pd.concat([OH_X, num_X], axis=1)
X_test = pd.concat([OH_X_test, num_X_test], axis=1)

X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,Qualification,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2,1976,7.5,2017,0,0,65
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,2,1991,0.0,2018,0,0,69
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,2,1984,7.5,2012,0,0,76
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2,1984,2.5,2009,0,0,52
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2,1983,7.5,2014,1,0,69


In [13]:
X['Promoted_or_Not'].value_counts()

0    35071
1     3241
Name: Promoted_or_Not, dtype: int64

In [14]:
X.groupby('Promoted_or_Not').mean()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,36,37,Qualification,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average
Promoted_or_Not,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.046762,0.309486,0.207265,0.12603,0.096176,0.045821,0.019731,0.018619,0.130107,0.29694,...,0.024864,0.01055,2.266565,2.258191,1986.167517,7.56273,2013.123777,0.320379,0.013943,54.641584
1,0.045048,0.259488,0.217217,0.164147,0.107374,0.029929,0.01265,0.013268,0.150879,0.314101,...,0.021907,0.009565,2.303894,2.204875,1986.661833,9.173095,2013.311941,0.705955,0.122802,63.210429


In [15]:
#separating target from predictors
y = X['Promoted_or_Not']
X.drop('Promoted_or_Not', axis=1, inplace=True)

In [16]:
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder

# #preprocessing the numerical values
# numerical_transformer = SimpleImputer(strategy='median')

# #preprocessing the categorical values
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# #bundling the numerical and categorical values together for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# #bundling the preprocessor and model
# clf = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', model)
# ])

In [17]:
from xgboost import XGBClassifier

#declaring the model
model = XGBClassifier(n_estimators=1000, learning_rate=0.07, n_jobs=2)

In [18]:
# from sklearn.feature_selection import RFE

# rfe = RFE(model, 15)
# rfe.fit(X, y)

# print(rfe.support_)
# print(rfe.ranking_)

In [19]:
X_test.columns

Index([                       0,                        1,
                              2,                        3,
                              4,                        5,
                              6,                        7,
                              8,                        9,
                             10,                       11,
                             12,                       13,
                             14,                       15,
                             16,                       17,
                             18,                       19,
                             20,                       21,
                             22,                       23,
                             24,                       25,
                             26,                       27,
                             28,                       29,
                             30,                       31,
                             32,                       3

In [20]:
features = [0, 1 ,2, 3, 5, 6, 7, 8, 9, 'Trainings_Attended',
            'Last_performance_score', 'Targets_met', 'Previous_Award', 'Training_score_average']

In [21]:
from sklearn.model_selection import train_test_split

#using train_test_split to split the training data for model validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

X_train = X_train[features]
X_valid = X_valid[features]
# X_test = X_test[features]

In [22]:
from sklearn.metrics import accuracy_score

#fitting the model and using it to predict X_test
model.fit(X_train, y_train,
         early_stopping_rounds=5,
         eval_set=[(X_valid, y_valid)],
         verbose=False)
prediction = model.predict(X_valid)
print("Model accuracy using f1_score: ", accuracy_score(prediction, y_valid))

Model accuracy using f1_score:  0.9257470964374266


In [23]:
model.fit(X, y)
preds_test = model.predict(X_test)

In [24]:
output = pd.DataFrame({
    'EmployeeNo': index_no,
    'Promoted_or_Not': preds_test
})
output.to_csv('submission_local.csv', index=False)
output.head()

Unnamed: 0,EmployeeNo,Promoted_or_Not
0,YAK/S/00005,0
1,YAK/S/00011,0
2,YAK/S/00015,0
3,YAK/S/00016,0
4,YAK/S/00017,0
