In [1]:
#import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.preprocessing import LabelEncoder,StandardScaler,Normalizer
from sklearn.metrics import f1_score,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer

import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the promotion training data from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./promotion/train.csv')
df.head(n=5)

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [9]:
# Positional 80/20 split of the loaded frame into a training and a held-out part.
# NOTE(review): rows are taken in file order with no shuffling - confirm that
# this ordering is intended for the hold-out set.

# Compute the boundary once so both partitions are guaranteed to meet exactly.
split_at = int(df.shape[0] * 0.8)

# Explicit copies: later cells add and overwrite columns on train_df, and doing
# that on a bare iloc slice of df is the pattern pandas flags with
# SettingWithCopyWarning (silenced in this notebook by the blanket warning filter).
train_df = df.iloc[:split_at].copy()
test_df = df.iloc[split_at:].copy()

In [10]:
#data cleaning - train_df
# Preview the first rows of the training partition before any cleaning step runs.
train_df.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,1982,2.5,2009,0,0,42,NIGER,Yes,Single,No,No,1,0
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,1990,7.5,2012,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0


In [11]:
# NOTE: this rebinds the name `datetime` from the class imported in the first
# cell (`from datetime import datetime`) to the module itself. `year_replace`
# calls `datetime.date.today()`, so it relies on this cell having been run.
import datetime

# Today's date; the last expression displays the current calendar year.
today = datetime.date.today()
today.year

2023

In [12]:
#change year of birth and Year of Recruitment to age

def year_replace(df, year=None):
    """Replace the two calendar-year columns with elapsed-years features.

    Adds ``Age`` (``year - Year_of_birth``) and ``Age_of_recruitment``
    (``year - Year_of_recruitment``), then removes ``Year_of_birth`` and
    ``Year_of_recruitment``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Year_of_birth`` and ``Year_of_recruitment``.
    year : int, optional
        Reference year to subtract from. Defaults to the current calendar
        year; pass a fixed value to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If either year column is missing (e.g. the function was already
        applied to this frame).
    """
    # Imported locally because the notebook binds the name `datetime` twice
    # (the class in the first cell, the module in a later one); this keeps
    # the function independent of which binding is currently live.
    from datetime import date

    if year is None:
        year = date.today().year

    # assign() builds a new frame, so the caller's frame never gains the
    # derived columns as a side effect.
    with_ages = df.assign(
        Age=year - df['Year_of_birth'],
        Age_of_recruitment=year - df['Year_of_recruitment'],
    )

    #drop year_of_birth and year_of_recruitment.
    return with_ages.drop(['Year_of_recruitment', 'Year_of_birth'], axis=1)

# Swap the year columns of the training frame for the derived age features.
train_df = train_df.pipe(year_replace)

In [14]:
# EmployeeNo is removed from the training frame.
train_df = train_df.drop(columns=['EmployeeNo'])

In [17]:
# Count the missing values in every column.
train_df.isna().sum()

Division                                  0
Qualification                          1359
Gender                                    0
Channel_of_Recruitment                    0
Trainings_Attended                        0
Last_performance_score                    0
Targets_met                               0
Previous_Award                            0
Training_score_average                    0
State_Of_Origin                           0
Foreign_schooled                          0
Marital_Status                            0
Past_Disciplinary_Action                  0
Previous_IntraDepartmental_Movement       0
No_of_previous_employers                  0
Promoted_or_Not                           0
Age                                       0
Age_of_recruitment                        0
dtype: int64

In [32]:
# Missing Qualification entries become their own 'No degree' category.
train_df['Qualification'] = train_df['Qualification'].fillna('No degree')

# Marital_Status keeps its Not_Sure level as it is.
# No_of_previous_employers: map the 'More than 5' label to '6', then store
# the whole column as integers.
train_df['No_of_previous_employers'] = (
    train_df['No_of_previous_employers']
    .replace('More than 5', '6')
    .astype(int)
)

In [25]:
# Preview train_df after the year columns were replaced by Age / Age_of_recruitment
# and EmployeeNo was dropped.
train_df.head()

Unnamed: 0,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Last_performance_score,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,Age,Age_of_recruitment
0,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,12.5,1,0,41,ANAMBRA,No,Married,No,No,0,0,37,12
1,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,12.5,0,0,52,ANAMBRA,Yes,Married,No,No,0,0,32,8
2,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,7.5,0,0,42,KATSINA,Yes,Married,No,No,0,0,36,11
3,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,2.5,0,0,42,NIGER,Yes,Single,No,No,1,0,41,14
4,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,7.5,0,0,77,AKWA IBOM,Yes,Married,No,No,1,0,33,11


In [29]:
# Separate the predictors from the Promoted_or_Not target.
X = train_df.drop(columns=['Promoted_or_Not'])
y = train_df['Promoted_or_Not']

# Hold out 20% of train_df for validation; the fixed seed keeps the split repeatable.
# (X_test / y_test here come from train_df, not from the earlier test_df partition.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
#encodings
# List each column's dtype to see which columns are still stored as `object`.
train_df.dtypes

Division                                object
Qualification                           object
Gender                                  object
Channel_of_Recruitment                  object
Trainings_Attended                       int64
Last_performance_score                 float64
Targets_met                              int64
Previous_Award                           int64
Training_score_average                   int64
State_Of_Origin                         object
Foreign_schooled                        object
Marital_Status                          object
Past_Disciplinary_Action                object
Previous_IntraDepartmental_Movement     object
No_of_previous_employers                 int64
Promoted_or_Not                          int64
Age                                      int64
Age_of_recruitment                       int64
dtype: object

In [None]:
# Label-encode the Division, Gender, Channel_of_Recruitment and Marital_Status columns.
from sklearn.preprocessing import LabelEncoder

label_cols = ['Division', 'Gender', 'Channel_of_Recruitment', 'Marital_Status']

# LabelEncoder works on a single 1-D column, and selecting several columns
# needs a list (a bare tuple key raises KeyError). Fit one encoder per column
# and keep each fitted encoder so the same mapping can be reused or inverted.
# NOTE(review): X / X_train were derived from train_df in an earlier cell, so
# they do not pick up this encoding - re-create them if encoded features are
# expected there.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()  #instantiate the Label Encoder
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le



#instantiate encoders
ce_base = ce.BaseNEncoder(cols=['State_Of_Origin'],base=5)