In [1]:
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

### Data reading part 

In [2]:
df = pd.read_csv("HR_Analytics_Train_Data.csv")
test = pd.read_csv("HR_Analytics_Test_Data.csv")

### Preprocessing

In [3]:
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
# Keeping only important columns
df = df[["employee_id", "department", "education", "no_of_trainings", "previous_year_rating", "length_of_service", "KPIs_met >80%", "awards_won?", "avg_training_score", "is_promoted"]]
df.head()

Unnamed: 0,employee_id,department,education,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,Master's & above,1,5.0,8,1,0,49,0
1,65141,Operations,Bachelor's,1,5.0,4,0,0,60,0
2,7513,Sales & Marketing,Bachelor's,1,3.0,7,0,0,50,0
3,2542,Sales & Marketing,Bachelor's,2,1.0,10,0,0,50,0
4,48945,Technology,Bachelor's,1,3.0,2,0,0,73,0


In [5]:
df.shape

(54808, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   education             52399 non-null  object 
 3   no_of_trainings       54808 non-null  int64  
 4   previous_year_rating  50684 non-null  float64
 5   length_of_service     54808 non-null  int64  
 6   KPIs_met >80%         54808 non-null  int64  
 7   awards_won?           54808 non-null  int64  
 8   avg_training_score    54808 non-null  int64  
 9   is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 4.2+ MB


In [7]:
df.isnull().sum()

employee_id                0
department                 0
education               2409
no_of_trainings            0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [8]:
# I assume that they are Freshers i.e length_of_service = 1
# Therefore I'll impute the null of previous_year_rating with "0"
df["previous_year_rating"]=df["previous_year_rating"].fillna(0)

In [9]:
# Also using ffill and bfill for education
# Bfill(backward-fill) propagates the first observed non-null value backward until another non-null value is met.
# While ffill(forward-fill) propagates the last observed non-null value forward until another non-null value is encountered.
df["education"]=df["education"].ffill(axis=0)
df["education"]=df["education"].bfill(axis=0)

In [10]:
df['education'].value_counts()

Bachelor's          38390
Master's & above    15579
Below Secondary       839
Name: education, dtype: int64

In [11]:
df['department'].value_counts()

Sales & Marketing    16840
Operations           11348
Technology            7138
Procurement           7138
Analytics             5352
Finance               2536
HR                    2418
Legal                 1039
R&D                    999
Name: department, dtype: int64

### Feature Engineering

In [12]:
#Label Encoding education Column
le_education = LabelEncoder()
df['education'] = le_education.fit_transform(df['education'])

In [13]:
#Label Encoding department Column
le_department = LabelEncoder()
df['department'] = le_department.fit_transform(df['department'])

In [14]:
df['department'].value_counts()

7    16840
4    11348
8     7138
5     7138
0     5352
1     2536
2     2418
3     1039
6      999
Name: department, dtype: int64

### Modeling

In [15]:
# Creating X and y df for training the model
X = df.drop("is_promoted", axis=1)
y = df["is_promoted"]

In [16]:
# Using DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.values)

DecisionTreeRegressor(random_state=0)

In [17]:
y_pred = dec_tree_reg.predict(X)

In [18]:
error = np.sqrt(mean_squared_error(y, y_pred))

In [19]:
error

0.0

In [20]:
f1_score(y, y_pred)

1.0

In [21]:
X

Unnamed: 0,employee_id,department,education,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,65438,7,2,1,5.0,8,1,0,49
1,65141,4,0,1,5.0,4,0,0,60
2,7513,7,0,1,3.0,7,0,0,50
3,2542,7,0,2,1.0,10,0,0,50
4,48945,8,0,1,3.0,2,0,0,73
...,...,...,...,...,...,...,...,...,...
54803,3030,8,0,1,3.0,17,0,0,78
54804,74592,4,2,1,2.0,6,0,0,56
54805,13918,0,0,1,5.0,3,1,0,79
54806,13614,7,0,1,1.0,2,0,0,45
