In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import sklearn

In [2]:
# Reading the datasets
train = pd.read_csv('Train_HCM_data.csv')
test = pd.read_csv('Test_HCM_data.csv')
train

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54803,3030,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,74592,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,13918,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,13614,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0


In [3]:
# Lets check the value count of is_promoted column (Target variable)
train['is_promoted'].value_counts()


0    50140
1     4668
Name: is_promoted, dtype: int64

In [4]:
# Lets check the shape of the train and test datasets
print("Shape of the Training Data :", train.shape)
print("Shape of the Test Data :", test.shape)

Shape of the Training Data : (54808, 14)
Shape of the Test Data : (23490, 13)


In [5]:
# Lets check the null values in train dataset
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [6]:
# Missing values in training/testing data set

# Lets calculate the total missing values in the train dataset
train_total = train.isnull().sum()
# Lets calculate the percentage of missing values in the train dataset
train_percent = ((train.isnull().sum()/train.shape[0])*100).round(2)

# Lets calculate the total missing values in the test dataset
test_total = test.isnull().sum()
# Lets calculate the percentage of missing values in the test dataset
test_percent = ((test.isnull().sum()/test.shape[0])*100).round(2)

# Lets make a dataset consisting of total no. of missing values and percentage of missing values in the dataset
train_missing_data = pd.concat([train_total, train_percent, test_total, test_percent],
                                axis=1, 
                                keys=['Train_Total', 'Train_Percent %','Test_Total', 'Test_Percent %'],
                                sort = True)

# Lets check the details
train_missing_data.style.background_gradient(cmap = 'copper')


Unnamed: 0,Train_Total,Train_Percent %,Test_Total,Test_Percent %
KPIs_met >80%,0,0.0,0.0,0.0
age,0,0.0,0.0,0.0
avg_training_score,0,0.0,0.0,0.0
awards_won?,0,0.0,0.0,0.0
department,0,0.0,0.0,0.0
education,2409,4.4,1034.0,4.4
employee_id,0,0.0,0.0,0.0
gender,0,0.0,0.0,0.0
is_promoted,0,0.0,,
length_of_service,0,0.0,0.0,0.0


We can see from the above table, that Only two columns have missing values in Train and Test Dataset both. Also, the Percentage of Missing values is around 4 and 7% in education, and previous_year_rating respectively. So, do not have delete any missing values, we can simply impute the values using Mean, Median, and Mode Values.


In [7]:
# Lets impute the missing values in the Training Data
train['education'] = train['education'].fillna(train['education'].mode()[0])
train['previous_year_rating'] = train['previous_year_rating'].fillna(train['previous_year_rating'].mode()[0])

# Lets check whether the Null values are still present or not?
print("Number of Missing Values Left in the Training Data :", train.isnull().sum().sum())

Number of Missing Values Left in the Training Data : 0


In [8]:
# Lets impute the missing values in the Testing Data
test['education'] = test['education'].fillna(test['education'].mode()[0])
test['previous_year_rating'] = test['previous_year_rating'].fillna(test['previous_year_rating'].mode()[0])

# Lets check whether the Null values are still present or not?
print("Number of Missing Values Left in the Testing Data :", test.isnull().sum().sum())

Number of Missing Values Left in the Testing Data : 0


We imputed the missing values, using the Mode values, even for the previous year rating, it only seems to be numerical, but in real it's also categorical. After, Imputing the missing values in the training and testing data set we can see that there are no Null Values left in any of the datasets. So, we are Done with the Treatment of the Missing Values.


In [9]:
train1 = train.copy()
test1 = test.copy()

In [10]:
# creating a Metric of Sum
train1['sum_metric'] = train1['awards_won?']+ train1['KPIs_met >80%'] + train1['previous_year_rating']
test1['sum_metric'] = test1['awards_won?']+test1['KPIs_met >80%'] + test1['previous_year_rating']

# creating a total score column
train1['total_score'] = train1['avg_training_score'] * train1['no_of_trainings']
test1['total_score'] = test1['avg_training_score'] * test1['no_of_trainings']

train1 = train1.drop(['recruitment_channel', 'region', 'employee_id'], axis = 1)
test1 = test1.drop(['recruitment_channel', 'region', 'employee_id'], axis = 1)

In [11]:
# lets encode the education in their degree of importance 
train1['education'] = train1['education'].replace(("Master's & above", "Bachelor's", "Below Secondary"),(3, 2, 1))
test1['education'] = test1['education'].replace(("Master's & above", "Bachelor's", "Below Secondary"),(3, 2, 1))

# lets use Label Encoding for Gender and Department to convert them into Numerical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train1['department'] = le.fit_transform(train1['department'])
test1['department'] = le.fit_transform(test1['department'])
train1['gender'] = le.fit_transform(train1['gender'])
test1['gender'] = le.fit_transform(test1['gender'])

# lets check whether we still have any categorical columns left after encoding
print(train1.select_dtypes('object').columns)
print(test1.select_dtypes('object').columns)

Index([], dtype='object')
Index([], dtype='object')


In [12]:
# lets split the target data from the train data

y = train1['is_promoted']
x = train1.drop(['is_promoted'], axis = 1)
x_test = test1

# lets print the shapes of these newly formed data sets
print("Shape of the x :", x.shape)
print("Shape of the y :", y.shape)
print("Shape of the x Test :", x_test.shape)

Shape of the x : (54808, 12)
Shape of the y : (54808,)
Shape of the x Test : (23490, 12)


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.2, random_state = 0)


In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.transform(x_valid)
x_test = sc.transform(x_test)

In [15]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)

In [16]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_score,recall_score, f1_score

In [17]:
print('Accuracy Score is:', accuracy_score(y_valid, y_pred))
print('Recall Score is:', recall_score(y_valid, y_pred))
print('Precision Score:', precision_score(y_valid, y_pred))
print('F1 score is:', f1_score(y_valid, y_pred))

Accuracy Score is: 0.9083196496989601
Recall Score is: 0.44408251900108575
Precision Score: 0.4534368070953437
F1 score is: 0.44871091607240815


In [18]:
from sklearn.svm import SVC
svm=SVC(random_state=1)
svm.fit(x_train,y_train)
y_pred1 = model.predict(x_valid)

In [19]:
print('Accuracy Score is:', accuracy_score(y_valid, y_pred1))
print('Recall Score is:', recall_score(y_valid, y_pred1))
print('Precision Score:', precision_score(y_valid, y_pred1))
print('F1 score is:', f1_score(y_valid, y_pred1))

Accuracy Score is: 0.9083196496989601
Recall Score is: 0.44408251900108575
Precision Score: 0.4534368070953437
F1 score is: 0.44871091607240815


In [20]:
class_names=np.array(['0','1'])
print(classification_report(y_valid, y_pred1, target_names=class_names))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     10041
           1       0.45      0.44      0.45       921

    accuracy                           0.91     10962
   macro avg       0.70      0.70      0.70     10962
weighted avg       0.91      0.91      0.91     10962



In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(x_train,y_train)

RandomForestClassifier(min_samples_split=10, n_estimators=700, n_jobs=-1,
                       oob_score=True, random_state=1)

In [28]:
y_pred3= rf.predict(x_valid)
print('Accuracy Score is:', accuracy_score(y_valid, y_pred3))
print('Recall Score is:', recall_score(y_valid, y_pred3))
print('Precision Score:', precision_score(y_valid, y_pred3))
print('F1 score is:', f1_score(y_valid, y_pred3))

Accuracy Score is: 0.940156905674147
Recall Score is: 0.32356134636264927
Precision Score: 0.9003021148036254
F1 score is: 0.476038338658147


In [29]:
y_pred3= rf.predict(x_test)

In [30]:
test['is_promoted']= y_pred3

In [31]:
test


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,3.0,1,1,0,77,0
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51,0
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47,0
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65,0
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23485,53478,Legal,region_2,Below Secondary,m,sourcing,1,24,3.0,1,0,0,61,0
23486,25600,Technology,region_25,Bachelor's,m,sourcing,1,31,3.0,7,0,0,74,0
23487,45409,HR,region_16,Bachelor's,f,sourcing,1,26,4.0,4,0,0,50,0
23488,1186,Procurement,region_31,Bachelor's,m,sourcing,3,27,3.0,1,0,0,70,0


In [34]:
output=pd.DataFrame(data={"employee_id":test["employee_id"],"is_promoted":test['is_promoted']}) 
output.to_csv(path_or_buf="C:/Users/radhi/Data Science and Analytics/Competition HR Analytics/results.csv",index=False,quoting=3,)