In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [2]:
# Reading data
train= pd.read_csv('train_LZdllcl.csv')
test=pd.read_csv('test_2umaH9m.csv')

In [3]:
# lets check the shape of the train and test datasets
print("Shape of the Training Data :", train.shape)
print("Shape of the Test Data :", test.shape)

Shape of the Training Data : (54808, 14)
Shape of the Test Data : (23490, 13)


In [4]:
# columns in Training Data
train.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')

In [5]:
# columns in Testing Data
test.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score'],
      dtype='object')

In [6]:
# lets check the head of the dataset
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [7]:
# lets check the head of the test data
test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [8]:
train['is_promoted'].value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

In [9]:
train_total = train.isnull().sum()
train_percent = ((train.isnull().sum()/train.shape[0])*100).round(2)
test_total = test.isnull().sum()
test_percent = ((test.isnull().sum()/test.shape[0])*100).round(2)
train_missing_data = pd.concat([train_total, train_percent, test_total, test_percent],
                                axis=1, 
                                keys=['Train_Total', 'Train_Percent %','Test_Total', 'Test_Percent %'],
                                sort = True)
train_missing_data.style.background_gradient(cmap = 'copper')

Unnamed: 0,Train_Total,Train_Percent %,Test_Total,Test_Percent %
KPIs_met >80%,0,0.0,0.0,0.0
age,0,0.0,0.0,0.0
avg_training_score,0,0.0,0.0,0.0
awards_won?,0,0.0,0.0,0.0
department,0,0.0,0.0,0.0
education,2409,4.4,1034.0,4.4
employee_id,0,0.0,0.0,0.0
gender,0,0.0,0.0,0.0
is_promoted,0,0.0,,
length_of_service,0,0.0,0.0,0.0


# Treatment of Missing Values

In [10]:
# missing values in training data set

# lets calculate the total missing values in the dataset
train_total = train.isnull().sum()

# lets calculate the total missing values in the dataset
test_total = test.isnull().sum()


In [11]:
train_total

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [12]:
test_total

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [13]:
# lets impute the missing values in the Training Data

train['education'] = train['education'].fillna(train['education'].mode()[0])
train['previous_year_rating'] = train['previous_year_rating'].fillna(train['previous_year_rating'].mode()[0])

# lets check whether the Null values are still present or not?
print("Number of Missing Values Left in the Training Data :", train.isnull().sum().sum())

Number of Missing Values Left in the Training Data : 0


In [14]:
# lets impute the missing values in the Testing Data

test['education'] = test['education'].fillna(test['education'].mode()[0])
test['previous_year_rating'] = test['previous_year_rating'].fillna(test['previous_year_rating'].mode()[0])

# lets check whether the Null values are still present or not?
print("Number of Missing Values Left in the Training Data :", test.isnull().sum().sum())

Number of Missing Values Left in the Training Data : 0


In [15]:
train1= train.copy()
test1= test.copy()

# Feature Engineering

In [16]:
# lets create some extra features from existing features to improve our Model

# creating a Metric of Sum
train1['sum_metric'] = train1['awards_won?']+train1['KPIs_met >80%'] + train1['previous_year_rating']
test1['sum_metric'] = test1['awards_won?']+test1['KPIs_met >80%'] + test1['previous_year_rating']

# creating a total score column
train1['total_score'] = train1['avg_training_score'] * train1['no_of_trainings']
test1['total_score'] = test1['avg_training_score'] * test1['no_of_trainings']

In [17]:
# lets remove some of the columns which are not very useful for predicting the promotion.

# we already know that the recruitment channel is very least related to promotion of an employee, so lets remove this column
# even the region seems to contribute very less, when it comes to promotion, so lets remove it too.
# also the employee id is not useful so lets remove it.

train1 = train1.drop(['recruitment_channel', 'region', 'employee_id'], axis = 1)
test1 = test1.drop(['recruitment_channel', 'region', 'employee_id'], axis = 1)

# lets check the columns in train and test data set after feature engineering
train1.columns

Index(['department', 'education', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'KPIs_met >80%',
       'awards_won?', 'avg_training_score', 'is_promoted', 'sum_metric',
       'total_score'],
      dtype='object')

In [18]:
# lets encode the education in their degree of importance 
train1['education'] = train1['education'].replace(("Master's & above", "Bachelor's", "Below Secondary"),(3, 2, 1))
test1['education'] = test1['education'].replace(("Master's & above", "Bachelor's", "Below Secondary"),(3, 2, 1))

# lets use Label Encoding for Gender and Department to convert them into Numerical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train1['department'] = le.fit_transform(train1['department'])
test1['department'] = le.fit_transform(test1['department'])
train1['gender'] = le.fit_transform(train1['gender'])
test1['gender'] = le.fit_transform(test1['gender'])

# lets check whether we still have any categorical columns left after encoding
print(train1.select_dtypes('object').columns)
print(test1.select_dtypes('object').columns)

Index([], dtype='object')
Index([], dtype='object')


# Splitting the Data

In [19]:
# lets split the target data from the train data

y = train1['is_promoted']
x = train1.drop(['is_promoted'], axis = 1)
x_test = test1

# lets print the shapes of these newly formed data sets
print("Shape of the x :", x.shape)
print("Shape of the y :", y.shape)
print("Shape of the x Test :", x_test.shape)

Shape of the x : (54808, 12)
Shape of the y : (54808,)
Shape of the x Test : (23490, 12)


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_score,recall_score, f1_score
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.2, random_state = 0)


In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.transform(x_valid)
x_test = sc.transform(x_test)

In [22]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)

In [23]:

print('Accuracy Score is:', accuracy_score(y_valid, y_pred))
print('Recall Score is:', recall_score(y_valid, y_pred))
print('Precision Score:', precision_score(y_valid, y_pred))
print('F1 score is:', f1_score(y_valid, y_pred))

Accuracy Score is: 0.9066776135741653
Recall Score is: 0.44408251900108575
Precision Score: 0.44456521739130433
F1 score is: 0.4443237370994025


In [24]:
from sklearn.svm import SVC
svm=SVC(random_state=1)
svm.fit(x_train,y_train)
y_pred1 = model.predict(x_valid)

In [25]:
print('Accuracy Score is:', accuracy_score(y_valid, y_pred1))
print('Recall Score is:', recall_score(y_valid, y_pred1))
print('Precision Score:', precision_score(y_valid, y_pred1))
print('F1 score is:', f1_score(y_valid, y_pred1))

Accuracy Score is: 0.9066776135741653
Recall Score is: 0.44408251900108575
Precision Score: 0.44456521739130433
F1 score is: 0.4443237370994025


In [26]:
class_names=np.array(['0','1'])
print(classification_report(y_valid, y_pred1, target_names=class_names))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     10041
           1       0.44      0.44      0.44       921

    accuracy                           0.91     10962
   macro avg       0.70      0.70      0.70     10962
weighted avg       0.91      0.91      0.91     10962



In [27]:
from sklearn.linear_model import LogisticRegression
Logit_model= LogisticRegression()
Logit_model.fit(x_train, y_train)
y_pred2= Logit_model.predict(x_valid)
print('Accuracy Score is:', accuracy_score(y_valid, y_pred2))
print('Recall Score is:', recall_score(y_valid, y_pred2))
print('Precision Score:', precision_score(y_valid, y_pred2))
print('F1 score is:', f1_score(y_valid, y_pred2))

Accuracy Score is: 0.9191753329684365
Recall Score is: 0.08577633007600434
Precision Score: 0.6422764227642277
F1 score is: 0.15134099616858238


In [28]:
class_names=np.array(['0','1'])
print(classification_report(y_valid, y_pred2, target_names=class_names))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     10041
           1       0.64      0.09      0.15       921

    accuracy                           0.92     10962
   macro avg       0.78      0.54      0.55     10962
weighted avg       0.90      0.92      0.89     10962



In [29]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(x_train,y_train)

RandomForestClassifier(min_samples_split=10, n_estimators=700, n_jobs=-1,
                       oob_score=True, random_state=1)

In [30]:
y_pred3= rf.predict(x_valid)
print('Accurac.y Score is:', accuracy_score(y_valid, y_pred3))
print('Recall Score is:', recall_score(y_valid, y_pred3))
print('Precision Score:', precision_score(y_valid, y_pred3))
print('F1 score is:', f1_score(y_valid, y_pred3))

Accurac.y Score is: 0.940156905674147
Recall Score is: 0.32356134636264927
Precision Score: 0.9003021148036254
F1 score is: 0.476038338658147


In [31]:
# Applying Gaussian Naive Bayes Classifier

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_score,recall_score, f1_score
Classifier= GaussianNB()
Classifier.fit(x_train, y_train)
y_pred4= Classifier.predict(x_valid)
y_train_pred= Classifier.predict(x_valid)

print('Accurac.y Score is:', accuracy_score(y_valid, y_pred4))
print('Recall Score is:', recall_score(y_valid, y_pred4))
print('Precision Score:', precision_score(y_valid, y_pred4))
print('F1 score is:', f1_score(y_valid, y_pred4))

print("\n The confusion matrix is : \n", confusion_matrix(y_valid, y_pred4))

Accurac.y Score is: 0.9019339536580916
Recall Score is: 0.20846905537459284
Precision Score: 0.35687732342007433
F1 score is: 0.26319396847155585

 The confusion matrix is : 
 [[9695  346]
 [ 729  192]]


In [32]:
 # Applying Bernoulli Naive Bayes Classifier
    
from sklearn.naive_bayes import BernoulliNB
classifier= BernoulliNB()
classifier.fit(x_train, y_train)
y_pred5= classifier.predict(x_valid)
y_train_pred1= classifier.predict(x_valid)

print('Accurac.y Score is:', accuracy_score(y_valid, y_pred5))
print('Recall Score is:', recall_score(y_valid, y_pred5))
print('Precision Score:', precision_score(y_valid, y_pred5))
print('F1 score is:', f1_score(y_valid, y_pred5))

print("\n The confusion matrix is : \n", confusion_matrix(y_valid, y_pred5))

Accurac.y Score is: 0.8959131545338442
Recall Score is: 0.20304017372421282
Precision Score: 0.3148148148148148
F1 score is: 0.24686468646864687

 The confusion matrix is : 
 [[9634  407]
 [ 734  187]]


In [33]:
# Applying SVM Model
from sklearn import svm
from sklearn.svm import SVC

svm_linear = SVC(kernel= "linear")
svm_linear.fit(x_train,y_train)
y_pred6 = svm_linear.predict(x_valid)
y_train_pred2 = svm_linear.predict(x_train)

print('Accurac.y Score is:', accuracy_score(y_valid, y_pred6))
print('Recall Score is:', recall_score(y_valid, y_pred6))
print('Precision Score:', precision_score(y_valid, y_pred6))
print('F1 score is:', f1_score(y_valid, y_pred6))

print("\n The confusion matrix is : \n", confusion_matrix(y_valid, y_pred6))

Accurac.y Score is: 0.9159824849480022
Recall Score is: 0.0
Precision Score: 0.0
F1 score is: 0.0

 The confusion matrix is : 
 [[10041     0]
 [  921     0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# Applying KNN model

from sklearn.neighbors import KNeighborsClassifier
classifer = KNeighborsClassifier(n_neighbors=4,metric = "minkowski") 
classifer.fit(x_train,y_train)
y_pred7 = classifer.predict(x_valid)

y_train_pred3 = classifer.predict(x_train)

print('Accurac.y Score is:', accuracy_score(y_valid, y_pred7))
print('Recall Score is:', recall_score(y_valid, y_pred7))
print('Precision Score:', precision_score(y_valid, y_pred7))
print('F1 score is:', f1_score(y_valid, y_pred7))

print("\n The confusion matrix is : \n", confusion_matrix(y_valid, y_pred7))

Accurac.y Score is: 0.9273855135924102
Recall Score is: 0.1965255157437568
Precision Score: 0.7637130801687764
F1 score is: 0.3126079447322971

 The confusion matrix is : 
 [[9985   56]
 [ 740  181]]


In [35]:
y_pred3= rf.predict(x_test)
test['is_promoted']= y_pred3
test

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,3.0,1,1,0,77,0
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51,0
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47,0
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65,0
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23485,53478,Legal,region_2,Below Secondary,m,sourcing,1,24,3.0,1,0,0,61,0
23486,25600,Technology,region_25,Bachelor's,m,sourcing,1,31,3.0,7,0,0,74,0
23487,45409,HR,region_16,Bachelor's,f,sourcing,1,26,4.0,4,0,0,50,0
23488,1186,Procurement,region_31,Bachelor's,m,sourcing,3,27,3.0,1,0,0,70,0


In [36]:
output=pd.DataFrame(data={"employee_id":test["employee_id"],"Prediction":test['is_promoted']}) 
output.to_csv(path_or_buf="C:/Users/Wahu_Buzz/Downloads/results.csv",index=False,quoting=3,)