In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../datasets/Extended_Employee_Performance_and_Productivity_Data.csv')

In [3]:
df.columns

Index(['Employee_ID', 'Department', 'Gender', 'Age', 'Job_Title', 'Hire_Date',
       'Years_At_Company', 'Education_Level', 'Performance_Score',
       'Monthly_Salary', 'Work_Hours_Per_Week', 'Projects_Handled',
       'Overtime_Hours', 'Sick_Days', 'Remote_Work_Frequency', 'Team_Size',
       'Training_Hours', 'Promotions', 'Employee_Satisfaction_Score',
       'Resigned'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,Employee_ID,Department,Gender,Age,Job_Title,Hire_Date,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,Work_Hours_Per_Week,Projects_Handled,Overtime_Hours,Sick_Days,Remote_Work_Frequency,Team_Size,Training_Hours,Promotions,Employee_Satisfaction_Score,Resigned
0,1,IT,Male,55,Specialist,2022-01-19 08:03:05.556036,2,High School,5,6750.0,33,32,22,2,0,14,66,0,2.63,False
1,2,Finance,Male,29,Developer,2024-04-18 08:03:05.556036,0,High School,5,7500.0,34,34,13,14,100,12,61,2,1.72,False
2,3,Finance,Male,55,Specialist,2015-10-26 08:03:05.556036,8,High School,3,5850.0,37,27,6,3,50,10,1,0,3.17,False
3,4,Customer Support,Female,48,Analyst,2016-10-22 08:03:05.556036,7,Bachelor,2,4800.0,52,10,28,12,100,10,0,1,1.86,False
4,5,Engineering,Female,36,Analyst,2021-07-23 08:03:05.556036,3,Bachelor,2,4800.0,38,11,29,13,100,15,9,1,1.25,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Employee_ID                  100000 non-null  int64  
 1   Department                   100000 non-null  object 
 2   Gender                       100000 non-null  object 
 3   Age                          100000 non-null  int64  
 4   Job_Title                    100000 non-null  object 
 5   Hire_Date                    100000 non-null  object 
 6   Years_At_Company             100000 non-null  int64  
 7   Education_Level              100000 non-null  object 
 8   Performance_Score            100000 non-null  int64  
 9   Monthly_Salary               100000 non-null  float64
 10  Work_Hours_Per_Week          100000 non-null  int64  
 11  Projects_Handled             100000 non-null  int64  
 12  Overtime_Hours               100000 non-null  int64  
 13  

In [6]:
df['Education_Level'].unique()

array(['High School', 'Bachelor', 'Master', 'PhD'], dtype=object)

In [7]:
df['Job_Title'].unique()

array(['Specialist', 'Developer', 'Analyst', 'Manager', 'Technician',
       'Engineer', 'Consultant'], dtype=object)

In [8]:
df['Department'].unique()

array(['IT', 'Finance', 'Customer Support', 'Engineering', 'Marketing',
       'HR', 'Operations', 'Sales', 'Legal'], dtype=object)

In [9]:
df.isnull().sum()

Employee_ID                    0
Department                     0
Gender                         0
Age                            0
Job_Title                      0
Hire_Date                      0
Years_At_Company               0
Education_Level                0
Performance_Score              0
Monthly_Salary                 0
Work_Hours_Per_Week            0
Projects_Handled               0
Overtime_Hours                 0
Sick_Days                      0
Remote_Work_Frequency          0
Team_Size                      0
Training_Hours                 0
Promotions                     0
Employee_Satisfaction_Score    0
Resigned                       0
dtype: int64

In [10]:
df['Education_Level_enc'] = df['Education_Level'].map({'High School':0,'Bachelor':1,'Master':2,'PhD':3})
df['Job_Title_enc'] = df['Job_Title'].map({'Specialist':0,'Developer':1,'Analyst':2,'Manager':3,'Technician':4,'Engineer':5,'Consultant':6})
df['Department_enc'] = df['Department'].map({'IT':0,'Finance':1,'Customer Support':2,'Engineering':3,'Marketing':4,'HR':5,'Operations':6,'Sales':7,'Legal':8})

In [11]:
X = df[['Years_At_Company','Sick_Days','Monthly_Salary','Training_Hours','Department_enc','Work_Hours_Per_Week','Projects_Handled','Promotions','Education_Level_enc','Job_Title_enc']]
y = df['Performance_Score']

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [15]:
models = {'LR':LinearRegression(),
          'Log':LogisticRegression(),
          'KNN3':KNeighborsClassifier(n_neighbors=3),
          'KNN5':KNeighborsClassifier(n_neighbors=5),
          'KNN7':KNeighborsClassifier(n_neighbors=7),
          'SVMp2':SVC(kernel='poly',degree=2),
          'SVMp3':SVC(kernel='poly',degree=3),
          'DT':DecisionTreeClassifier(max_depth = 2, max_features = 5, min_samples_leaf = 3, min_samples_split = 2),
          'RF':RandomForestClassifier(max_depth = 5, min_samples_leaf = 1, n_estimators = 25, n_jobs = -1),
          'AdaB':AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 2), n_estimators = 25, random_state = 1)}
for name,model in models.items():
    model.fit(X_train,y_train)
    train_acc = model.score(X_train,y_train)
    test_acc = model.score(X_test,y_test)
    print(f"{name} | Train: {train_acc:.3f} | {test_acc:.3f}")

LR | Train: 0.286 | 0.287


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Log | Train: 0.297 | 0.289
KNN3 | Train: 0.973 | 0.944
KNN5 | Train: 0.965 | 0.945
KNN7 | Train: 0.962 | 0.946
SVMp2 | Train: 0.314 | 0.312
SVMp3 | Train: 0.258 | 0.257
DT | Train: 0.316 | 0.313
RF | Train: 0.581 | 0.570
AdaB | Train: 0.799 | 0.802


In [17]:
scaler.fit(X)
X_scaled = scaler.transform(X)
final_employee_model = KNeighborsClassifier(n_neighbors=7)
final_employee_model.fit(X_scaled,y)

In [18]:
import pickle
pickle.dump(final_employee_model,open('final_employee_model.pkl','wb'))
pickle.dump(scaler,open('scaler.pkl','wb'))

In [19]:
q = [[8,3,5850.0,1,1,37,27,0,0,0]]
q_scaled = scaler.transform(q)
yp = final_employee_model.predict(q_scaled)[0]
round(yp,2)



np.int64(2)