In [7]:
import numpy as np 
import pandas as pd 

In [8]:
df = pd.read_csv("employee.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Employee_ID                  100000 non-null  int64  
 1   Department                   100000 non-null  object 
 2   Gender                       100000 non-null  object 
 3   Age                          100000 non-null  int64  
 4   Job_Title                    100000 non-null  object 
 5   Hire_Date                    100000 non-null  object 
 6   Years_At_Company             100000 non-null  int64  
 7   Education_Level              100000 non-null  object 
 8   Performance_Score            100000 non-null  int64  
 9   Monthly_Salary               100000 non-null  float64
 10  Work_Hours_Per_Week          100000 non-null  int64  
 11  Projects_Handled             100000 non-null  int64  
 12  Overtime_Hours               100000 non-null  int64  
 13  

In [9]:
df = df.drop(columns=['Employee_ID','Hire_Date','Work_Hours_Per_Week','Sick_Days',
                    'Remote_Work_Frequency','Team_Size','Training_Hours','Resigned'])
df.head()

Unnamed: 0,Department,Gender,Age,Job_Title,Years_At_Company,Education_Level,Performance_Score,Monthly_Salary,Projects_Handled,Overtime_Hours,Promotions,Employee_Satisfaction_Score
0,IT,Male,55,Specialist,2,High School,5,6750.0,32,22,0,2.63
1,Finance,Male,29,Developer,0,High School,5,7500.0,34,13,2,1.72
2,Finance,Male,55,Specialist,8,High School,3,5850.0,27,6,0,3.17
3,Customer Support,Female,48,Analyst,7,Bachelor,2,4800.0,10,28,1,1.86
4,Engineering,Female,36,Analyst,3,Bachelor,2,4800.0,11,29,1,1.25


In [10]:
df.isna().sum()

Department                     0
Gender                         0
Age                            0
Job_Title                      0
Years_At_Company               0
Education_Level                0
Performance_Score              0
Monthly_Salary                 0
Projects_Handled               0
Overtime_Hours                 0
Promotions                     0
Employee_Satisfaction_Score    0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Department                   100000 non-null  object 
 1   Gender                       100000 non-null  object 
 2   Age                          100000 non-null  int64  
 3   Job_Title                    100000 non-null  object 
 4   Years_At_Company             100000 non-null  int64  
 5   Education_Level              100000 non-null  object 
 6   Performance_Score            100000 non-null  int64  
 7   Monthly_Salary               100000 non-null  float64
 8   Projects_Handled             100000 non-null  int64  
 9   Overtime_Hours               100000 non-null  int64  
 10  Promotions                   100000 non-null  int64  
 11  Employee_Satisfaction_Score  100000 non-null  float64
dtypes: float64(2), int64(6), object(4)
memory usage: 9.2+ MB


### Split Data

In [13]:
X = df[['Years_At_Company','Monthly_Salary','Overtime_Hours','Promotions','Employee_Satisfaction_Score']]
y = df["Performance_Score"]

print(X.columns)

Index(['Years_At_Company', 'Monthly_Salary', 'Overtime_Hours', 'Promotions',
       'Employee_Satisfaction_Score'],
      dtype='object')


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)

(70000, 5) (30000, 5)


### **Data Prep**

##### scaling

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.fit_transform(X_test)

In [16]:
import joblib
joblib.dump(scaler,"scaler.pkl")

['scaler.pkl']

metrics score preparation

In [17]:
from sklearn.metrics import accuracy_score

def modelperformance(predictions):
    print("Accuracy score : {}".format(accuracy_score(y_test,predictions)))

hyperparametertuning preparation

In [18]:
from sklearn.model_selection import GridSearchCV

### **Model building**

#### Model Logistic Regressions

In [19]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train,y_train)

In [20]:
# predict 1
log_model.predict([[0,0,0,0,0]])

array([3], dtype=int64)

In [21]:
# predict 2
log_model.predict(X_test)


array([4, 5, 5, ..., 2, 5, 4], dtype=int64)

In [22]:
# store the predictions
log_predictions = log_model.predict(X_test)

In [23]:
# see the accuracy score
modelperformance(log_predictions)
accuracy_score(y_test, log_predictions)

Accuracy score : 0.31046666666666667


0.31046666666666667

#### Model KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': [3,5,7,9,11],
              'weights': ['uniform', 'distance']}

In [25]:
gridkn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

In [26]:
gridkn.fit(X_train,y_train)

In [27]:
gridkn.best_params_

{'n_neighbors': 3, 'weights': 'distance'}

In [28]:
predictions = gridkn.predict(X_test)

In [29]:
modelperformance(predictions)
accuracy_score(y_test, predictions)

Accuracy score : 0.4872666666666667


0.4872666666666667

In [38]:
joblib.dump(gridkn, "model.pkl")

['model.pkl']

#### Model SVM

In [30]:
from sklearn.svm import SVC
svm = SVC()

In [31]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear','rbf']
}

In [32]:
gridsvc = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=2)

In [33]:
gridsvc.fit(X_train,y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [34]:
gridsvc.best_params_

{'C': 1, 'kernel': 'rbf'}

In [37]:
y_pred = gridsvc.predict(X_test)

KeyboardInterrupt: 

In [36]:
modelperformance(y_pred)

Accuracy score : 0.42583333333333334


### Notes on Sklearn Library

1. From sklearn.model_selection
    - import train_test_split
    - import GrideSearchCV

2. From sklearn.preprocessing 
    - import StandardScaler

3. From sklearn.metrics 
    - import accuracy_score