In [578]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Project:
# Job Roles Burnout Prediction Analysis

In [579]:
df=pd.read_csv(r"c:\Users\user\Desktop\Kaggle Datasets\synthetic_employee_burnout.csv")

# Data info

In [580]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               2000 non-null   object 
 1   Age                2000 non-null   int64  
 2   Gender             2000 non-null   object 
 3   JobRole            2000 non-null   object 
 4   Experience         2000 non-null   int64  
 5   WorkHoursPerWeek   2000 non-null   int64  
 6   RemoteRatio        2000 non-null   int64  
 7   SatisfactionLevel  2000 non-null   float64
 8   StressLevel        2000 non-null   int64  
 9   Burnout            2000 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 156.4+ KB


In [581]:
df.shape

(2000, 10)

In [582]:
df.columns

Index(['Name', 'Age', 'Gender', 'JobRole', 'Experience', 'WorkHoursPerWeek',
       'RemoteRatio', 'SatisfactionLevel', 'StressLevel', 'Burnout'],
      dtype='object')

# Data Handling & Cleaning

In [583]:
df.isna().sum()
df.duplicated().sum()

np.int64(0)

In [584]:
df.drop(columns=["Name"],inplace=True)

In [585]:
df.head()

Unnamed: 0,Age,Gender,JobRole,Experience,WorkHoursPerWeek,RemoteRatio,SatisfactionLevel,StressLevel,Burnout
0,32,Male,Analyst,3,60,21,4.4,1,0
1,40,Female,Engineer,9,47,67,2.09,2,0
2,33,Female,Engineer,2,44,20,2.58,3,0
3,35,Female,Manager,6,44,70,3.23,8,0
4,59,Male,Sales,8,38,46,4.41,1,0


## Converting Data Type

In [586]:
df = pd.get_dummies(df, columns=["JobRole"])

In [587]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

In [588]:
df.head()

Unnamed: 0,Age,Gender,Experience,WorkHoursPerWeek,RemoteRatio,SatisfactionLevel,StressLevel,Burnout,JobRole_Analyst,JobRole_Engineer,JobRole_HR,JobRole_Manager,JobRole_Sales
0,32,1,3,60,21,4.4,1,0,True,False,False,False,False
1,40,0,9,47,67,2.09,2,0,False,True,False,False,False
2,33,0,2,44,20,2.58,3,0,False,True,False,False,False
3,35,0,6,44,70,3.23,8,0,False,False,False,True,False
4,59,1,8,38,46,4.41,1,0,False,False,False,False,True


# Machine Learning Analysis

In [589]:
x=df.drop(columns=["Burnout"])
y=df["Burnout"]

In [590]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)
st=StandardScaler()
x_train_scaled=st.fit_transform(x_train)
x_test_scaled = st.transform(x_test)
x_train_scaled.shape
print(f"The shape of x train,test scaled {x_train_scaled.shape},{x_test_scaled.shape},and the shape of y train,test is {y_train.shape},{y_test.shape} ")

The shape of x train,test scaled (1600, 12),(400, 12),and the shape of y train,test is (1600,),(400,) 


In [591]:
model=LogisticRegression(class_weight="balanced")
model.fit(x_train_scaled,y_train)

In [592]:
y_pred=model.predict(x_test_scaled)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.95
Confusion Matrix:
 [[348  20]
 [  0  32]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97       368
           1       0.62      1.00      0.76        32

    accuracy                           0.95       400
   macro avg       0.81      0.97      0.87       400
weighted avg       0.97      0.95      0.96       400



# Predicting With New Data 

In [593]:
df.head()

Unnamed: 0,Age,Gender,Experience,WorkHoursPerWeek,RemoteRatio,SatisfactionLevel,StressLevel,Burnout,JobRole_Analyst,JobRole_Engineer,JobRole_HR,JobRole_Manager,JobRole_Sales
0,32,1,3,60,21,4.4,1,0,True,False,False,False,False
1,40,0,9,47,67,2.09,2,0,False,True,False,False,False
2,33,0,2,44,20,2.58,3,0,False,True,False,False,False
3,35,0,6,44,70,3.23,8,0,False,False,False,True,False
4,59,1,8,38,46,4.41,1,0,False,False,False,False,True


In [594]:
new_data = pd.DataFrame([{
    'Age': 35,
    'Gender': 1,
    'Experience': 0,
    'WorkHoursPerWeek': 60,
    'RemoteRatio': 15,
    'SatisfactionLevel': 3.50,
    'StressLevel': 1,
    'JobRole_Analyst': True,
    'JobRole_Engineer': False,
    'JobRole_HR': False,
    'JobRole_Manager': False,
    'JobRole_Sales': False
}])
# Apply the same scaler (no fit, just transform)
new_data_scaled = st.transform(new_data)
new_data_scaled

array([[-0.52160232,  0.97897106, -1.1188413 ,  0.87290963, -1.19907791,
         0.44205333, -1.50859065,  1.93956303, -0.4793988 , -0.48038446,
        -0.51363709, -0.51072038]])

In [None]:
prediction=model.predict(new_data_scaled)
probability = model.predict_proba(new_data_scaled)

print("Predicted class:", prediction[0])
print("Probability scores:", probability[0])

Predicted class: 0
Probability scores: [9.99991158e-01 8.84191631e-06]
