In [54]:
# Import pandas (as pd) to read the data
import pandas as pd

# Read "turnover.csv" and save it in a DataFrame called data
data = pd.read_csv("turnover_data.csv")

# Take a quick look to the first 5 rows of data
print(data.head(5))

# Get some information on the types of variables in data
data.info()


   satisfaction  evaluation  number_of_projects  average_montly_hours  \
0          0.38        0.53                   2                   157   
1          0.80        0.86                   5                   262   
2          0.11        0.88                   7                   272   
3          0.72        0.87                   5                   223   
4          0.37        0.52                   2                   159   

   time_spend_company  work_accident  churn  promotion department  salary  
0                   3              0      1          0      sales     low  
1                   6              0      1          0      sales  medium  
2                   4              0      1          0      sales  medium  
3                   5              0      1          0      sales     low  
4                   3              0      1          0      sales     low  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 column

In [55]:
# Print the unique values of the "department" column
print(data.department.unique())

# Print the unique values of the "salary" column
print(data.salary.unique())

['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
 'product_mng' 'marketing' 'RandD']
['low' 'medium' 'high']


In [56]:
# Change the type of the "salary" column to categorical
data.salary = data.salary.astype('category')

data.salary = pd.Categorical(data.salary, categories=['low', 'medium', 'high'])

# Provide the correct order of categories
data.salary = data.salary.cat.reorder_categories(['low', 'medium', 'high'])

# Encode categories
data.salary = data.salary.cat.codes

# Get dummies and save them inside a new DataFrame
departments = pd.get_dummies(data.department)

# Take a quick look to the first 5 rows of the new DataFrame called departments
print(departments.head())

# Drop the "accounting" column to avoid "dummy trap"
departments = departments.drop("accounting", axis=1)

# Drop the old column "department" as you don't need it anymore
data = data.drop("department", axis=1)

# Join the new DataFrame "departments" to your employee dataset: done
data = data.join(departments)

      IT  RandD  accounting     hr  management  marketing  product_mng  sales  \
0  False  False       False  False       False      False        False   True   
1  False  False       False  False       False      False        False   True   
2  False  False       False  False       False      False        False   True   
3  False  False       False  False       False      False        False   True   
4  False  False       False  False       False      False        False   True   

   support  technical  
0    False      False  
1    False      False  
2    False      False  
3    False      False  
4    False      False  


In [57]:
#Percentage of employees who Churn

# Use len() function to get the total number of observations and save it as the number of employees
n_employees = len(data)

# Print the number of employees who left/stayed
print(data.churn.value_counts())

# Print the percentage of employees who left/stayed
print(data.churn.value_counts()/n_employees * 100)



churn
0    11428
1     3571
Name: count, dtype: int64
churn
0    76.191746
1    23.808254
Name: count, dtype: float64


In [58]:
# Set the target and features

# Choose the dependent variable column (churn) and set it as target
target = data.churn

# Drop column churn and set everything else as features
features = data.drop("churn",axis=1)

# Import the function for splitting dataset into train and test
from sklearn.model_selection import train_test_split

# Use that function to create the splits both for target and for features
# Set the test sample to be 25% of your observations
target_train, target_test, features_train, features_test = train_test_split(target,features,test_size=0.25,random_state=42)

# Import the classification algorithm
from sklearn.tree import DecisionTreeClassifier

# Initialize it and call model by specifying the random_state parameter
model = DecisionTreeClassifier(random_state=42)

# Apply a decision tree model to fit features to the target
model.fit(features_train, target_train)

In [59]:
#Checking the Accuracy of the prediction

# Apply a decision tree model to fit features to the target in the training set
model.fit(features_train,target_train)

# Check the accuracy score of the prediction for the training set
model.score(features_train,target_train)*100

# Check the accuracy score of the prediction for the test set
model.score(features_test,target_test)*100

97.22666666666666

In [60]:
## Exporting the Tree
# Import the graphical visualization export function
from sklearn.tree import export_graphviz

# Apply Decision Tree model to fit Features to the Target
model.fit(features_train,target_train)

# Export the tree to a dot file
export_graphviz(model,"tree.dot")

In [61]:
#tuning turnover classifier
# Initialize the DecisionTreeClassifier while limiting the depth of the tree to 5
model_depth_5 = DecisionTreeClassifier(max_depth=5, random_state=42)

# Fit the model
model_depth_5.fit(features_train,target_train)

# Print the accuracy of the prediction for the training set
print(model_depth_5.score(features_train,target_train)*100)

# Print the accuracy of the prediction for the test set
print(model_depth_5.score(features_test,target_test)*100)

97.71535247577563
97.06666666666666


In [62]:
# Initialize the DecisionTreeClassifier while limiting the sample size in leaves to 100
model_sample_100 = DecisionTreeClassifier(min_samples_leaf = 100, random_state=42)

# Fit the model
model_sample_100.fit(features_train,target_train)

# Print the accuracy of the prediction (in percentage points) for the training set
print(model_sample_100.score(features_train,target_train)*100)

# Print the accuracy of the prediction (in percentage points) for the test set
print(model_sample_100.score(features_test,target_test)*100)

96.57747355320473
96.13333333333334


In [63]:
# Calculating accuracy metrics: Recall
# Import the function to calculate recall score
from sklearn.metrics import recall_score

# Use the initial model to predict churn
prediction = model.predict(features_test)

# Calculate recall score by comparing target_test with the prediction
recall_score(target_test, prediction)

0.9632107023411371

In [64]:
# Calculating ROC/ AUC Score

# Import the function to calculate ROC/AUC score
from sklearn.metrics import roc_auc_score

# Use initial model to predict churn (based on features_test)
prediction = model.predict(features_test)

# Calculate ROC/AUC score by comparing target_test with the prediction
roc_auc_score(target_test, prediction)

0.9691623087590718

In [65]:
# Balancing Classes

# Initialize the DecisionTreeClassifier 
model_depth_5_b = DecisionTreeClassifier(max_depth=5,class_weight="balanced",random_state=42)

# Fit the model
model_depth_5_b.fit(features_train,target_train)

# Print the accuracy of the prediction (in percentage points) for the test set
print(model_depth_5_b.score(features_test,target_test)*100)

93.70666666666668


In [66]:
# Comparison of Employee Attrition Models

# Print the recall score for imbalanced Model
print("Scores for Imbalanced model")
print(recall_score(target_test,prediction))
# Print the ROC/AUC score
print(roc_auc_score(target_test,prediction))

# Initialize the model
model_depth_7_b = DecisionTreeClassifier(max_depth=7,class_weight="balanced",random_state=42)
# Fit it to the training component
model_depth_7_b.fit(features_train,target_train)
# Make prediction using test component
prediction_b = model_depth_7_b.predict(features_test)

print("Scores for Balanced model")

# Print the recall score for the balanced model
print(recall_score(target_test,prediction_b))
# Print the ROC/AUC score for the balanced model
print(roc_auc_score(target_test,prediction_b))

Scores for Imbalanced model
0.9632107023411371
0.9691623087590718
Scores for Balanced model
0.9319955406911928
0.959863876199084


In [67]:
#Cross Validation

# Import the function for implementing cross validation
from sklearn.model_selection import cross_val_score

# Use that function to print the cross validation score for 10 folds
print(cross_val_score(model,features,target,cv=10))

[0.98533333 0.98533333 0.974      0.96533333 0.96       0.97933333
 0.99       0.99333333 1.         1.        ]


In [68]:
# Setting up grid search parameters
# Generate values for maximum depth
depth = [i for i in range(5,21,1)]

# Generate values for minimum sample size
samples = [i for i in range(50,500,50)]

# Create the dictionary with parameters to be checked
parameters = dict(max_depth=depth, min_samples_leaf=samples)

In [69]:
# import the GridSearchCV function
from sklearn.model_selection import GridSearchCV

# set up parameters: done
parameters = dict(max_depth=depth, min_samples_leaf=samples)

# initialize the param_search function using the GridSearchCV function, initial model and parameters above
param_search = GridSearchCV(model, parameters, cv=3)

# fit the param_search to the training dataset
param_search.fit(features_train, target_train)

# print the best parameters found
print(param_search.best_params_)

{'max_depth': 5, 'min_samples_leaf': 50}


In [70]:
# Sorting important features

# Best Model 
model_best = DecisionTreeClassifier(max_depth=5, min_samples_leaf = 50 , random_state=42)
# Fit it to the training component
model_best.fit(features_train,target_train)

# Calculate feature importances
feature_importances = model_best.feature_importances_

# Create a list of features: done
feature_list = list(features)

# Save the results inside a DataFrame using feature_list as an index
relative_importances = pd.DataFrame(index=feature_list, data=feature_importances, columns=["importance"])

# Sort values to learn most important features
relative_importances.sort_values(by="importance", ascending=False)

Unnamed: 0,importance
satisfaction,0.551529
time_spend_company,0.157009
evaluation,0.144354
number_of_projects,0.092864
average_montly_hours,0.053087
technical,0.000631
hr,0.000295
salary,0.000231
promotion,0.0
work_accident,0.0


In [71]:
# Selecting Important Features
# select only features with relative importance higher than 1%
selected_features = relative_importances[relative_importances.importance >0.01]

# create a list from those features: done
selected_list = selected_features.index

# transform both features_train and features_test components to include only selected features
features_train_selected = features_train[selected_list]
features_test_selected = features_test[selected_list]

In [73]:
# Develop and test the best model
# Initialize the best model using parameters provided in description
model_best = DecisionTreeClassifier(max_depth = 5, min_samples_leaf =50, class_weight = "balanced", random_state=42)

# Fit the model using only selected features from training set: done
model_best.fit(features_train_selected, target_train)

# Make prediction based on selected list of features from test set
prediction_best = model_best.predict(features_test_selected)

# Print the general accuracy of the model_best
print(model_best.score(features_test_selected, target_test) * 100)

# Print the recall score of the model predictions
print(recall_score(target_test, prediction_best) * 100)

# Print the ROC/AUC score of the model predictions
print(roc_auc_score(target_test, prediction_best) * 100)



93.46666666666667
91.9732441471572
92.95472582401672


# Insights and Recommendations

## 1. Departments with High Attrition: 
Departments such as accounting, IT, and support exhibit higher attrition rates. These departments should be the focus of targeted retention strategies to address underlying issues such as job satisfaction and workload.

## 2. Impact of Salary on Attrition: 
While salary is a factor in attrition, it is not the most significant predictor. Efforts to improve retention should focus more on factors such as job satisfaction and employee evaluations.

## 3. Employee Satisfaction:
High attrition is strongly linked to low satisfaction levels. 
Initiatives to improve job satisfaction, such as providing growth opportunities, recognizing employee achievements, and fostering a positive work environment, could help reduce attrition.

## 4. Workload Management:
The number of projects and average monthly hours are significant predictors of attrition. Balancing workloads and ensuring employees are not overburdened can help in retaining employees.

## 5. Targeted Retention Strategies:
Develop tailored strategies for high-attrition departments, focusing on improving job satisfaction and work conditions.
Consider reviewing and adjusting salary structures, especially for lower salary levels, to ensure they are competitive and fair.

These insights and recommendations can help the company address the root causes of attrition and implement effective strategies to improve employee retention.