## Part 1: Preprocessing

In [244]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [245]:
print("Columns in DataFrame:", attrition_df.columns)

Columns in DataFrame: Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


In [246]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [249]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
print("Columns in y_df:", y_df.columns)


Columns in y_df: Index(['Attrition', 'Department'], dtype='object')


In [250]:
# Create a list of at least 10 column names to use as X data
mycolumns = ['Education', 'DistanceFromHome', 'HourlyRate', 'JobSatisfaction', 'PercentSalaryHike',
             'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 
             'YearsSinceLastPromotion', 'RelationshipSatisfaction']
# Example list of columns (excluding 'Attrition' and 'Department')


# Create X_df using your selected columns
X_df = attrition_df[mycolumns]


# Show the data types for X_df
X_df.dtypes



Education                   int64
DistanceFromHome            int64
HourlyRate                  int64
JobSatisfaction             int64
PercentSalaryHike           int64
TotalWorkingYears           int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsSinceLastPromotion     int64
RelationshipSatisfaction    int64
dtype: object

In [251]:
# Reshape y values for attrition and department
y_attrition = attrition_df['Attrition'].values.reshape(-1, 1)
y_department = attrition_df['Department'].values.reshape(-1, 1)

In [252]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_attrition_train, y_attrition_test, y_department_train, y_department_test = train_test_split(
    X_df, y_attrition, y_department, test_size=0.2, random_state=42)

In [253]:
# Create a StandardScaler
scaler = StandardScaler()


# Fit the StandardScaler to the training data

scaler.fit(X_train)


# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [255]:
# Create a OneHotEncoder for the Department column
D_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
D_encoder.fit(y_department_train)


In [256]:
# Create two new variables by applying the encoder to the training and testing data
y_D_train_encoded = D_encoder.transform(y_department_train)
y_D_test_encoded = D_encoder.transform(y_department_test)


In [260]:
# Create a LabelEncoder for the Attrition column
encoder = LabelEncoder()
# Fit the encoder to the training data and transform the data
y_attr_train_encoded = encoder.fit_transform(y_attrition_train)
y_attr_test_encoded = encoder.transform(y_attrition_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## Create, Compile, and Train the Model

In [263]:
# Find the number of columns in the X training data

input_shape = X_train_scaled.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(input_shape,), name='input_features')

# Create at least two shared layers
shared1 = layers.Dense(24, activation='relu')(input_layer)
shared2 = layers.Dense(12, activation='relu')(shared1)

In [264]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden = layers.Dense(12, activation='relu', name='department_hidden')(shared2)

# Create the output layer
department_output = layers.Dense(3, activation='softmax', name='department_output')(department_hidden)



In [265]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer

attrition_hidden = layers.Dense(12, activation='relu', name='attrition_hidden')(shared2)
# Create the output layer
attrition_output = layers.Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)



In [266]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])


# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()


In [267]:
# Train the model
model.fit(X_train_scaled, 
          {'department_output': y_D_train_encoded, 'attrition_output': y_attr_train_encoded}, 
          epochs=50, batch_size=10, validation_split=0.2)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - attrition_output_accuracy: 0.5889 - department_output_accuracy: 0.3683 - loss: 1.7626 - val_attrition_output_accuracy: 0.7966 - val_department_output_accuracy: 0.6314 - val_loss: 1.4508
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8510 - department_output_accuracy: 0.6762 - loss: 1.2955 - val_attrition_output_accuracy: 0.7966 - val_department_output_accuracy: 0.6314 - val_loss: 1.3354
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8379 - department_output_accuracy: 0.6517 - loss: 1.1985 - val_attrition_output_accuracy: 0.7966 - val_department_output_accuracy: 0.6314 - val_loss: 1.3114
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8291 - department_output_accuracy: 0.6502 - loss: 1.2012 - val_attrit

<keras.src.callbacks.history.History at 0x284a6b16a70>

In [268]:
# Evaluate the model with the testing data
# Evaluate the model with the testing data
results = model.evaluate(X_test_scaled, 
                         {'department_output': y_D_test_encoded, 'attrition_output': y_attr_test_encoded})
# Print the accuracy for both department and attrition
pred_categories = ['department', 'attrition']
for i, cat in enumerate(pred_categories):
    print(f"{cat} accuracy: {results[i+1]}")

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8087 - department_output_accuracy: 0.5551 - loss: 1.4326 
department accuracy: 0.8231292366981506
attrition accuracy: 0.5850340127944946


In [269]:
# Print the results to see what is being returned
print("Evaluation Results:", results)

Evaluation Results: [1.409824252128601, 0.8231292366981506, 0.5850340127944946]


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?
2. What activation functions did you choose for your output layers, and why?
3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. My thoughts are accuracy isn't always the best metric, especially if the data is imbalanced. For example, if most employees stay and few leave, predicting everyone stays could give high accuracy but isn't useful. Metrics like precision, recall, F1-score, and ROC-AUC are better because they give a fuller picture of how well the model is performing.
2. I used Softmax for Department Prediction: This is used because it’s a multi-class problem. Softmax gives a probability for each department, making it clear which department is the best fit for each employee.
And I used; Sigmoid for Attrition Prediction: This is used for the binary problem of predicting whether an employee will stay or leave. Sigmoid gives a probability between 0 and 1, which is perfect for this yes/no type of question.

3. There are several methods to enhance a model performance such as:
a. Tune Hyperparameters: Adjust things like the number of layers, neurons, learning rate, batch size, and epochs to find the best settings.
b. Use Cross-Validation: Split the data into multiple parts and train the model on each part to get a more reliable performance estimate.
c. Feature Engineering: Create new features or select the most relevant ones to improve the model's input data.
d. Handle Class Imbalance: Use techniques like SMOTE or class weighting to make sure the model learns well from both common and rare classes.
e. Try Different Architectures: Experiment with more layers, different types of layers, or dropout layers to prevent overfitting.
f. Ensemble Methods: Combine predictions from multiple models to reduce errors and improve overall performance.
These improvements can make the model more accurate and reliable in predicting employee attrition and the best department for each employee.

But having more data is always key.






