## Part 1: Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

# Load the employee attrition dataset from its hosted CSV and preview the first rows
ATTRITION_CSV_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(ATTRITION_CSV_URL)
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [None]:
# Count the distinct values held by each column of the dataset
unique_counts = attrition_df.nunique()
unique_counts

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [None]:
# The model predicts two targets, so y_df keeps both the Attrition and Department columns
target_columns = ['Attrition', 'Department']
y_df = attrition_df[target_columns]


In [None]:
# Create a list of at least 10 column names to use as X data
selected_columns = ['Age', 'Education', 'DistanceFromHome', 'JobSatisfaction', 'StockOptionLevel',
                    'OverTime', 'TotalWorkingYears', 'WorkLifeBalance', 'YearsAtCompany', 'TrainingTimesLastYear']

# Create X_df using your selected columns.
# .copy() makes X_df an independent DataFrame rather than a slice of
# attrition_df, so its columns can be reassigned later without pandas
# raising SettingWithCopyWarning.
X_df = attrition_df[selected_columns].copy()

# Show the data types for X_df
print(X_df.dtypes)


Age                       int64
Education                 int64
DistanceFromHome          int64
JobSatisfaction           int64
StockOptionLevel          int64
OverTime                 object
TotalWorkingYears         int64
WorkLifeBalance           int64
YearsAtCompany            int64
TrainingTimesLastYear     int64
dtype: object


In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split



In [None]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

from sklearn.preprocessing import LabelEncoder

# 'OverTime' is the only non-numeric feature column; encode it as integers
le = LabelEncoder()

# .assign() returns a new DataFrame instead of writing into a slice of
# attrition_df, which avoids pandas' SettingWithCopyWarning.
X_df = X_df.assign(OverTime=le.fit_transform(X_df['OverTime']))

# Check the transformed 'OverTime' column
X_df['OverTime'].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['OverTime'] = le.fit_transform(X_df['OverTime'])


Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
0,1054
1,416


In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling parameters from the training rows only,
# then apply that same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Preview the first five scaled training rows as a sanity check
print(X_train_scaled[:5])


[[ 1.41369115 -0.8472024  -1.01772844  1.1744894   0.20342603 -0.63848315
   2.64888247 -1.07369461 -0.32028746 -1.38941706]
 [-0.09834647 -0.8472024  -0.40936167 -0.6497364   1.34728057 -0.63848315
  -0.53430534  0.36115182 -0.65002814 -2.16897549]
 [-1.71838678 -1.79684431 -0.28768831 -0.6497364  -0.94042852 -0.63848315
  -1.29827042  0.36115182 -0.97976881  0.16969979]
 [ 1.41369115  0.10243951 -0.77438173 -0.6497364   0.20342603 -0.63848315
   2.64888247  0.36115182  4.1312116  -0.60985864]
 [ 1.62969653  1.05208141 -0.28768831 -0.6497364  -0.94042852 -0.63848315
   0.86629729  0.36115182  0.17432354  0.94925822]]


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the Department target. The encoder learns its categories
# from the training labels only and is then applied to both splits.
department_train_labels = y_train[['Department']]
department_test_labels = y_test[['Department']]

ohe_department = OneHotEncoder(sparse_output=False)
ohe_department.fit(department_train_labels)

department_train_encoded = ohe_department.transform(department_train_labels)
department_test_encoded = ohe_department.transform(department_test_labels)

# Report the shape of each encoded Department array (training first, then testing)
for encoded in (department_train_encoded, department_test_encoded):
    print(encoded.shape)



(1029, 3)
(441, 3)


In [None]:
# One-hot encode the Attrition target. As with Department, the encoder is
# fitted on the training labels only and then applied to both splits.
attrition_train_labels = y_train[['Attrition']]
attrition_test_labels = y_test[['Attrition']]

ohe_attrition = OneHotEncoder(sparse_output=False)
ohe_attrition.fit(attrition_train_labels)

attrition_train_encoded = ohe_attrition.transform(attrition_train_labels)
attrition_test_encoded = ohe_attrition.transform(attrition_test_labels)

# Report the shape of each encoded Attrition array (training first, then testing)
for encoded in (attrition_train_encoded, attrition_test_encoded):
    print(encoded.shape)



(1029, 2)
(441, 2)


## Create, Compile, and Train the Model

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# The second dimension of the scaled training matrix is the number of feature columns
input_shape = X_train_scaled.shape[1]
print(f'Number of columns in X training data: {input_shape}')

# Input layer: one value per feature column
input_layer = Input(shape=(input_shape,))

# Two shared hidden layers (64 then 32 ReLU units) that feed both output branches
shared_layer_1 = Dense(64, activation='relu')(input_layer)
shared_layer_2 = Dense(32, activation='relu')(shared_layer_1)


Number of columns in X training data: 10


In [None]:
# Department branch: its own hidden layer on top of the shared layers,
# followed by a 3-unit softmax output (one unit per one-hot Department column)
department_hidden_layer = Dense(
    16,
    activation='relu',
)(shared_layer_2)

department_output = Dense(
    3,
    activation='softmax',
    name='department_output',
)(department_hidden_layer)



In [None]:
# Attrition branch: its own hidden layer on top of the shared layers,
# followed by a 2-unit softmax output (one unit per one-hot Attrition column)
attrition_hidden_layer = Dense(
    16,
    activation='relu',
)(shared_layer_2)

attrition_output = Dense(
    2,
    activation='softmax',
    name='attrition_output',
)(attrition_hidden_layer)


In [None]:
# Assemble the two-output model: one shared input, a Department head and an Attrition head
model = Model(
    inputs=input_layer,
    outputs=[department_output, attrition_output],
)

# Each output is trained with categorical cross-entropy against its one-hot
# target and tracked with accuracy; the dict keys are the output layer names.
output_names = ('department_output', 'attrition_output')
model.compile(
    optimizer='adam',
    loss={name: 'categorical_crossentropy' for name in output_names},
    metrics={name: 'accuracy' for name in output_names},
)

# Display the model architecture
model.summary()


In [None]:
# Targets are passed as dicts keyed by output layer name so each head
# receives its own one-hot labels
train_targets = {
    'department_output': department_train_encoded,
    'attrition_output': attrition_train_encoded,
}
test_targets = {
    'department_output': department_test_encoded,
    'attrition_output': attrition_test_encoded,
}

# Train for 10 epochs in batches of 32; the test split is used as the validation data
history = model.fit(
    X_train_scaled,
    train_targets,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_scaled, test_targets),
)



Epoch 1/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - attrition_output_accuracy: 0.7782 - department_output_accuracy: 0.6342 - loss: 1.5145 - val_attrition_output_accuracy: 0.8617 - val_department_output_accuracy: 0.6463 - val_loss: 1.2544
Epoch 2/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - attrition_output_accuracy: 0.8378 - department_output_accuracy: 0.6610 - loss: 1.2389 - val_attrition_output_accuracy: 0.8617 - val_department_output_accuracy: 0.6463 - val_loss: 1.2050
Epoch 3/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_accuracy: 0.8099 - department_output_accuracy: 0.6351 - loss: 1.2554 - val_attrition_output_accuracy: 0.8617 - val_department_output_accuracy: 0.6463 - val_loss: 1.1901
Epoch 4/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - attrition_output_accuracy: 0.8197 - department_output_accuracy: 0.6568 - loss: 1.2002 - val_attri

In [None]:
# Evaluate the model with the testing data.
# return_dict=True labels every value with its metric name, so each printed
# accuracy is looked up by name rather than by position. Indexing the
# positional list mislabeled them: position 1 held the attrition accuracy
# and position 2 the department accuracy.
test_metrics = model.evaluate(
    X_test_scaled,
    {'department_output': department_test_encoded, 'attrition_output': attrition_test_encoded},
    return_dict=True,
)

# Positional list retained for any cell that indexes test_results:
# [loss, attrition accuracy, department accuracy]
test_results = [
    test_metrics['loss'],
    test_metrics['attrition_output_accuracy'],
    test_metrics['department_output_accuracy'],
]

print(f"Test loss: {test_metrics['loss']}")
print(f"Test department accuracy: {test_metrics['department_output_accuracy']}")
print(f"Test attrition accuracy: {test_metrics['attrition_output_accuracy']}")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8745 - department_output_accuracy: 0.6427 - loss: 1.1809 
Test loss: 1.1690846681594849
Test department accuracy: 0.8775510191917419
Test attrition accuracy: 0.646258533000946


In [None]:
# Print the accuracy for both department and attrition.
# test_results is ordered [loss, attrition accuracy, department accuracy]:
# the evaluation progress bar reports attrition at about 0.87 and department
# at about 0.64, so department is at index 2 and attrition at index 1.
print(f'Department accuracy: {test_results[2]}')
print(f'Attrition accuracy: {test_results[1]}')


Department accuracy: 0.8775510191917419
Attrition accuracy: 0.646258533000946


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

**Answers**

1. Accuracy is useful as a general measure of performance, but it is probably not the best metric for this data. Both targets appear to be imbalanced (more people in one department than the others, and more people staying with the company than leaving). In the training log, the validation accuracy stays fixed at 0.8617 for attrition and 0.6463 for department from one epoch to the next, which is what you would expect if the model were mostly predicting the majority class. In such cases accuracy can give a misleading impression of performance, and metrics like precision, recall, and F1-score provide more insight into how well the model handles the minority classes.

2. I used softmax for both output layers. Softmax is ideal for multi-class classification tasks (like the department column) because it converts raw model outputs into a probability distribution across all classes. Even for binary classification (attrition), softmax was chosen because it effectively normalizes the predictions into probabilities, ensuring clear and interpretable outputs.

3. The model might be improved in several ways:

   - **Data augmentation:** With imbalanced data, augmenting or oversampling the minority classes could help the model learn them better.
   - **Hyperparameter tuning:** Adjusting the learning rate, batch size, or number of layers could improve the model's performance.
   - **Class weights:** If the dataset is imbalanced, assigning class weights during training can help the model focus more on underrepresented classes.
   - **Other metrics:** As mentioned in answer 1, additional metrics such as precision, recall, or F1-score would evaluate the model more effectively, especially for the minority classes.
   - **Cross-validation:** k-fold cross-validation would give a more reliable estimate of how well the model generalizes across different subsets of the data.