## Part 1: Preprocessing

In [3]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [19]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

# Print the first few rows of y_df to verify its contents
print(y_df.head())


  Attrition              Department
0       Yes                   Sales
1        No  Research & Development
2       Yes  Research & Development
3        No  Research & Development
4        No  Research & Development


In [34]:
# Display the columns in the dataset
print(attrition_df.columns)


Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


In [36]:
# Step 1: Create a list of at least 10 valid column names to use as X data
selected_columns = [
    'Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
    'JobInvolvement', 'JobLevel', 'JobRole', 'MaritalStatus', 'HourlyRate'
]

# Step 2: Create X_df using the selected columns
X_df = attrition_df[selected_columns]

# Step 3: Show the data types for X_df
print(X_df.dtypes)




Age                         int64
BusinessTravel             object
DistanceFromHome            int64
Education                   int64
EnvironmentSatisfaction     int64
JobInvolvement              int64
JobLevel                    int64
JobRole                    object
MaritalStatus              object
HourlyRate                  int64
dtype: object


In [49]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Step 2: Split the data into training and testing sets
# Define X (features) and y (target)
X = X_df
y = y_df['Attrition']

# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets to confirm
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1176, 10)
X_test shape: (294, 10)
y_train shape: (1176,)
y_test shape: (294,)


In [61]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Identify the categorical and numerical columns
categorical_features = ['BusinessTravel', 'JobRole', 'MaritalStatus']
numerical_features = ['Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 
                      'JobInvolvement', 'JobLevel', 'HourlyRate']

# Step 2: Create transformers for the preprocessing pipeline
categorical_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()

# Step 3: Create a preprocessor with the appropriate transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 4: Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Step 5: Transform the testing data
X_test_preprocessed = preprocessor.transform(X_test)

# Print the shape of the preprocessed training data
print("X_train_preprocessed shape:", X_train_preprocessed.shape)
print("X_test_preprocessed shape:", X_test_preprocessed.shape)


X_train_preprocessed shape: (1176, 22)
X_test_preprocessed shape: (294, 22)


In [72]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train_preprocessed)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train_preprocessed)
X_test_scaled = scaler.transform(X_test_preprocessed)

# Print the shapes of the scaled data to confirm
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (1176, 22)
X_test_scaled shape: (294, 22)


In [82]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse=False)

# Extract the Department column for training and testing
department_train = y_df.loc[y_train.index, 'Department'].values.reshape(-1, 1)
department_test = y_df.loc[y_test.index, 'Department'].values.reshape(-1, 1)

# Fit the encoder to the training data
department_encoder.fit(department_train)

# Create two new variables by applying the encoder
# to the training and testing data
department_train_encoded = department_encoder.transform(department_train)
department_test_encoded = department_encoder.transform(department_test)

# Print the shapes of the encoded data to confirm
print("department_train_encoded shape:", department_train_encoded.shape)
print("department_test_encoded shape:", department_test_encoded.shape)



department_train_encoded shape: (1176, 3)
department_test_encoded shape: (294, 3)




In [91]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse=False)

# Extract the Attrition column for training and testing
attrition_train = y_train.values.reshape(-1, 1)
attrition_test = y_test.values.reshape(-1, 1)

# Fit the encoder to the training data
attrition_encoder.fit(attrition_train)

# Create two new variables by applying the encoder
# to the training and testing data
attrition_train_encoded = attrition_encoder.transform(attrition_train)
attrition_test_encoded = attrition_encoder.transform(attrition_test)

# Print the shapes of the encoded data to confirm
print("attrition_train_encoded shape:", attrition_train_encoded.shape)
print("attrition_test_encoded shape:", attrition_test_encoded.shape)


attrition_train_encoded shape: (1176, 2)
attrition_test_encoded shape: (294, 2)




## Create, Compile, and Train the Model

In [100]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]
print("Number of columns in X_train_scaled:", input_dim)

# Create the input layer
model = Sequential()

# Create at least two shared layers
model.add(Dense(units=32, activation='relu', input_shape=(input_dim,)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))

# Output layer
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




Number of columns in X_train_scaled: 22
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                736       
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 8)                 136       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 1,409
Trainable params: 1,409
Non-trainable params: 0
_________________________________________________________________


In [107]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate

# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]
print("Number of columns in X_train_scaled:", input_dim)

# Create the input layer
input_layer = Input(shape=(input_dim,))
shared_layer_1 = Dense(units=32, activation='relu')(input_layer)
shared_layer_2 = Dense(units=16, activation='relu')(shared_layer_1)
shared_layer_3 = Dense(units=8, activation='relu')(shared_layer_2)

# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_input = Input(shape=(department_train_encoded.shape[1],), name='department_input')
department_hidden = Dense(units=8, activation='relu')(department_input)

# Create the output layer
department_output = Dense(units=1, activation='sigmoid', name='department_output')(department_hidden)

# Concatenate the shared layers with the Department branch
merged = Concatenate()([shared_layer_3, department_hidden])

# Final output layer
final_output = Dense(units=1, activation='sigmoid', name='final_output')(merged)

# Create the final model
model = Model(inputs=[input_layer, department_input], outputs=[final_output, department_output])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




Number of columns in X_train_scaled: 22
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 dense_4 (Dense)                (None, 32)           736         ['input_1[0][0]']                
                                                                                                  
 dense_5 (Dense)                (None, 16)           528         ['dense_4[0][0]']                
                                                                                                  
 department_input (InputLayer)  [(None, 3)]          0           []                               
                                                      

In [113]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate

# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]
print("Number of columns in X_train_scaled:", input_dim)

# Create the input layer
input_layer = Input(shape=(input_dim,))
shared_layer_1 = Dense(units=32, activation='relu')(input_layer)
shared_layer_2 = Dense(units=16, activation='relu')(shared_layer_1)
shared_layer_3 = Dense(units=8, activation='relu')(shared_layer_2)

# Create a branch for Department with a hidden layer and an output layer

# Create the hidden layer for Department branch
department_input = Input(shape=(department_train_encoded.shape[1],), name='department_input')
department_hidden = Dense(units=8, activation='relu')(department_input)

# Create the output layer for Department branch
department_output = Dense(units=1, activation='sigmoid', name='department_output')(department_hidden)

# Create a branch for Attrition with a hidden layer and an output layer

# Create the hidden layer for Attrition branch
attrition_input = Input(shape=(1,), name='attrition_input')
attrition_hidden = Dense(units=8, activation='relu')(attrition_input)

# Create the output layer for Attrition branch
attrition_output = Dense(units=1, activation='sigmoid', name='attrition_output')(attrition_hidden)

# Concatenate the shared layers with the Department and Attrition branches
merged = Concatenate()([shared_layer_3, department_hidden, attrition_hidden])

# Final output layer
final_output = Dense(units=1, activation='sigmoid', name='final_output')(merged)

# Create the final model
model = Model(inputs=[input_layer, department_input, attrition_input], outputs=[final_output, department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()




Number of columns in X_train_scaled: 22
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 32)           736         ['input_2[0][0]']                
                                                                                                  
 dense_9 (Dense)                (None, 16)           528         ['dense_8[0][0]']                
                                                                                                  
 department_input (InputLayer)  [(None, 3)]          0           []                               
                                                    

In [118]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate

# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]
print("Number of columns in X_train_scaled:", input_dim)

# Create the input layer
input_layer = Input(shape=(input_dim,))
shared_layer_1 = Dense(units=32, activation='relu')(input_layer)
shared_layer_2 = Dense(units=16, activation='relu')(shared_layer_1)
shared_layer_3 = Dense(units=8, activation='relu')(shared_layer_2)

# Create a branch for Department with a hidden layer and an output layer

# Create the hidden layer for Department branch
department_input = Input(shape=(department_train_encoded.shape[1],), name='department_input')
department_hidden = Dense(units=8, activation='relu')(department_input)

# Create the output layer for Department branch
department_output = Dense(units=1, activation='sigmoid', name='department_output')(department_hidden)

# Create a branch for Attrition with a hidden layer and an output layer

# Create the hidden layer for Attrition branch
attrition_input = Input(shape=(1,), name='attrition_input')
attrition_hidden = Dense(units=8, activation='relu')(attrition_input)

# Create the output layer for Attrition branch
attrition_output = Dense(units=1, activation='sigmoid', name='attrition_output')(attrition_hidden)

# Concatenate the shared layers with the Department and Attrition branches
merged = Concatenate()([shared_layer_3, department_hidden, attrition_hidden])

# Final output layer
final_output = Dense(units=1, activation='sigmoid', name='final_output')(merged)

# Create the model
model = Model(inputs=[input_layer, department_input, attrition_input], outputs=[final_output, department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model
model.summary()


Number of columns in X_train_scaled: 22
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 dense_13 (Dense)               (None, 32)           736         ['input_3[0][0]']                
                                                                                                  
 dense_14 (Dense)               (None, 16)           528         ['dense_13[0][0]']               
                                                                                                  
 department_input (InputLayer)  [(None, 3)]          0           []                               
                                                    

In [132]:
#Convert the y_train and y_test labels to numerical values
y_train_encoded = y_train.apply(lambda x: 1 if x == 'Yes' else 0).values
y_test_encoded = y_test.apply(lambda x: 1 if x == 'Yes' else 0).values

print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test_encoded shape:", y_test_encoded.shape)

#Build and compile the model
model = Sequential()
model.add(Dense(units=32, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Train the model
history = model.fit(X_train_scaled, y_train_encoded, 
                    epochs=50, batch_size=32, 
                    validation_data=(X_test_scaled, y_test_encoded))

# Print the training history keys to confirm training
print(history.history.keys())






y_train_encoded shape: (1176,)
y_test_encoded shape: (294,)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [133]:
# Evaluate the model with the testing data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_encoded, verbose=2)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


10/10 - 0s - loss: 0.4817 - accuracy: 0.8469 - 201ms/epoch - 20ms/step
Test Loss: 0.4817318022251129
Test Accuracy: 0.8469387888908386


In [138]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from sklearn.preprocessing import OneHotEncoder

# Data Preparation
y_train_encoded = y_train.apply(lambda x: 1 if x == 'Yes' else 0).values
y_test_encoded = y_test.apply(lambda x: 1 if x == 'Yes' else 0).values

department_encoder = OneHotEncoder(sparse=False)
department_train_encoded = department_encoder.fit_transform(y_df.loc[y_train.index, 'Department'].values.reshape(-1, 1))
department_test_encoded = department_encoder.transform(y_df.loc[y_test.index, 'Department'].values.reshape(-1, 1))

# Model Building
# Input for attrition prediction
attrition_input = Input(shape=(X_train_scaled.shape[1],), name='attrition_input')

# Input for department prediction
department_input = Input(shape=(department_train_encoded.shape[1],), name='department_input')

# Shared layers for attrition prediction
shared_layer = Dense(units=32, activation='relu')(attrition_input)
shared_layer = Dense(units=16, activation='relu')(shared_layer)
shared_layer = Dense(units=8, activation='relu')(shared_layer)

# Department branch
department_hidden = Dense(units=8, activation='relu')(department_input)
department_output = Dense(units=1, activation='sigmoid', name='department_output')(department_hidden)

# Merge shared layers and department branch
merged = Concatenate()([shared_layer, department_hidden])

# Final output for attrition prediction
attrition_output = Dense(units=1, activation='sigmoid', name='attrition_output')(merged)

# Create the model with both outputs
model = Model(inputs=[attrition_input, department_input], outputs=[attrition_output, department_output])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Training
model.fit(
    [X_train_scaled, department_train_encoded],  # Inputs
    [y_train_encoded, department_train_encoded[:, 0]],  # Outputs
    epochs=50, batch_size=32,
    validation_data=([X_test_scaled, department_test_encoded], [y_test_encoded, department_test_encoded[:, 0]])
)

# Model Evaluation and Printing Accuracies
evaluation_results = model.evaluate(
    [X_test_scaled, department_test_encoded],
    [y_test_encoded, department_test_encoded[:, 0]],
    verbose=2
)

# evaluation_results: [total_loss, attrition_output_loss, department_output_loss, attrition_output_accuracy, department_output_accuracy]
print(f"Attrition Test Accuracy: {evaluation_results[3]}")
print(f"Department Test Accuracy: {evaluation_results[4]}")





Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
10/10 - 0s - loss: 0.5002 - attrition_output_loss: 0.4903 - department_output_loss: 0.0099 - attrition_output_accuracy: 0.8401 - department_output_accuracy: 1.0000 - 213ms/epoch - 21ms/step
Attrition Test Accuracy: 0.8401360511779785
Department Test Accuracy: 1.0


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. No, because the dataset might be imbalanced. Metrics like precision or recall might provide a better understanding of model performance.
2. Sigmoid, because it is suitable for binary classification tasks by providing a probability output between 0 and 1.
3. Use cross-validation for more reliable evaluation or different techniques possibly