## Part 1: Preprocessing

In [27]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow import keras


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [28]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [29]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]


In [30]:
# Create a list of at least 10 column names to use as X data
X_columns = ["Age", "BusinessTravel", "DistanceFromHome", "Education",
             "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement",
             "PerformanceRating", "TotalWorkingYears", "YearsAtCompany"]

# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
X_df.dtypes

Unnamed: 0,0
Age,int64
BusinessTravel,object
DistanceFromHome,int64
Education,int64
EnvironmentSatisfaction,int64
HourlyRate,int64
JobInvolvement,int64
PerformanceRating,int64
TotalWorkingYears,int64
YearsAtCompany,int64


In [31]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Targets (y)
y_department = attrition_df[["Department"]]
y_attrition = attrition_df[["Attrition"]]

# Train-test split
X_train, X_test, y_train_department, y_test_department, y_train_attrition, y_test_attrition = train_test_split(
    X_df, y_department, y_attrition, test_size=0.2, random_state=42
)


In [32]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Encoder for BusinessTravel
travel_encoder = OneHotEncoder(sparse_output=False)

# Fit on training data and transform both sets
X_train_travel = travel_encoder.fit_transform(X_train[["BusinessTravel"]])
X_test_travel = travel_encoder.transform(X_test[["BusinessTravel"]])

# Drop original 'BusinessTravel' and concatenate the encoded columns
import numpy as np
X_train_encoded = np.concatenate([X_train.drop(columns=["BusinessTravel"]), X_train_travel], axis=1)
X_test_encoded = np.concatenate([X_test.drop(columns=["BusinessTravel"]), X_test_travel], axis=1)

In [33]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train_encoded)

# Scale the training and testing data
X_test_scaled = scaler.transform(X_test_encoded)


In [34]:
# Create a OneHotEncoder for the Department column

dept_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
dept_encoder.fit(y_train_department["Department"].values.reshape(-1, 1))

# Create two new variables by applying the encoder
# to the training and testing data
y_train_department_encoded = dept_encoder.transform(np.array(y_train_department["Department"]).reshape(-1, 1))
y_test_department_encoded = dept_encoder.transform(np.array(y_test_department["Department"]).reshape(-1, 1))

y_train_department_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [35]:
# Convert to Dataframe
y_train_department_encoded_df = pd.DataFrame(
    y_train_department_encoded,
    columns=dept_encoder.get_feature_names_out(['Department']),
    index=y_train_department.index
)

y_test_department_encoded_df = pd.DataFrame(
    y_test_department_encoded,
    columns=dept_encoder.get_feature_names_out(['Department']),
    index=y_test_department.index
)


In [36]:
print("Encoded training data (Department):")
print(y_train_department_encoded[:5])

print("\nEncoded testing data (Department):")
print(y_test_department_encoded[:5])

Encoded training data (Department):
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]

Encoded testing data (Department):
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [37]:
print("Encoded training data (Department):")
print(y_train_department_encoded_df[:5])

print("\nEncoded testing data (Department):")
print(y_test_department_encoded_df[:5])

Encoded training data (Department):
      Department_Human Resources  Department_Research & Development  \
1097                         0.0                                1.0   
727                          0.0                                1.0   
254                          0.0                                0.0   
1175                         0.0                                1.0   
1341                         0.0                                1.0   

      Department_Sales  
1097               0.0  
727                0.0  
254                1.0  
1175               0.0  
1341               0.0  

Encoded testing data (Department):
      Department_Human Resources  Department_Research & Development  \
1041                         0.0                                0.0   
184                          0.0                                1.0   
1222                         1.0                                0.0   
67                           0.0                                1.0

In [38]:
# Create a OneHotEncoder for the Attrition column
#attrition_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
attrition_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)
#attrition_encoder.fit(y_train_attrition)
attrition_encoder.fit(np.array(y_train_attrition['Attrition']).reshape(-1, 1))
# Create two new variables by applying the encoder
# to the training and testing data

#y_train_attrition_encoded = attrition_encoder.transform(y_train_attrition)
#y_test_attrition_encoded = attrition_encoder.transform(y_test_attrition)
y_train_attrition_encoded = attrition_encoder.transform(np.array(y_train_attrition['Attrition']).reshape(-1, 1))
y_test_attrition_encoded = attrition_encoder.transform(np.array(y_test_attrition['Attrition']).reshape(-1, 1))

y_train_attrition_encoded

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [39]:
y_train_attrition_encoded_df = pd.DataFrame(
    y_train_attrition_encoded,
    columns=attrition_encoder.get_feature_names_out(['Attrition']),
    index=y_train_attrition.index
)

y_test_attrition_encoded_df = pd.DataFrame(
    y_test_attrition_encoded,
    columns=attrition_encoder.get_feature_names_out(['Attrition']),
    index=y_test_attrition.index
)


In [40]:
# Print the encoded outputs for verification
print("Encoded training data (Attrition):")
print(y_train_attrition_encoded_df[:5])

print("\nEncoded testing data (Attrition):")
print(y_test_attrition_encoded_df[:5])

Encoded training data (Attrition):
      Attrition_No  Attrition_Yes
1097           1.0            0.0
727            1.0            0.0
254            1.0            0.0
1175           1.0            0.0
1341           1.0            0.0

Encoded testing data (Attrition):
      Attrition_No  Attrition_Yes
1041           1.0            0.0
184            1.0            0.0
1222           0.0            1.0
67             1.0            0.0
220            1.0            0.0


## Create, Compile, and Train the Model

In [41]:
# Find the number of columns in the X training data
input_features = X_train_scaled.shape[1]


# Create the input layer
input_layer = layers.Input(shape=(input_features,), name='input_layer')


# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu', name='shared_layer1')(input_layer)
shared_layer2 = layers.Dense(128, activation='relu', name='shared_layer2')(shared_layer1)

In [42]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
#dept_hidden = layers.Dense(16, activation="relu")(shared_layer2)
dept_hidden = layers.Dense(16, activation="relu", name='dept_hidden')(shared_layer2)


# Create the output layer
dept_output = layers.Dense(
    y_train_department_encoded.shape[1],
    activation="softmax",
    name="Department_Output"
)(dept_hidden)


In [43]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attr_hidden = layers.Dense(32, activation="relu", name='attr_hidden')(shared_layer2)


# Create the output layer
# Attrition Output Layer (Two neurons for one-hot encoding)
# attr_output = layers.Dense(
   #  y_train_attrition_encoded.shape[1],
    # activation="sigmoid",
   #  name="Attrition_Output"
# )(attr_hidden)
attr_output = layers.Dense(
    1,  # Single neuron for binary classification
    activation="sigmoid",  # Use sigmoid for binary classification
    name="Attrition_Output"
)(attr_hidden)

In [44]:
# Create the model
model = Model(inputs=input_layer, outputs=[dept_output, attr_output])

# Compile the model
model.compile(
    optimizer="adam",
    loss={
        "Department_Output": "categorical_crossentropy",
        "Attrition_Output": "categorical_crossentropy",
    },
    metrics={
        "Department_Output": "accuracy",
        "Attrition_Output": "accuracy",
    }
)


# Summarize the model
model.summary()

In [43]:
print(y_train_department_encoded.shape)

(1176, 3)


In [44]:
print(y_train_attrition_encoded.shape)

(1176, 2)


In [18]:
print("Shape of X_train_scaled:", X_train_scaled.shape)

Shape of X_train_scaled: (1176, 12)


In [19]:
print("Shape of y_train_department_encoded:", y_train_department_encoded.shape)
print("Sample y_train_department_encoded:", y_train_department_encoded[:5])

print("Shape of y_train_attrition_encoded:", y_train_attrition_encoded.shape)
print("Sample y_train_attrition_encoded:", y_train_attrition_encoded[:5])

Shape of y_train_department_encoded: (1176, 3)
Sample y_train_department_encoded: [[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]
Shape of y_train_attrition_encoded: (1176, 2)
Sample y_train_attrition_encoded: [[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [20]:
print(y_train_attrition["Attrition"].unique())

['No' 'Yes']


In [21]:
print("y_train_attrition_encoded shape:", y_train_attrition_encoded.shape)
print("Unique values in y_train_attrition_encoded:", np.unique(y_train_attrition_encoded, axis=0))


y_train_attrition_encoded shape: (1176, 2)
Unique values in y_train_attrition_encoded: [[0. 1.]
 [1. 0.]]


In [45]:
# Train the model
# Combine outputs: 'Department_Output' for department and 'Attrition_Output' for attrition
## y_train_targets = {
  ##  'Attrition_Output': y_train_attrition_encoded,  # Already one-hot encoded
   ## 'Department_Output': y_train_department_encoded  # Already one-hot encoded
## }

## history = model.fit(
##    X_train_scaled,
    ## y_train_targets,  # Directly use the one-hot encoded labels
    ## epochs=100,
    ## batch_size=32,
    ## validation_split=0.2,
    ## $$verbose=1
##)
model.fit(
    X_train_scaled,
    {
        "Department_Output": y_train_department_encoded_df.values,
        "Attrition_Output": y_train_attrition_encoded_df.values,
    },
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/100


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 2), output.shape=(None, 3)

In [None]:
print("Unique values in y_train_department:", y_train_department["Department"].unique())
print("Unique values in y_train_attrition:", y_train_attrition["Attrition"].unique())



In [46]:
# Evaluate the model with the testing data
loss, dept_acc, attr_acc = model.evaluate(
    X_test_scaled,
    {
        "Department_Output": y_test_department_encoded,
        "Attrition_Output": y_test_attrition_encoded
    }
)

ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 2), output.shape=(None, 3)

In [None]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {dept_acc}")
print(f"Attrition Accuracy: {attr_acc}")

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy can be a good baseline metric to measure data results but should be evaluated with caution. If the data is imbalanced than accuracy can be a misleading metric.
2. For the Attrition output I used a sigmoid activation because it was a binary classification problem. For the Department output I used softmax because it is multiclass.
3. Two things that come to mind first are experiment with hyperparameter tuning changing number of layers and neurons in the model. Secondly, you could preprocess the data a little more with some feature engineering. Doing these things could have a positive impact on the models result