## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

# Load the employee attrition dataset from the course-hosted CSV
attrition_url = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(attrition_url)

# Preview the first rows to sanity-check the load
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
# Columns with only a handful of distinct values are candidates for
# categorical treatment (e.g. one-hot encoding) rather than scaling.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Build the target frame: the two columns the model will learn to predict
target_columns = ['Attrition', 'Department']
y_df = attrition_df[target_columns]
y_df

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development
...,...,...
1465,No,Research & Development
1466,No,Research & Development
1467,No,Research & Development
1468,No,Sales


In [4]:
# List every column name in attrition_df (used to pick the X feature columns)
attrition_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [5]:
# Feature columns used as model input (at least 10 are required).
# The order here fixes the column order of X_df, so keep it stable.
X_columns = [
    'Age', 'HourlyRate', 'YearsAtCompany',
    'JobLevel', 'JobSatisfaction', 'WorkLifeBalance',
    'EnvironmentSatisfaction', 'DistanceFromHome', 'YearsInCurrentRole',
    'TotalWorkingYears', 'YearsSinceLastPromotion', 'RelationshipSatisfaction',
]

# Select just those columns from the full dataset
X_df = attrition_df[X_columns]

# Report the dtype of every selected feature
print("\nData types for X_df:", X_df.dtypes, sep="\n")



Data types for X_df:
Age                         int64
HourlyRate                  int64
YearsAtCompany              int64
JobLevel                    int64
JobSatisfaction             int64
WorkLifeBalance             int64
EnvironmentSatisfaction     int64
DistanceFromHome            int64
YearsInCurrentRole          int64
TotalWorkingYears           int64
YearsSinceLastPromotion     int64
RelationshipSatisfaction    int64
dtype: object


In [20]:
# Import the splitting and encoding utilities used in the following cells
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Dummy-encode Attrition and drop the first level, so the two-valued column
# collapses to a single indicator column. This frame is what gets passed to
# train_test_split (as the target and as the stratification key).
y = pd.get_dummies(y_df['Attrition'], drop_first=True)

In [22]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [23]:
# Create the scaler and learn its statistics from the training rows only,
# so that nothing about the test set feeds into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [30]:

# One-hot encode the Department target (dense array output)
ohe = OneHotEncoder(sparse_output=False)

# Pull the Department labels for each split as single-column 2-D arrays.
# Rows are selected through the indices of y_train / y_test so they line up
# with the feature splits.
department_train = y_df.loc[y_train.index, 'Department'].values.reshape(-1, 1)
department_test = y_df.loc[y_test.index, 'Department'].values.reshape(-1, 1)

# Learn the category set from the training labels only
ohe.fit(department_train)

# Column labels of the form "Department_<category>"
department_columns = ohe.get_feature_names_out(['Department'])

# Encode each split and wrap the result in a DataFrame that keeps the
# original row index
department_train_encoded = pd.DataFrame(
    ohe.transform(department_train),
    index=y_train.index,
    columns=department_columns,
)
department_test_encoded = pd.DataFrame(
    ohe.transform(department_test),
    index=y_test.index,
    columns=department_columns,
)

In [31]:
# One-hot encode the Attrition target (dense array output)
attrition_encoder = OneHotEncoder(sparse_output=False)

# Attrition labels for each split, reshaped to a single column as the
# encoder expects 2-D input; rows follow the y_train / y_test indices.
attrition_train = y_df.loc[y_train.index, 'Attrition'].values.reshape(-1, 1)
attrition_test = y_df.loc[y_test.index, 'Attrition'].values.reshape(-1, 1)

# Categories are learned from the training labels only
attrition_encoder.fit(attrition_train)

# Column labels of the form "Attrition_<category>"
attrition_columns = attrition_encoder.get_feature_names_out(['Attrition'])

# Encode both splits, preserving each split's row index
attrition_train_encoded = pd.DataFrame(
    attrition_encoder.transform(attrition_train),
    index=y_train.index,
    columns=attrition_columns,
)
attrition_test_encoded = pd.DataFrame(
    attrition_encoder.transform(attrition_test),
    index=y_test.index,
    columns=attrition_columns,
)



In [32]:
# Sanity-check the Attrition encoding: both splits should have one column
# per category, and each row should contain a single 1.0
print("Training encoded attrition shape:", attrition_train_encoded.shape)
print("Testing encoded attrition shape:", attrition_test_encoded.shape)
print(
    "\nFirst few rows of encoded training attrition:",
    attrition_train_encoded.head(),
    sep="\n",
)

Training encoded attrition shape: (1176, 2)
Testing encoded attrition shape: (294, 2)

First few rows of encoded training attrition:
      Attrition_No  Attrition_Yes
1194           1.0            0.0
128            1.0            0.0
810            1.0            0.0
478            1.0            0.0
491            1.0            0.0


## Create, Compile, and Train the Model

In [33]:
# The input width is the number of scaled feature columns
input_features = X_train_scaled.shape[1]
print("Number of input features:", input_features)

# Input layer for the functional model
input_layer = layers.Input(shape=(input_features,))

# Shared block 1: Dense(32, ReLU) -> BatchNorm -> Dropout(0.2).
# shared_layer1 refers to the block's final (post-dropout) tensor.
hidden = layers.Dense(units=32, activation='relu')(input_layer)
hidden = layers.BatchNormalization()(hidden)
shared_layer1 = layers.Dropout(0.2)(hidden)

# Shared block 2: Dense(16, ReLU) -> BatchNorm -> Dropout(0.2).
# shared_layer2 is the tensor both output branches attach to.
hidden = layers.Dense(units=16, activation='relu')(shared_layer1)
hidden = layers.BatchNormalization()(hidden)
shared_layer2 = layers.Dropout(0.2)(hidden)

print(
    "\nNetwork structure so far:",
    f"Input shape: {input_features}",
    "Shared layer 1: 32 units with ReLU, BatchNorm, and 20% dropout",
    "Shared layer 2: 16 units with ReLU, BatchNorm, and 20% dropout",
    sep="\n",
)

Number of input features: 12

Network structure so far:
Input shape: 12
Shared layer 1: 32 units with ReLU, BatchNorm, and 20% dropout
Shared layer 2: 16 units with ReLU, BatchNorm, and 20% dropout


In [36]:
# Attrition branch: one hidden block on top of the shared layers,
# followed by the Attrition output layer.
attrition_hidden = layers.Dense(units=8, activation='relu')(shared_layer2)
attrition_hidden = layers.BatchNormalization()(attrition_hidden)
attrition_branch = layers.Dropout(0.1)(attrition_hidden)

# Output layer: one softmax unit per one-hot encoded Attrition column
n_attrition = attrition_train_encoded.shape[1]
attrition_output = layers.Dense(
    units=n_attrition,
    activation='softmax',
    name='attrition_output',
)(attrition_branch)

print(
    "\nAttrition Branch Structure:",
    "Hidden layer: 8 units with ReLU, BatchNorm, and 10% dropout",
    f"Output layer: {n_attrition} units with softmax activation",
    sep="\n",
)




Attrition Branch Structure:
Hidden layer: 8 units with ReLU, BatchNorm, and 10% dropout
Output layer: 2 units with softmax activation


In [37]:
# Department branch: one hidden block on top of the shared layers,
# followed by the Department output layer.
department_hidden = layers.Dense(units=8, activation='relu')(shared_layer2)
department_hidden = layers.BatchNormalization()(department_hidden)
department_branch = layers.Dropout(0.1)(department_hidden)

# Output layer: one softmax unit per one-hot encoded Department column
n_departments = department_train_encoded.shape[1]
department_output = layers.Dense(
    units=n_departments,
    activation='softmax',
    name='department_output',
)(department_branch)

print(
    "\nDepartment Branch Structure:",
    "Hidden layer: 8 units with ReLU, BatchNorm, and 10% dropout",
    f"Output layer: {n_departments} units with softmax activation",
    sep="\n",
)




Department Branch Structure:
Hidden layer: 8 units with ReLU, BatchNorm, and 10% dropout
Output layer: 3 units with softmax activation


In [38]:
# Create the complete model.
# Both output tensors (attrition_output and department_output) were already
# built on top of shared_layer2 in the two branch cells, so the model is
# assembled once here; the branch layers are not re-created.
model = Model(
   inputs=input_layer,
   outputs=[attrition_output, department_output]
)

# Compile the model.
# Each head is a softmax over one-hot encoded targets, so both use
# categorical cross-entropy. The dict keys must match the `name=` given to
# the corresponding output layers.
model.compile(
   optimizer='adam',
   loss={
       'attrition_output': 'categorical_crossentropy',
       'department_output': 'categorical_crossentropy'
   },
   metrics={
       'attrition_output': 'accuracy',
       'department_output': 'accuracy'
   }
)

# Display model summary
model.summary()


In [43]:
# Evaluate the model with the testing data.
# NOTE(review): no model.fit(...) call is visible between compile and this
# cell in this view — confirm the training cell exists and runs first.

# Targets are keyed by output-layer name so each head is scored against
# its own one-hot encoded labels.
test_targets = {
    'attrition_output': attrition_test_encoded,
    'department_output': department_test_encoded,
}
evaluation = model.evaluate(X_test_scaled, test_targets)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8361 - attrition_output_loss: 0.4001 - department_output_accuracy: 0.6516 - department_output_loss: 0.7899 - loss: 1.1854 


In [44]:
# Print the loss and accuracy for both attrition and department.
# The labels are matched to `evaluation` by position.
# NOTE(review): this ordering (total loss, per-output losses, per-output
# accuracies) is what this run returned — verify if the Keras version changes.
metric_labels = (
    "Test Loss",
    "Attrition Output Loss",
    "Department Output Loss",
    "Attrition Output Accuracy",
    "Department Output Accuracy",
)
for position, label in enumerate(metric_labels):
    print(f"{label}: {evaluation[position]}")

Test Loss: 1.228675365447998
Attrition Output Loss: 0.4193209707736969
Department Output Loss: 0.8347762227058411
Attrition Output Accuracy: 0.8299319744110107
Department Output Accuracy: 0.6496598720550537


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 

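1. For the Attrition output, yes: its test accuracy (about 83%) is above 75%. For the Department output, no: its test accuracy (about 65%) falls below that threshold. Note that accuracy alone can be misleading if the classes are imbalanced, so metrics such as precision and recall would be worth checking as well.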

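2. Both output layers use the softmax activation, since each target is one-hot encoded across its classes (the hidden layers use ReLU). The network is built from Dense, BatchNormalization, and Dropout layers. I am not sure what these do; Claude gave me these.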

3. Cleaning the data further should help. Using fewer columns from the data frame, or splitting the problem into separate models, might also improve the results.