## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [4]:
# Create a list of at least 10 column names to use as X data
x_list = ['Age', 'DistanceFromHome', 'Education', 'JobSatisfaction', 'OverTime', 'PercentSalaryHike', 
          'JobLevel', 'YearsAtCompany', 'YearsSinceLastPromotion', 'HourlyRate','EnvironmentSatisfaction']

# Create X_df using your selected columns
X_df = attrition_df[x_list]

# Show the data types for X_df
X_df.dtypes



Age                         int64
DistanceFromHome            int64
Education                   int64
JobSatisfaction             int64
OverTime                   object
PercentSalaryHike           int64
JobLevel                    int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
HourlyRate                  int64
EnvironmentSatisfaction     int64
dtype: object

In [5]:
# Convert your X data to numeric data types however you see fit
encoder = LabelEncoder()

# Assuming df is your DataFrame and 'OverTime' is the column to encode
X_df['OverTime'] = encoder.fit_transform(X_df['OverTime'])

# Check the unique values to confirm
X_df['OverTime'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['OverTime'] = encoder.fit_transform(X_df['OverTime'])


OverTime
0    1054
1     416
Name: count, dtype: int64

In [6]:
X_df.dtypes


Age                        int64
DistanceFromHome           int64
Education                  int64
JobSatisfaction            int64
OverTime                   int64
PercentSalaryHike          int64
JobLevel                   int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
HourlyRate                 int64
EnvironmentSatisfaction    int64
dtype: object

In [7]:
# In the given code, the split occured before encoding, but that lead to errors. So I split data after ecoding OverTime column.
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)


In [8]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
# Initialize OneHotEncoder
encoder_dept = OneHotEncoder() 

# Fit the encoder to the training data
encoder_dept.fit(y_train[['Department']])

# Ensure y_train and y_test are reshaped properly for transformation
y_train_encoded_dept = encoder_dept.transform(y_train[['Department']]).toarray()
y_test_encoded_dept = encoder_dept.transform(y_test[['Department']]).toarray()

y_train_encoded_dept

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [10]:
# Create a OneHotEncoder for the Attrition column
encoder_attr = OneHotEncoder() 

# Fit the encoder to the training data
encoder_attr.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded_attr = encoder_attr.transform(y_train[['Attrition']]).toarray()
y_test_encoded_attr = encoder_attr.transform(y_test[['Attrition']]).toarray()

y_train_encoded_attr

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [11]:
#Viewed the y_train shapes to know the units neeed in the output layer
print(y_train_encoded_attr.shape)
print(y_train_encoded_dept.shape)

(1102, 2)
(1102, 3)


## Create, Compile, and Train the Model

In [12]:
# Find the number of columns in the X training data
len(X_train.columns)


11

In [21]:
# Create the input layer
input_layer = layers.Input(shape=(11,), name='input_features')

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(128, activation='relu')(shared_layer1)

In [22]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
hidden_layer_dept = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer
department_output = layers.Dense(3, activation='softmax', name='department_output')(hidden_layer_dept)


In [23]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
hidden_layer_attr = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer
attrition_output = layers.Dense(2, activation='sigmoid', name='attrition_output')(hidden_layer_attr)


In [24]:
# Create the model
model = Model(inputs=input_layer, outputs=[
    department_output,
    attrition_output
])

# Compile the model
model.compile(optimizer='adam', 
              loss= {'department_output': 'categorical_crossentropy', 
                    'attrition_output': 'binary_crossentropy'
                    }, 
              metrics= {'department_output': 'accuracy', 
                    'attrition_output': 'accuracy'
                    })

# Summarize the model
model.summary()

In [25]:
# Train the model
history = model.fit(
    X_train,
    {
        'department_output': y_train_encoded_dept,
        'attrition_output': y_train_encoded_attr
    },
    epochs=100, 
    batch_size=32, 
    validation_split=0.2,
    verbose=1
)


Epoch 1/100


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - attrition_output_accuracy: 0.7437 - department_output_accuracy: 0.5694 - loss: 1.9451 - val_attrition_output_accuracy: 0.8235 - val_department_output_accuracy: 0.6471 - val_loss: 1.2472
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8203 - department_output_accuracy: 0.6443 - loss: 1.2798 - val_attrition_output_accuracy: 0.8235 - val_department_output_accuracy: 0.6606 - val_loss: 1.3039
Epoch 3/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8268 - department_output_accuracy: 0.5913 - loss: 1.3199 - val_attrition_output_accuracy: 0.8235 - val_department_output_accuracy: 0.6516 - val_loss: 1.2178
Epoch 4/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8104 - department_output_accuracy: 0.6445 - loss: 1.2460 - val_attrition_outp

In [26]:
# Evaluate the model with the testing data
results = model.evaluate(X_test, {
    'department_output': y_test_encoded_dept,
    'attrition_output': y_test_encoded_attr
}, verbose=1)

results


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 739us/step - attrition_output_accuracy: 0.8666 - department_output_accuracy: 0.6115 - loss: 1.4443


[1.3989639282226562, 0.8722826242446899, 0.6086956262588501]

In [27]:
# Print the accuracy for both department and attrition
print("Department Accuracy:", results[2])
print("Attrition Accuracy:", results[1])

Department Accuracy: 0.6086956262588501
Attrition Accuracy: 0.8722826242446899


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. I believe it is. The model accuracy on the training data was close to accuracy on the testing data. Showing that the accuracy was a good measure of the models performance, even on 'new' data. 
2. I chose softmax for the department output because the department is mutually exclusive, you can not simultaneously work for two departments. I chose sigmoid for the attrition output because it is a binary classification. 
3. I improved model accuracy by adding another feature, 11 instead of 10, and carefully selecting features that would be related to the targets. I played around with a few features before getting to the accuracy of 87% I have now. For example, when I added relationship satisfaction or work life balance and increased the iput layer to 12, it weakended the model accuracy closer to 80% for attrition. 