## Part 1: Preprocessing

In [91]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [92]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [93]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df['Department']
y_df.head()


0                     Sales
1    Research & Development
2    Research & Development
3    Research & Development
4    Research & Development
Name: Department, dtype: object

In [94]:
print(attrition_df.columns)

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


In [95]:
# Create a list of at least 10 column names to use as X data
X_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 'OverTime', 
             'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 
             'YearsSinceLastPromotion', 'NumCompaniesWorked', 'YearsWithCurrManager', 'RelationshipSatisfaction', 'MaritalStatus', 'BusinessTravel', 'JobLevel', 'JobRole', 'Attrition', 'Department']

# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
print(X_df.dtypes)

Education                    int64
Age                          int64
DistanceFromHome             int64
JobSatisfaction              int64
OverTime                    object
StockOptionLevel             int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
NumCompaniesWorked           int64
YearsWithCurrManager         int64
RelationshipSatisfaction     int64
MaritalStatus               object
BusinessTravel              object
JobLevel                     int64
JobRole                     object
Attrition                   object
Department                  object
dtype: object


In [96]:
print(X_df['BusinessTravel'])

0           Travel_Rarely
1       Travel_Frequently
2           Travel_Rarely
3       Travel_Frequently
4           Travel_Rarely
              ...        
1465    Travel_Frequently
1466        Travel_Rarely
1467        Travel_Rarely
1468    Travel_Frequently
1469        Travel_Rarely
Name: BusinessTravel, Length: 1470, dtype: object


In [97]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)



X_train shape: (1176, 18)
X_test shape: (294, 18)
y_train shape: (1176,)
y_test shape: (294,)


In [98]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Convert categorical variables to numeric using one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# List of categorical columns
categorical_columns = ['OverTime', 'MaritalStatus', 'BusinessTravel', 'JobRole']

# Initialize the OneHotEncoder
onehot = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the categorical columns for training data
X_train_encoded = onehot.fit_transform(X_train[categorical_columns])

# Transform the categorical columns for testing data
X_test_encoded = onehot.transform(X_test[categorical_columns])

# Get the feature names after one-hot encoding
feature_names = onehot.get_feature_names_out(categorical_columns)

# Create new DataFrames with encoded categorical variables
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=feature_names, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=feature_names, index=X_test.index)

# Drop original categorical columns and concatenate with encoded columns
X_train = pd.concat([X_train.drop(columns=categorical_columns), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_columns), X_test_encoded_df], axis=1)

# Convert remaining columns to numeric type
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Handle any remaining non-numeric columns (if any)
for column in X_train.columns:
    if X_train[column].dtype == 'object':
        X_train[column] = pd.Categorical(X_train[column]).codes
        X_test[column] = pd.Categorical(X_test[column]).codes

print("Data types after conversion:")
print(X_train.dtypes)
print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)


Data types after conversion:
Education                              int64
Age                                    int64
DistanceFromHome                       int64
JobSatisfaction                        int64
StockOptionLevel                       int64
WorkLifeBalance                        int64
YearsAtCompany                         int64
YearsSinceLastPromotion                int64
NumCompaniesWorked                     int64
YearsWithCurrManager                   int64
RelationshipSatisfaction               int64
JobLevel                               int64
Attrition                            float64
Department                           float64
OverTime_No                          float64
OverTime_Yes                         float64
MaritalStatus_Divorced               float64
MaritalStatus_Married                float64
MaritalStatus_Single                 float64
BusinessTravel_Non-Travel            float64
BusinessTravel_Travel_Frequently     float64
BusinessTravel_Travel_Rare

In [99]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("Shape of scaled X_train:", X_train_scaled.shape)
print("Shape of scaled X_test:", X_test_scaled.shape)

Shape of scaled X_train: (1176, 31)
Shape of scaled X_test: (294, 31)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [100]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(drop='first')

# Ensure y_train and y_test are DataFrames with 'Department' column
y_train_df = y_train.to_frame() if isinstance(y_train, pd.Series) else y_train
y_test_df = y_test.to_frame() if isinstance(y_test, pd.Series) else y_test

# Fit the encoder to the training data
department_encoder.fit(y_train_df[['Department']])

# Create two new variables by applying the encoder to the training and testing data
y_train_department_encoded = department_encoder.transform(y_train_df[['Department']])
y_test_department_encoded = department_encoder.transform(y_test_df[['Department']])

print("Shape of encoded Department for training:", y_train_department_encoded.shape)
print("Shape of encoded Department for testing:", y_test_department_encoded.shape)

Shape of encoded Department for training: (1176, 2)
Shape of encoded Department for testing: (294, 2)


In [101]:
# Check if 'Attrition' is in the Series
if 'Attrition' not in y_train.name:
    print("Error: 'Attrition' is not the name of the Series.")
    print("Series name:", y_train.name)
else:
    # Create a OneHotEncoder for the Attrition column
    attrition_encoder = OneHotEncoder(drop='first', sparse=False)

    # Ensure y_train and y_test are DataFrames with 'Attrition' column
    y_train_df = y_train.to_frame()
    y_test_df = y_test.to_frame()

    # Fit the encoder to the training data
    attrition_encoder.fit(y_train_df)

    # Create two new variables by applying the encoder
    # to the training and testing data
    y_train_attrition_encoded = attrition_encoder.transform(y_train_df)
    y_test_attrition_encoded = attrition_encoder.transform(y_test_df)

    print("Shape of encoded Attrition for training:", y_train_attrition_encoded.shape)
    print("Shape of encoded Attrition for testing:", y_test_attrition_encoded.shape)


Error: 'Attrition' is not the name of the Series.
Series name: Department


## Create, Compile, and Train the Model

In [102]:
# Find the number of columns in the X training data
input_shape = X_train.shape[1]

# Import necessary layers from Keras
from tensorflow.keras.layers import Input, Dense

# Create the input layer
inputs = Input(shape=(input_shape,))

# Create at least two shared layers
shared_layer1 = Dense(64, activation='relu')(inputs)
shared_layer2 = Dense(32, activation='relu')(shared_layer1)
shared_layer3 = Dense(16, activation='relu')(shared_layer2)


In [103]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden = Dense(8, activation='relu')(shared_layer3)

# Create the output layer
department_output = Dense(3, activation='softmax', name='department_output')(department_hidden)


In [104]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = Dense(8, activation='relu')(shared_layer3)

# Create the output layer
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)


In [105]:
# Create the model
from tensorflow.keras.models import Model
model = Model(inputs=inputs, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy',
                       'attrition_output': 'accuracy'})

# Summarize the model
model.summary()


In [106]:
print(X_train.dtypes)

Education                              int64
Age                                    int64
DistanceFromHome                       int64
JobSatisfaction                        int64
StockOptionLevel                       int64
WorkLifeBalance                        int64
YearsAtCompany                         int64
YearsSinceLastPromotion                int64
NumCompaniesWorked                     int64
YearsWithCurrManager                   int64
RelationshipSatisfaction               int64
JobLevel                               int64
Attrition                            float64
Department                           float64
OverTime_No                          float64
OverTime_Yes                         float64
MaritalStatus_Divorced               float64
MaritalStatus_Married                float64
MaritalStatus_Single                 float64
BusinessTravel_Non-Travel            float64
BusinessTravel_Travel_Frequently     float64
BusinessTravel_Travel_Rarely         float64
JobRole_He

In [112]:
# Prepare the target variables
y_department = pd.get_dummies(df['Department'])
y_attrition = df['Attrition'].map({'Yes': 1, 'No': 0})

# Prepare the input features
X = df.drop(['Department', 'Attrition'], axis=1)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train_department, y_test_department, y_train_attrition, y_test_attrition = train_test_split(
    X, y_department, y_attrition, test_size=0.2, random_state=42
)

# Train the model
history = model.fit(
    X_train, 
    [y_train_department, y_train_attrition],
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Plot training history
import matplotlib.pyplot as plt

# Plot loss
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

# Plot accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['department_output_accuracy'])
plt.plot(history.history['attrition_output_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Department', 'Attrition'], loc='lower right')

plt.tight_layout()
plt.show()


IndexError: Index dimension must be 1 or 2

In [114]:
# Evaluate the model with the testing data

# Print the evaluation results

# Make predictions on the test data

# Get the predicted classes for Department and Attrition

# Calculate confusion matrices


In [115]:
# Print the accuracy for both department and attrition
# Calculate accuracy for department prediction

# Calculate accuracy for attrition prediction

# Calculate and print F1 scores for both predictions


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

# YOUR ANSWERS HERE
 
1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?
 
3. Can you name a few ways that this model might be improved?
