## Part 1: Preprocessing

In [32]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [33]:
# Determine the number of unique values in each column.
attrition_df.nunique().sort_values(ascending=False)

HourlyRate                  71
Age                         43
TotalWorkingYears           40
YearsAtCompany              37
DistanceFromHome            29
YearsInCurrentRole          19
YearsWithCurrManager        18
YearsSinceLastPromotion     16
PercentSalaryHike           15
NumCompaniesWorked          10
JobRole                      9
TrainingTimesLastYear        7
EducationField               6
JobLevel                     5
Education                    5
EnvironmentSatisfaction      4
JobInvolvement               4
JobSatisfaction              4
RelationshipSatisfaction     4
StockOptionLevel             4
WorkLifeBalance              4
Department                   3
BusinessTravel               3
MaritalStatus                3
OverTime                     2
Attrition                    2
PerformanceRating            2
dtype: int64

In [34]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]

In [35]:
# Create a list of at least 10 column names to use as X data
X_columns_to_include = [
    column_name
    for column_name in attrition_df.columns
    if column_name not in ("Attrition", "Department")
]

# Create X_df using your selected columns
X_df = attrition_df[X_columns_to_include]

# Show the data types for X_df
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   BusinessTravel            1470 non-null   object
 2   DistanceFromHome          1470 non-null   int64 
 3   Education                 1470 non-null   int64 
 4   EducationField            1470 non-null   object
 5   EnvironmentSatisfaction   1470 non-null   int64 
 6   HourlyRate                1470 non-null   int64 
 7   JobInvolvement            1470 non-null   int64 
 8   JobLevel                  1470 non-null   int64 
 9   JobRole                   1470 non-null   object
 10  JobSatisfaction           1470 non-null   int64 
 11  MaritalStatus             1470 non-null   object
 12  NumCompaniesWorked        1470 non-null   int64 
 13  OverTime                  1470 non-null   object
 14  PercentSalaryHike       

In [37]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [46]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of the label encoder
le = LabelEncoder()

# List of categorical columns to be encoded
categorical_columns = [
    "OverTime",
    "BusinessTravel",
    "EducationField",
    "JobRole",
    "MaritalStatus",
]

# Fit the label encoder on the combined data from both training and test sets
for column in categorical_columns:
    combined_data = list(X_train[column]) + list(X_test[column])
    le.fit(combined_data)

    # Transform the training data
    X_train[column] = le.transform(X_train[column])

    # Transform the test data
    X_test[column] = le.transform(X_test[column])

# Print value counts for each encoded column in the training set
for column in categorical_columns:
    print(f"Value counts for {column} in training set:")
    print(X_train[column].value_counts())

# Print value counts for each encoded column in the test set
for column in categorical_columns:
    print(f"Value counts for {column} in test set:")
    print(X_test[column].value_counts())

Value counts for OverTime in training set:
OverTime
0    837
1    339
Name: count, dtype: int64
Value counts for BusinessTravel in training set:
BusinessTravel
2    835
1    228
0    113
Name: count, dtype: int64
Value counts for EducationField in training set:
EducationField
1    491
3    369
2    124
5    101
4     69
0     22
Name: count, dtype: int64
Value counts for JobRole in training set:
JobRole
7    254
6    242
2    204
4    107
0    105
3     79
8     75
5     68
1     42
Name: count, dtype: int64
Value counts for MaritalStatus in training set:
MaritalStatus
1    550
2    359
0    267
Name: count, dtype: int64
Value counts for OverTime in test set:
OverTime
0    217
1     77
Name: count, dtype: int64
Value counts for BusinessTravel in test set:
BusinessTravel
5    208
4     49
3     37
Name: count, dtype: int64
Value counts for EducationField in test set:
EducationField
7     115
9      95
8      35
11     31
10     13
6       5
Name: count, dtype: int64
Value counts for Job

In [47]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
pd.DataFrame(X_train_scaled, columns=X_train.columns).head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,-1.388559,0.589281,1.440396,-0.863356,2.085607,0.279706,-0.472832,-1.01234,-0.932274,-1.008801,...,-0.42929,-0.639822,2.547471,-1.167368,0.157319,0.357435,-0.974263,-0.888208,-0.67611,-1.142448
1,-2.040738,-2.463556,-0.522699,-0.863356,-0.930284,-0.639104,0.309374,0.389912,-0.932274,0.609133,...,-0.42929,1.211176,-0.945525,-1.423397,-0.613546,0.357435,-1.138573,-1.165051,-0.67611,-1.142448
2,-0.845077,0.589281,1.317703,-0.863356,-0.176312,1.198515,-1.059487,0.389912,-0.025447,1.013617,...,-0.42929,1.211176,0.218807,-0.143254,-0.613546,0.357435,-0.645643,-0.611364,-0.67611,-0.575084
3,0.241886,0.589281,0.336155,0.099933,0.577661,1.198515,-0.032841,0.389912,-0.025447,-0.199834,...,2.329427,0.285677,-0.945525,-0.527297,0.157319,0.357435,-0.317023,-0.057676,-0.355244,-1.142448
4,-0.627685,0.589281,1.317703,0.099933,-0.930284,-0.639104,1.09158,0.389912,-0.025447,-1.008801,...,-0.42929,-1.565321,0.218807,-0.143254,-0.613546,0.357435,0.504527,1.0497,-0.67611,-0.575084


In [18]:
# Create a OneHotEncoder for the Department column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data




In [19]:
# Create a OneHotEncoder for the Attrition column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data



## Create, Compile, and Train the Model

In [20]:
# Find the number of columns in the X training data


# Create the input layer


# Create at least two shared layers


In [21]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer


# Create the output layer



In [22]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer


# Create the output layer



In [23]:
# Create the model


# Compile the model


# Summarize the model


In [24]:
# Train the model



In [25]:
# Evaluate the model with the testing data


In [26]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 