# Data Transformation Exercise - Solutions

In [None]:
import pandas as pd

## HR Analytics: Predict employee attrition

In [None]:
# Load the employee attrition dataset from CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer="employee_attrition_.csv")
df.head()

In [None]:
# Check the size of the data: `shape` is a (number of rows, number of columns) tuple

df.shape

## Inspect features

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

## 1.) Drop all rows with missing values

In [None]:
# Keep only the rows that have a value in every column
df = df.dropna(axis=0, how="any")

In [None]:
# Check the size of the data again: `shape` is a (number of rows, number of columns) tuple

df.shape

---

# Feature Selection

## 2.) Drop the features "EmployeeNumber" and "JobRole".

In [None]:
# Remove the listed features (columns) from the DataFrame
df = df.drop(columns=["EmployeeNumber", "JobRole"])

df.head()

---

# Rename Feature

## 3.) Rename the feature "DistanceFromHome" to "CommuteDistance"

In [None]:
# Rename a single column; the mapping is {old name: new name}
df = df.rename({"DistanceFromHome": "CommuteDistance"}, axis="columns")

df.head()

---

# Feature Transformation

## 4.) Transform "Gender", "Attrition" and "Department" to numerical values.

In [None]:
# List the distinct values of "Gender" to decide how to encode them
df["Gender"].unique()

In [None]:
# Binary transformation: encode 'Male' as 0 and 'Female' as 1.
# `map` is used instead of `replace` because replacing strings with numbers
# relies on implicit dtype downcasting, which is deprecated in recent pandas
# releases (FutureWarning) and would otherwise leave an object-dtype column.
# NOTE: any value that is not a key of the mapping becomes NaN.
df["Gender"] = df["Gender"].map({'Male':0, 'Female':1})

In [None]:
# List the distinct values of "Attrition" to decide how to encode them
df["Attrition"].unique()

In [None]:
# Binary transformation: encode 'No' as 0 and 'Yes' as 1.
# `map` is used instead of `replace` because replacing strings with numbers
# relies on implicit dtype downcasting, which is deprecated in recent pandas
# releases (FutureWarning) and would otherwise leave an object-dtype column.
# NOTE: any value that is not a key of the mapping becomes NaN.
df["Attrition"] = df["Attrition"].map({'No':0, 'Yes':1})

In [None]:
# List the distinct values of "Department" to decide how to encode them
df["Department"].unique()

In [None]:
# One-Hot transformation: replace each listed feature with integer
# indicator columns, one per distinct value
one_hot_features = ["Department"]
df = pd.get_dummies(data=df, columns=one_hot_features, dtype="int")

df.head()

---

# Feature Engineering

## 5.) Create a new feature called "Inconvenience" derived from "CommuteDistance" divided by "JobSatisfaction".
#### The larger the "CommuteDistance" and the smaller the "JobSatisfaction", the larger the job's "Inconvenience".

In [None]:
# Inconvenience = commute distance divided by job satisfaction (element-wise)
commute = df["CommuteDistance"]
satisfaction = df["JobSatisfaction"]
df["Inconvenience"] = commute.div(satisfaction)

df["Inconvenience"].head(10)

---

# Reorder Features

## 6.) Move the "Attrition" target variable so that it is the last feature (column).

In [None]:
# Show the column labels in their current order to locate "Attrition"
df.columns

In [None]:
# Move "Attrition" to the end by position. The positions are derived from the
# current columns instead of being hard-coded, so no column is silently
# dropped (and no IndexError is raised) if the number or order of columns
# differs, and re-running the cell leaves the order unchanged.
target_position = df.columns.get_loc("Attrition")
other_positions = [i for i in range(df.shape[1]) if i != target_position]
df = df.iloc[:, other_positions + [target_position]]

df.head()

### Alternative solution
#### List comprehension

In [None]:
# All features except the target keep their relative order; the target goes last
other_features = [name for name in df.columns if name != "Attrition"]
feature_order = other_features + ["Attrition"]

df = df[feature_order]

df.head()