# Data Transformation

In [None]:
import pandas as pd

## HR Analytics: Predict whether an employee will get promoted

In [None]:
# Load the promotion records from the CSV file, then preview the first rows.
csv_path = "job_promotion_.csv"
df = pd.read_csv(csv_path)
df.head()

### Check the size of the dataset

In [None]:
# `shape` is an attribute (not a method), so it is read without parentheses.
# It holds a (rows, cols) tuple.

df.shape

---

## Inspect features

In [None]:
# Summarise every column: name, non-null count and dtype.
df.info()

## To drop all rows with missing values:

```python
df = df.dropna()
```

---

# Feature Selection

## Drop features (columns)

In [None]:
# Remove the columns that will not be used as features.
unused_columns = ["employee_id", "department", "region"]
df = df.drop(columns=unused_columns)

df.head()

---

# Rename Feature

In [None]:
# Strip the trailing "?" from the column name.
df = df.rename({"awards_won?": "awards_won"}, axis="columns")
df.head()

---

# Feature Transformation

## Transform Categorical Features

### Binary transformation

In [None]:
# List the distinct gender labels before encoding them.
df["gender"].unique()

In [None]:
# Encode the two gender labels as 0/1.
# `map` is used instead of `replace`: swapping strings for numbers with `replace`
# relies on pandas silently downcasting the object column to a numeric dtype, which
# is deprecated (FutureWarning since pandas 2.2).
# Note: any label that is not a key of the mapping becomes NaN.
gender_codes = {"m": 0, "f": 1}
df["gender"] = df["gender"].map(gender_codes)

df.head()

### Ordinal transformation

In [None]:
# List the distinct education levels before encoding them.
df["education"].unique()

In [None]:
# Encode the education levels as ranks (0 = lowest level, 2 = highest).
# `map` is used instead of `replace`: swapping strings for numbers with `replace`
# relies on pandas silently downcasting the object column to a numeric dtype, which
# is deprecated (FutureWarning since pandas 2.2).
# Note: missing values stay NaN, and any label that is not a key of the mapping
# also becomes NaN.
education_ranks = {"Below Secondary": 0, "Bachelor's": 1, "Master's & above": 2}
df["education"] = df["education"].map(education_ranks)

df.head()

### One-Hot Encode (Non-ordinal Features)

In [None]:
# List the distinct recruitment channels; each one becomes its own indicator column below.
df["recruitment_channel"].unique()

In [None]:
# Replace the nominal column with one 0/1 indicator column per category.
nominal_columns = ["recruitment_channel"]
df = pd.get_dummies(data=df, columns=nominal_columns, dtype=int)

df.head()

---

# Feature Engineering

### Create a new feature, "awards_per_yr", to communicate the density of accomplishment by an employee.

In [None]:
# Awards won per year of service: the award count divided by the employee's tenure.
# NOTE(review): a "length_of_service" of 0 would produce inf/NaN here; confirm the
# minimum tenure in the data is at least 1.
years_of_service = df["length_of_service"]
df["awards_per_yr"] = df["awards_won"].div(years_of_service)

df.head()

### View engineered feature

In [None]:
# Show the first 20 non-zero rates, i.e. only employees with at least one award.
awards_rate = df["awards_per_yr"]
awards_rate[awards_rate > 0].head(20)

---

## Create an "experience" feature by binning "length_of_service" (discretization).

In [None]:
# List the distinct tenure values to choose sensible bin edges.
df["length_of_service"].unique()

In [None]:
# Bin "length_of_service" to create an ordered "experience" category
# (transfer your knowledge to the machine).
# pd.cut builds right-closed intervals, so the edges [0, 5, 15, 40] give
# (0, 5], (5, 15], (15, 40]. include_lowest=True also closes the first interval on
# the left, so a tenure of 0 is labelled "limited" instead of becoming NaN.
# For whole-year tenures the bins are therefore 0-5, 6-15 and 16-40.
# Values above 40 fall outside every bin and are left as NaN.
service_edges = [0, 5, 15, 40]
experience_labels = ["limited", "good", "extensive"]
df["experience"] = pd.cut(
    df["length_of_service"],
    bins=service_edges,
    labels=experience_labels,
    include_lowest=True,
)
df["experience"].head(10)

### Convert the categorical feature "experience" to ordinal numbers

In [None]:
# Relabel the ordered categories with their rank (1 = "limited", 3 = "extensive").
# `rename_categories` is the supported way to change category labels; calling
# `replace` on a categorical column to do this is deprecated (pandas 2.2).
# The column keeps its categorical dtype; only the labels change.
experience_ranks = {"limited": 1, "good": 2, "extensive": 3}
df["experience"] = df["experience"].cat.rename_categories(experience_ranks)
df["experience"].head(10)

## Drop the "length_of_service" and "awards_won" features

In [None]:
# "experience" and "awards_per_yr" were derived from these two columns,
# so the originals can now be removed.
engineered_from = ["length_of_service", "awards_won"]
df = df.drop(columns=engineered_from)

df.head()

---

# Reorder Features
Move the "is_promoted" target variable to the last column, so that all features come first.

In [None]:
# Show the current column order to see where "is_promoted" sits.
df.columns

In [None]:
# Move "is_promoted" to the end using positional indexing.
# The target's position is looked up by name rather than hard-coded: a fixed list
# such as [0,1,2,4,5,6,7,8,3] silently drops columns or moves the wrong one as soon
# as the number or order of columns differs from what was assumed.
target_position = df.columns.get_loc("is_promoted")
column_positions = [pos for pos in range(df.shape[1]) if pos != target_position]
column_positions.append(target_position)
df = df.iloc[:, column_positions]

df.head()

### Alternative
#### List comprehension

In [None]:
# Build the new order by name: every column except the target, then the target last.
target_column = "is_promoted"
feature_order = [column for column in df.columns if column != target_column]
feature_order.append(target_column)

df = df[feature_order]

df.head()

---