# Chapter 7 - Cleaning Messy Data

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#import warnings
#warnings.filterwarnings("ignore")

# Exploring Data


In [4]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

# Display the first 5 rows of the DataFrame
data.head()

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711


In [5]:
# See last 5 records
data.tail()

Unnamed: 0,name,age,income,gender,department,grade,performance_score
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [6]:
# Print list of columns in the data
print(data.columns)

Index(['name', 'age', 'income', 'gender', 'department', 'grade',
       'performance_score'],
      dtype='object')


In [7]:
# Print the shape of a DataFrame
print(data.shape)

(9, 7)


In [8]:
# Check the information of DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               9 non-null      object 
 1   age                7 non-null      float64
 2   income             7 non-null      float64
 3   gender             7 non-null      object 
 4   department         9 non-null      object 
 5   grade              9 non-null      object 
 6   performance_score  9 non-null      int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 636.0+ bytes


In [9]:
# Check the descriptive statistics
data.describe()

Unnamed: 0,age,income,performance_score
count,7.0,7.0,9.0
mean,40.428571,52857.142857,610.666667
std,12.204605,26028.372797,235.671912
min,23.0,16000.0,53.0
25%,31.0,38500.0,556.0
50%,45.0,52000.0,674.0
75%,49.5,63500.0,711.0
max,54.0,98000.0,901.0


# Filtering Data to Weed Out the Noise

In [11]:
# Filter columns
data.filter(['name', 'department'])

Unnamed: 0,name,department
0,Allen Smith,Operations
1,S Kumar,Finance
2,Jack Morgan,Finance
3,Ying Chin,Sales
4,Dheeraj Patel,Operations
5,Satyam Sharma,Sales
6,James Authur,Operations
7,Josh Wills,Finance
8,Leo Duck,Sales


In [12]:
weed = data.filter(['name', 'department'])

weed

Unnamed: 0,name,department
0,Allen Smith,Operations
1,S Kumar,Finance
2,Jack Morgan,Finance
3,Ying Chin,Sales
4,Dheeraj Patel,Operations
5,Satyam Sharma,Sales
6,James Authur,Operations
7,Josh Wills,Finance
8,Leo Duck,Sales


In [13]:
# Filter column "name" (list)
data['name']

0      Allen Smith
1          S Kumar
2      Jack Morgan
3        Ying Chin
4    Dheeraj Patel
5    Satyam Sharma
6     James Authur
7       Josh Wills
8         Leo Duck
Name: name, dtype: object

In [14]:
# Filter column "name" (Pandas Series)
data[['name']]

Unnamed: 0,name
0,Allen Smith
1,S Kumar
2,Jack Morgan
3,Ying Chin
4,Dheeraj Patel
5,Satyam Sharma
6,James Authur
7,Josh Wills
8,Leo Duck


In [15]:
# Filter two columns: name and department
data[['name','department']]

Unnamed: 0,name,department
0,Allen Smith,Operations
1,S Kumar,Finance
2,Jack Morgan,Finance
3,Ying Chin,Sales
4,Dheeraj Patel,Operations
5,Satyam Sharma,Sales
6,James Authur,Operations
7,Josh Wills,Finance
8,Leo Duck,Sales


# Row-wise filtration

In [17]:
data.head()

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711


In [18]:
# Select rows for the specific index
data.filter([0,1,2],axis=0)

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674


In [19]:
# Filter data using slicing
data[2:5]

Unnamed: 0,name,age,income,gender,department,grade,performance_score
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711


In [20]:
data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [21]:
# Filter data for specific value
data[data.department=='Sales']

Unnamed: 0,name,age,income,gender,department,grade,performance_score
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
5,Satyam Sharma,,62000.0,,Sales,G3,649
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [22]:
# Select data for multiple values
data[data.department.isin(['Sales','Finance'])]

Unnamed: 0,name,age,income,gender,department,grade,performance_score
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
5,Satyam Sharma,,62000.0,,Sales,G3,649
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [23]:
data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [24]:
# Filter employee who has more than 700 performance score
data[(data.performance_score >=700)]

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [25]:
# Filter employee who has more than 500 and less than 700 performance score
data[(data.performance_score >=500) & (data.performance_score < 700)]

Unnamed: 0,name,age,income,gender,department,grade,performance_score
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
5,Satyam Sharma,,62000.0,,Sales,G3,649


In [26]:
# Filter employee who has performance score of less than 500
data.query('performance_score<500')

Unnamed: 0,name,age,income,gender,department,grade,performance_score
6,James Authur,54.0,,F,Operations,G3,53


# Handling Missing Values

In [28]:
data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [29]:
# Drop missing value rows using dropna() functio
data0 = data.dropna()

data0

Unnamed: 0,name,age,income,gender,department,grade,performance_score
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [30]:
data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [31]:
data3 = data

# Fill all the missing values in the age column with mean of the age column
data3['age'] = data3.age.fillna(data3.age.mean())

data3

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,40.428571,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,40.428571,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [32]:
# Fill all the missing values in the income column with a median of the income column
data3['income'] = data3.income.fillna(data3.income.median())

data3

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,52000.0,,Operations,G3,723
1,S Kumar,40.428571,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,40.428571,62000.0,,Sales,G3,649
6,James Authur,54.0,52000.0,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [33]:
# Fill all the missing values in the gender column(category column) with the mode of the gender column
data3['gender'] = data3['gender'].fillna(data3['gender'].mode()[0])

data3

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,52000.0,F,Operations,G3,723
1,S Kumar,40.428571,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,40.428571,62000.0,F,Sales,G3,649
6,James Authur,54.0,52000.0,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


# Handling Outliers

In [35]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [36]:
# Dropping the outliers using Standard Deviation

upper_limit= data['performance_score'].mean () + 3 * data['performance_score'].std ()
lower_limit = data['performance_score'].mean () - 3 * data['performance_score'].std ()

datan = data[(data['performance_score'] < upper_limit) & (data['performance_score'] > lower_limit)]

datan

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [37]:
# Drop the outlier observations using Percentiles

upper_limit = data['performance_score'].quantile(.99)
lower_limit = data['performance_score'].quantile(.01)

datap = data[(data['performance_score'] < upper_limit) & (data['performance_score'] > lower_limit)]

datap

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


# Feature Encoding Techniques

### One-hot encoding

In [40]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [41]:
# Dummy encoding
encoded_data = pd.get_dummies(data['gender']).astype(int)

# Join the encoded _data with original dataframe
data = data.join(encoded_data)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score,F,M
0,Allen Smith,45.0,,,Operations,G3,723,0,0
1,S Kumar,,16000.0,F,Finance,G0,520,1,0
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,0,1
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,1,0
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,1,0
5,Satyam Sharma,,62000.0,,Sales,G3,649,0,0
6,James Authur,54.0,,F,Operations,G3,53,1,0
7,Josh Wills,54.0,52000.0,F,Finance,G3,901,1,0
8,Leo Duck,23.0,98000.0,M,Sales,G4,709,0,1


In [42]:
# Another Way:

# Import one hot encoder
from sklearn.preprocessing import OneHotEncoder

# Initialize the one-hot encoder object
onehotencoder = OneHotEncoder()

# Fill all the missing values in income column(category column) with mode of age column
data['gender']=data['gender'].fillna(data['gender'].mode()[0])

# Fit and transforms the gender column
onehotencoder.fit_transform(data[['gender']]).toarray()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]])

### Label Encoding

Label encoding is also known as integer encoding. Integer encoding replaces categorical
values with numeric values. Here, the unique values in variables are replaced with a
sequence of integer values.

In [45]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [46]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Instantiate the Label Encoder Object
label_encoder = LabelEncoder()

# Fit and transform the column
encoded_data = label_encoder.fit_transform(data['department'])

# Print the encoded
print(encoded_data)

[1 0 0 2 1 2 1 0 2]


In [47]:
# Perform inverse encoding
inverse_encode=label_encoder.inverse_transform([0, 0, 1, 2])

# Print inverse encode
print(inverse_encode)

['Finance' 'Finance' 'Operations' 'Sales']


### Ordinal Encoder

In [49]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [50]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize OrdinalEncoder with order
order_encoder=OrdinalEncoder(categories=['G0','G1','G2','G3','G4'])

# fit and transform the grade
data['grade_encoded'] = label_encoder.fit_transform(data['grade'])

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score,grade_encoded
0,Allen Smith,45.0,,,Operations,G3,723,2
1,S Kumar,,16000.0,F,Finance,G0,520,0
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,1
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,2
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,1
5,Satyam Sharma,,62000.0,,Sales,G3,649,2
6,James Authur,54.0,,F,Operations,G3,53,2
7,Josh Wills,54.0,52000.0,F,Finance,G3,901,2
8,Leo Duck,23.0,98000.0,M,Sales,G4,709,3


# Feature Scaling

### Standard Scaling or Z-Score Normalization

$$
z = \frac{x - \mu}{\sigma}
$$

Where:
- \( z \) is the standardized value (z-score),
- \( x \) is the original data point,
- \( \mu \) is the mean of the data,
- \( \sigma \) is the standard deviation of the data.


In [54]:
# Import StandardScaler(or z-score normalization)
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# To scale data
scaler.fit(data['performance_score'].values.reshape(-1,1))
data['performance_std_scaler']=scaler.transform(data['performance_score'].values.reshape(-1,1))

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score,grade_encoded,performance_std_scaler
0,Allen Smith,45.0,,,Operations,G3,723,2,0.505565
1,S Kumar,,16000.0,F,Finance,G0,520,0,-0.408053
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,1,0.285037
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,2,-0.246032
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,1,0.451558
5,Satyam Sharma,,62000.0,,Sales,G3,649,2,0.172522
6,James Authur,54.0,,F,Operations,G3,53,2,-2.509823
7,Josh Wills,54.0,52000.0,F,Finance,G3,901,2,1.306668
8,Leo Duck,23.0,98000.0,M,Sales,G4,709,3,0.442557


### Min-Max Scaling

$$
x' = \frac{x - \text{min}(x)}{\text{max}(x) - \text{min}(x)}
$$

Where:
- \( x' \) is the normalized value,
- \( x \) is the original data point,
- \( \text{min}(x) \) is the minimum value in the dataset,
- \( \text{max}(x) \) is the maximum value in the dataset.

In [57]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialise the MinMaxScaler
scaler = MinMaxScaler()

# To scale data
scaler.fit(data['performance_score'].values.reshape(-1,1))

data['performance_minmax_scaler'] = scaler.transform(data['performance_score'].values.reshape(-1,1))

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score,grade_encoded,performance_std_scaler,performance_minmax_scaler
0,Allen Smith,45.0,,,Operations,G3,723,2,0.505565,0.790094
1,S Kumar,,16000.0,F,Finance,G0,520,0,-0.408053,0.550708
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,1,0.285037,0.732311
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,2,-0.246032,0.59316
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,1,0.451558,0.775943
5,Satyam Sharma,,62000.0,,Sales,G3,649,2,0.172522,0.70283
6,James Authur,54.0,,F,Operations,G3,53,2,-2.509823,0.0
7,Josh Wills,54.0,52000.0,F,Finance,G3,901,2,1.306668,1.0
8,Leo Duck,23.0,98000.0,M,Sales,G4,709,3,0.442557,0.773585


### Robust Scaling

$$
x' = \frac{x - \text{median}(x)}{\text{IQR}(x)}
$$

Where:
- \( x' \) is the scaled value,
- \( x \) is the original data point,
- \( \text{median}(x) \) is the median of the dataset,
- \( \text{IQR}(x) \) is the interquartile range (IQR), calculated as \( Q_3 - Q_1 \) (the difference between the 75th and 25th percentiles).

In [60]:
# Import RobustScaler
from sklearn.preprocessing import RobustScaler

# Initialise the RobustScaler
scaler = RobustScaler()

# To scale data
scaler.fit(data['performance_score'].values.reshape(-1,1))

data['performance_robust_scaler']=scaler.transform(data['performance_score'].values.reshape(-1,1))

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score,grade_encoded,performance_std_scaler,performance_minmax_scaler,performance_robust_scaler
0,Allen Smith,45.0,,,Operations,G3,723,2,0.505565,0.790094,0.316129
1,S Kumar,,16000.0,F,Finance,G0,520,0,-0.408053,0.550708,-0.993548
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,1,0.285037,0.732311,0.0
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,2,-0.246032,0.59316,-0.76129
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,1,0.451558,0.775943,0.23871
5,Satyam Sharma,,62000.0,,Sales,G3,649,2,0.172522,0.70283,-0.16129
6,James Authur,54.0,,F,Operations,G3,53,2,-2.509823,0.0,-4.006452
7,Josh Wills,54.0,52000.0,F,Finance,G3,901,2,1.306668,1.0,1.464516
8,Leo Duck,23.0,98000.0,M,Sales,G4,709,3,0.442557,0.773585,0.225806


# Feature Transformation

Feature transformation alters features so that they're in the required form. It also reduces
the effect of outliers, handles skewed data, and makes the model more robust. The
following list shows the different kinds of feature transformation:

- **Log transformation** is the most common mathematical transformation used to
transform skewed data into a normal distribution. Before applying the log
transform, ensure that all the data values ​only contain positive values; otherwise,
this will throw an exception or error message.
    
- **Square and cube transformation** has a moderate effect on distribution shape. It
can be used to reduce left skewness.
    
- **Square and cube root** transformation has a fairly strong transformation effect on
the distribution shape but it is weaker than logarithms. It can be applied to rightskewed data.
    
- **Discretization** can also be used to transform a numeric column or attribute. For
example, the age of a group of candidates can be grouped into intervals such as
0-10, 11-20, and so on. We can also use discretization to assign conceptual labels
instead of intervals such as youth, adult, and senior

If the feature is right-skewed or positively skewed or grouped at lower values, then we can
apply the square root, cube root, and logarithmic transformations, while if the feature is
left-skewed or negative skewed or grouped at higher values, then we can apply the cube,
square, and so on.

In [65]:
# Real link: https://github.com/PacktPublishing/Python-Data-Analysis-Third-Edition/blob/master/Chapter06/employee.csv

# Use the raw link to access the actual CSV file
url = "https://raw.githubusercontent.com/PacktPublishing/Python-Data-Analysis-Third-Edition/master/Chapter06/employee.csv"

# Reading the CSV file
data = pd.read_csv(url)

data

Unnamed: 0,name,age,income,gender,department,grade,performance_score
0,Allen Smith,45.0,,,Operations,G3,723
1,S Kumar,,16000.0,F,Finance,G0,520
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674
3,Ying Chin,45.0,65000.0,F,Sales,G3,556
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711
5,Satyam Sharma,,62000.0,,Sales,G3,649
6,James Authur,54.0,,F,Operations,G3,53
7,Josh Wills,54.0,52000.0,F,Finance,G3,901
8,Leo Duck,23.0,98000.0,M,Sales,G4,709


In [66]:
# Create performance grade function
def performance_grade(score):
    if score>=700:
        return 'A'
    elif score<700 and score >= 500:
        return 'B'
    else:
        return 'C'

# Apply performance grade function on whole DataFrame using apply() function.
data['performance_grade'] = data.performance_score.apply(performance_grade)

# See initial 5 records
data.head()

Unnamed: 0,name,age,income,gender,department,grade,performance_score,performance_grade
0,Allen Smith,45.0,,,Operations,G3,723,A
1,S Kumar,,16000.0,F,Finance,G0,520,B
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,B
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,B
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,A


# Feature Splitting

In [68]:
# Split the name column in first and last name
data['first_name']=data.name.str.split(" ").map(lambda var: var[0])
data['last_name']=data.name.str.split(" ").map(lambda var: var[1])

# Check top-5 records
data.head()

Unnamed: 0,name,age,income,gender,department,grade,performance_score,performance_grade,first_name,last_name
0,Allen Smith,45.0,,,Operations,G3,723,A,Allen,Smith
1,S Kumar,,16000.0,F,Finance,G0,520,B,S,Kumar
2,Jack Morgan,32.0,35000.0,M,Finance,G2,674,B,Jack,Morgan
3,Ying Chin,45.0,65000.0,F,Sales,G3,556,B,Ying,Chin
4,Dheeraj Patel,30.0,42000.0,F,Operations,G2,711,A,Dheeraj,Patel
