# Lab 7: Feature Engineering Assignment

## SRINATH KRISHNAN
## SMK220008


## Setup (Run this cell first)

This cell imports necessary libraries and creates sample datasets for use throughout the assignment.

In [1]:
pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import BinaryEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PowerTransformer

# --- Dataset 1: categorical features ---
# Every value below comes from NumPy's global RNG, so the seed and the order of
# the np.random calls must stay fixed for the data to be reproducible.
np.random.seed(42)
n_samples = 20000
categorical_draws = {
    'Color': np.random.choice(['Red', 'Green', 'Blue'], n_samples),
    'Size': np.random.choice(['Small', 'Medium', 'Large'], n_samples),
    'Shape': np.random.choice(['Circle', 'Square', 'Triangle'], n_samples),
    'Target': np.random.choice([0, 1], n_samples),  # binary label for the target-encoding example
    'Ordinal_Feature': np.random.choice(['Low', 'Medium', 'High'], n_samples, p=[0.2, 0.5, 0.3]),
}
# Rearrange columns while building the frame: 'Target' goes last.
data_cat = pd.DataFrame(
    categorical_draws,
    columns=['Color', 'Size', 'Shape', 'Ordinal_Feature', 'Target'],
)

# --- Dataset 2: numerical features ---
data_num = pd.DataFrame({
    'Feature1': np.random.normal(50, 10, n_samples),
    'Feature2': np.random.uniform(0, 1, n_samples),
    'Feature3': np.random.exponential(5, n_samples),  # skewed data
    'Feature4': np.random.randint(1000, 2000, n_samples),
})

# --- Dataset 3: time series data ---
# Twenty daily timestamps; the frame ends up indexed by 'Date'.
date_rng = pd.date_range('2023-01-01', '2023-01-20', freq='D')
data_time_series = pd.DataFrame({
    'Date': date_rng,
    'Value': np.random.randint(10, 50, size=len(date_rng)),
    'Category': np.random.choice(['A', 'B'], len(date_rng)),
}).set_index('Date')

## 1. Categorical Variable Encoding - 10 points

**Requirement:**
 1. **One-Hot Encoding:** Apply one-hot encoding to the 'Color' and 'Shape' features in `data_cat`.  Create new columns with appropriate prefixes (e.g., 'Color_Red', 'Shape_Square').
 2. **Ordinal Encoding:**  Encode the 'Ordinal_Feature' column in `data_cat` using an ordinal encoding scheme, assuming the order 'Low' < 'Medium' < 'High'. Map these to numerical values 0, 1, and 2, respectively.  Overwrite the original 'Ordinal_Feature' column with the encoded values.
 3. **Binary Encoding:** Encode the 'Size' feature in `data_cat` using binary encoding.
 4. **Target Encoding:**  Encode the 'Color' feature using target encoding, based on the 'Target' variable in `data_cat`. Use a smoothing parameter of 1.0.  Create a new column named 'Color_TargetEncoded'.
 5. **Create** a Python list variable that contains all columns from the original dataframe.
 6. **Create** a Python list variable that contains all columns generated using the above techniques.

In [3]:
# Encode the categorical columns of `data_cat` with four techniques and record
# which columns are original and which were generated by this cell.

# 1. One-Hot Encoding: one indicator column per category of 'Color' and 'Shape'.
ohe = OneHotEncoder(sparse_output=False, drop=None)
encoded_features = ohe.fit_transform(data_cat[['Color', 'Shape']])
feature_names = ohe.get_feature_names_out(['Color', 'Shape'])
# Reuse data_cat's index: concat(axis=1) aligns on index labels, so a frame
# built with a fresh RangeIndex would misalign if data_cat were ever
# filtered or re-indexed before this cell.
one_hot_df = pd.DataFrame(encoded_features, columns=feature_names, index=data_cat.index)
data_cat = pd.concat([data_cat, one_hot_df], axis=1)

# 2. Ordinal Encoding: Low < Medium < High mapped to 0, 1, 2 (overwrites the column).
ordinal_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
data_cat['Ordinal_Feature'] = data_cat['Ordinal_Feature'].map(ordinal_mapping)

# 3. Binary Encoding of 'Size'; the encoder names its own output columns.
binary_encoder = BinaryEncoder(cols=['Size'])
binary_encoded = binary_encoder.fit_transform(data_cat[['Size']])
data_cat = pd.concat([data_cat, binary_encoded], axis=1)

# 4. Target Encoding of 'Color' against 'Target' with smoothing 1.0.
# NOTE(review): the encoder is fitted on the same rows it transforms, so the
# encoded values have seen their own targets — fine for this exercise, but
# confirm before reusing in a modelling pipeline.
target_encoder = TargetEncoder(cols=['Color'], smoothing=1.0)
color_target_encoded = target_encoder.fit_transform(data_cat[['Color']], data_cat['Target'])
# Pick the single encoded column explicitly instead of assigning a whole frame.
data_cat['Color_TargetEncoded'] = color_target_encoded['Color']

# 5. List of original columns
original_columns = ['Color', 'Size', 'Shape', 'Ordinal_Feature', 'Target']

# 6. List of all generated columns, taken from the encoder outputs so the list
# stays correct if the number of categories (and thus binary columns) changes.
generated_columns = list(feature_names) + list(binary_encoded.columns) + ['Color_TargetEncoded']

In [4]:
# Preview the first rows of data_cat to check the encoded columns appended above.
data_cat.head()

Unnamed: 0,Color,Size,Shape,Ordinal_Feature,Target,Color_Blue,Color_Green,Color_Red,Shape_Circle,Shape_Square,Shape_Triangle,Size_0,Size_1,Color_TargetEncoded
0,Blue,Medium,Square,2,1,1.0,0.0,0.0,0.0,1.0,0.0,0,1,0.506216
1,Red,Medium,Square,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0,1,0.503421
2,Blue,Large,Circle,1,1,1.0,0.0,0.0,1.0,0.0,0.0,1,0,0.506216
3,Blue,Medium,Square,2,0,1.0,0.0,0.0,0.0,1.0,0.0,0,1,0.506216
4,Red,Large,Triangle,1,0,0.0,0.0,1.0,0.0,0.0,1.0,1,0,0.503421


In [5]:
# Display both column lists as a tuple: original columns first, generated columns second.
original_columns, generated_columns

(['Color', 'Size', 'Shape', 'Ordinal_Feature', 'Target'],
 ['Color_Blue',
  'Color_Green',
  'Color_Red',
  'Shape_Circle',
  'Shape_Square',
  'Shape_Triangle',
  'Size_0',
  'Size_1',
  'Color_TargetEncoded'])

## 2. Scaling and Centering - 10 points

**Requirement:**

Create train and test dataframes from `data_num`. Name the new dataframes `data_num_train` and `data_num_test`. The split should be 80/20. Fit the transformers on the train data only, and modify both dataframes according to the scaling criteria below.
1. **Standard Scaling:** Apply standard scaling to 'Feature1' in `data_num`. Create a new column called 'Feature1_StandardScaled'.
2. **Min-Max Scaling:** Apply Min-Max scaling to 'Feature2' in `data_num`. Create a new column called 'Feature2_MinMaxScaled'.
3. **Robust Scaling:** Apply robust scaling to 'Feature3' in `data_num`. Create a new column called 'Feature3_RobustScaled'.

In [6]:
# Split the data into training and testing sets (80/20 split)
data_num_train, data_num_test = train_test_split(data_num, test_size=0.2, random_state=42)

# 1. Standard Scaling for 'Feature1'
scaler_standard = StandardScaler()
# Fit on training data only
scaler_standard.fit(data_num_train[['Feature1']])
# Transform both training and testing data
data_num_train['Feature1_StandardScaled'] = scaler_standard.transform(data_num_train[['Feature1']])
data_num_test['Feature1_StandardScaled'] = scaler_standard.transform(data_num_test[['Feature1']])

# 2. Min-Max Scaling for 'Feature2'
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(data_num_train[['Feature2']])
data_num_train['Feature2_MinMaxScaled'] = scaler_minmax.transform(data_num_train[['Feature2']])
data_num_test['Feature2_MinMaxScaled'] = scaler_minmax.transform(data_num_test[['Feature2']])

# 3. Robust Scaling for 'Feature3'
scaler_robust = RobustScaler()
scaler_robust.fit(data_num_train[['Feature3']])
data_num_train['Feature3_RobustScaled'] = scaler_robust.transform(data_num_train[['Feature3']])
data_num_test['Feature3_RobustScaled'] = scaler_robust.transform(data_num_test[['Feature3']])

In [7]:
# Preview the training partition, including the three scaled columns.
data_num_train.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature1_StandardScaled,Feature2_MinMaxScaled,Feature3_RobustScaled
5894,48.896091,0.988099,1.637564,1461,-0.09948,0.988197,-0.334571
3728,26.362504,0.394938,2.868517,1377,-2.349972,0.39484,-0.107006
8958,55.286933,0.719138,1.128584,1259,0.538791,0.719147,-0.428666
7671,65.632615,0.285527,7.494816,1631,1.572042,0.285393,0.748254
5999,44.946805,0.365599,1.330639,1928,-0.493906,0.365491,-0.391312


In [8]:
# Preview the test partition, scaled with the scalers fitted on the training data.
data_num_test.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature1_StandardScaled,Feature2_MinMaxScaled,Feature3_RobustScaled
10650,47.331632,0.214735,6.068874,1303,-0.255727,0.214577,0.484641
2041,63.170566,0.660861,8.109707,1774,1.326151,0.660851,0.861928
8668,62.228101,0.18872,2.257348,1654,1.232024,0.188554,-0.219992
1114,68.680775,0.62533,6.035895,1489,1.876471,0.625308,0.478545
13902,48.471684,0.895132,0.367055,1272,-0.141867,0.8952,-0.569449


## 3. Numerical to Categorical Transformation - 10 points

**Requirement:**
1. **Equal-Width Discretization:**  Transform 'Feature4' in `data_num` into 4 equal-width bins.  Create a new column called 'Feature4_EqualWidth'.  Use the `.codes` attribute to get the bin numbers (0-3).
2. **Equal-Frequency Discretization:** Transform 'Feature1' in `data_num` into 3 equal-frequency bins. Create a new column called 'Feature1_EqualFreq'. Use the `.codes` attribute to get the bin numbers.

In [9]:
# 1. Equal-Width Discretization for 'Feature4'
discretizer_equal_width = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
data_num['Feature4_EqualWidth'] = discretizer_equal_width.fit_transform(data_num[['Feature4']]).astype(int)

# 2. Equal-Frequency Discretization for 'Feature1'
discretizer_equal_freq = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
data_num['Feature1_EqualFreq'] = discretizer_equal_freq.fit_transform(data_num[['Feature1']]).astype(int)

In [10]:
# Preview data_num with the two discretized columns added above.
data_num.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature4_EqualWidth,Feature1_EqualFreq
0,57.225872,0.114425,5.937229,1735,2,2
1,36.779208,0.670091,1.754833,1674,2,0
2,65.409806,0.543863,5.967664,1802,3,2
3,47.538073,0.34325,11.690331,1524,2,1
4,58.077475,0.159063,10.219802,1234,0,2


## 4. Log Transformation or Scale Change - 10 points

**Requirement:**
1. **Log Transformation:** Apply a log transformation (using `np.log1p`) to 'Feature3' in `data_num` to reduce its skewness. Create a new column called 'Feature3_LogTransformed'.
2. **Power Transformer:** Apply a PowerTransformer (Yeo-Johnson method) to 'Feature1' in data_num.


In [11]:
# 1. log(1 + x) of the skewed 'Feature3'.
feature3_values = data_num['Feature3']
data_num['Feature3_LogTransformed'] = np.log1p(feature3_values)

# 2. Yeo-Johnson power transform of 'Feature1', fitted on the whole column.
pt = PowerTransformer(method="yeo-johnson")
feature1_power = pt.fit_transform(data_num[['Feature1']])
data_num['Feature1_PowerTransformed'] = feature1_power

In [12]:
# Preview data_num with the log- and power-transformed columns added above.
data_num.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature4_EqualWidth,Feature1_EqualFreq,Feature3_LogTransformed,Feature1_PowerTransformed
0,57.225872,0.114425,5.937229,1735,2,2,1.936902,0.731866
1,36.779208,0.670091,1.754833,1674,2,0,1.013357,-1.319709
2,65.409806,0.543863,5.967664,1802,3,2,1.94128,1.539779
3,47.538073,0.34325,11.690331,1524,2,1,2.54084,-0.23356
4,58.077475,0.159063,10.219802,1234,0,2,2.41768,0.816241


## 5. Lag Features with Window Functions - 10 points

**Requirement:**
1. **Lag Feature:** Create a lag feature for 'Value' in `data_time_series` with a lag of 1 day.  Name the new column 'Value_Lag1'.
2. **Rolling Window Mean:** Create a rolling window mean of 'Value' with a window size of 3 days.  Name the new column 'Value_RollingMean3'.
3. **Exponential moving average:** Create an exponential moving average over the previous 30 observations of the 'Value' column. Name the new column 'Value_ExpMovingAvg30'.
4. **Lag Feature with Grouping:** Create a lag feature for 'Value' with a lag of 1 within each group of 'Category' in `data_time_series`.

In [13]:
# 1. Lag Feature
data_time_series['Value_Lag1'] = data_time_series['Value'].shift(1)

# 2. Rolling Window Mean
data_time_series['Value_RollingMean3'] = data_time_series['Value'].rolling(window=3, center=False).mean()

# 3. Exponential Moving Average
data_time_series['Value_ExpMovingAvg30'] = data_time_series['Value'].ewm(span=30, adjust=False).mean()

# 4. Lag Feature with Grouping
data_time_series['Value_Lag1_Grouped'] = data_time_series.groupby('Category')['Value'].shift(1)

In [14]:
# Preview the time series with the lag, rolling-mean and EMA columns added above.
data_time_series.head()

Unnamed: 0_level_0,Value,Category,Value_Lag1,Value_RollingMean3,Value_ExpMovingAvg30,Value_Lag1_Grouped
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01,27,A,,,27.0,
2023-01-02,47,A,27.0,,28.290323,27.0
2023-01-03,19,A,47.0,31.0,27.690947,47.0
2023-01-04,45,B,19.0,37.0,28.80766,
2023-01-05,18,A,45.0,27.333333,28.110392,19.0


## 6. Creating New Features - 10 points


**Requirement:**
1. **Numerical Feature Interaction:**  Create a new feature called 'Feature1_x_Feature2' in `data_num` that is the product of 'Feature1' and 'Feature2'.
2. **Combining Categorical Features:** Create a new feature named `Combined_Cat` in the `data_cat` DataFrame by concatenating the `Color` and `Shape` columns with the separator `_`. For example, if `Color` is `Red` and `Shape` is `Circle`, then `Combined_Cat` should be `Red_Circle`.
3. **Create target encoding:** Create a target encoding for the new `Combined_Cat` feature.

In [15]:
# 1. Numerical Feature Interaction
data_num['Feature1_x_Feature2'] = data_num['Feature1'] * data_num['Feature2']

# 2. Combining Categorical Features
data_cat['Combined_Cat'] = data_cat['Color'] + '_' + data_cat['Shape']

# 3. Target Encoding for 'Combined_Cat'
target_encoder = TargetEncoder(cols=['Combined_Cat'], smoothing=1.0)  # Adjust smoothing if needed
data_cat['Combined_Cat_TargetEncoded'] = target_encoder.fit_transform(data_cat[['Combined_Cat']], data_cat['Target'])

In [16]:
# Preview data_cat with the combined category and its target encoding added above.
data_cat.head()

Unnamed: 0,Color,Size,Shape,Ordinal_Feature,Target,Color_Blue,Color_Green,Color_Red,Shape_Circle,Shape_Square,Shape_Triangle,Size_0,Size_1,Color_TargetEncoded,Combined_Cat,Combined_Cat_TargetEncoded
0,Blue,Medium,Square,2,1,1.0,0.0,0.0,0.0,1.0,0.0,0,1,0.506216,Blue_Square,0.507556
1,Red,Medium,Square,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0,1,0.503421,Red_Square,0.512576
2,Blue,Large,Circle,1,1,1.0,0.0,0.0,1.0,0.0,0.0,1,0,0.506216,Blue_Circle,0.495775
3,Blue,Medium,Square,2,0,1.0,0.0,0.0,0.0,1.0,0.0,0,1,0.506216,Blue_Square,0.507556
4,Red,Large,Triangle,1,0,0.0,0.0,1.0,0.0,0.0,1.0,1,0,0.503421,Red_Triangle,0.505936
