## Feature Engineering

by Rina Buoy, PhD

### 1. Generating dummy data

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Toy customer table, written one record per row for readability.
# Field order: age, salary, gender, city, bought_product (the target variable).
column_names = ['age', 'salary', 'gender', 'city', 'bought_product']
rows = [
    (25, 50000, 'Male', 'Osaka', 1),
    (32, 60000, 'Female', 'Tokyo', 0),
    (47, 120000, 'Female', 'Tokyo', 1),
    (51, 90000, 'Male', 'Osaka', 0),
    (62, 140000, 'Female', 'Kyoto', 1),
]

# Transpose the records into a column-name -> list-of-values mapping
data = {name: list(values) for name, values in zip(column_names, zip(*rows))}

# Build the DataFrame that the following cells transform
df = pd.DataFrame(data)


### 2. Feature Engineering

In [4]:


# Step 1: Derived feature
# salary_per_age is the element-wise ratio salary / age.
df['salary_per_age'] = df['salary'].div(df['age'])

# Step 2: Categorical encoding
# Binary-encode 'gender' through an explicit lookup (Male -> 1, Female -> 0).
# Any value that is not a key of the lookup is mapped to NaN.
gender_codes = {'Male': 1, 'Female': 0}
df['gender_encoded'] = df['gender'].map(gender_codes)

# One-hot encode 'city' with pandas. drop_first=True omits the indicator of the
# first category so the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['city'], drop_first=True)

# Step 3: Normalization (Min-Max Scaling)
def min_max_scaling(series):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding (x - min) / (max - min).
        When every value is identical (max == min) the result is all zeros
        instead of the NaNs that a 0 / 0 division would produce; missing
        values stay missing.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: avoid dividing by zero. (series - lowest) is already
        # zero everywhere, so only the dtype needs to match the normal path.
        return (series - lowest).astype(float)
    return (series - lowest) / value_range

# Rescale 'salary' and 'salary_per_age' to the [0, 1] range, overwriting each
# column with its scaled version. Each column is scaled independently.
for scaled_column in ('salary', 'salary_per_age'):
    df[scaled_column] = min_max_scaling(df[scaled_column])

# Step 4: Generating Polynomial Features
def add_polynomial_features(df, columns, degree=2):
    """Return a copy of ``df`` extended with integer powers of selected columns.

    For every name in ``columns`` and every power p in 2..degree, a column
    named ``'<col>_pow_<p>'`` holding ``df[col] ** p`` is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the new columns are added to a copy.
    columns : iterable of str
        Names of the columns to raise to powers.
    degree : int, default 2
        Highest power to generate. Values below 2 add no columns.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the power columns appended, ordered by column
        and then by power.
    """
    # Work on a copy so the caller's frame is not silently mutated.
    result = df.copy()
    for col in columns:
        for power in range(2, degree + 1):
            result[f'{col}_pow_{power}'] = result[col] ** power
    return result

# Append squared terms for 'age' and 'salary'. Note that 'salary' has already
# been min-max scaled at this point, so 'salary_pow_2' is the square of the
# scaled value, while 'age' is still on its original scale.
polynomial_columns = ['age', 'salary']
df = add_polynomial_features(df, columns=polynomial_columns, degree=2)




### 3. Final Features

In [5]:
# Show the fully transformed frame (engineered, scaled and encoded columns)
print("Transformed Dataset:")
print(df)

# Optional: separate the model inputs (X) from the target (y).
# Excluded from X: the target itself, the raw 'gender' strings (superseded by
# 'gender_encoded') and 'salary_per_age'.
# NOTE(review): 'salary_per_age' was engineered and scaled earlier but is left
# out of X here - confirm this is intended.
excluded_columns = ['bought_product', 'gender', 'salary_per_age']
X = df.drop(columns=excluded_columns)
y = df['bought_product']

# Print each part under its heading
for heading, part in (("\nFeatures (X):", X), ("\nTarget (y):", y)):
    print(heading)
    print(part)

Transformed Dataset:
   age    salary  gender  bought_product  salary_per_age  gender_encoded  \
0   25  0.000000    Male               1        0.298413               1   
1   32  0.111111  Female               0        0.139881               0   
2   47  0.777778  Female               1        1.000000               0   
3   51  0.444444    Male               0        0.000000               1   
4   62  1.000000  Female               1        0.625704               0   

   city_Osaka  city_Tokyo  age_pow_2  salary_pow_2  
0        True       False        625      0.000000  
1       False        True       1024      0.012346  
2       False        True       2209      0.604938  
3        True       False       2601      0.197531  
4       False       False       3844      1.000000  

Features (X):
   age    salary  gender_encoded  city_Osaka  city_Tokyo  age_pow_2  \
0   25  0.000000               1        True       False        625   
1   32  0.111111               0       False   