# Label Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [2]:
# 1. Prepare Sample Data
data = {
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Extra Large', 'Small'],
    'Price': [10, 20, 30, 25, 40, 15]
}
df = pd.DataFrame(data)

In [3]:
print("Original Data:")
print(df)
print("-" * 30)

Original Data:
          Size  Price
0        Small     10
1       Medium     20
2        Large     30
3       Medium     25
4  Extra Large     40
5        Small     15
------------------------------


In [4]:
# 2. Initialize the LabelEncoder
le = LabelEncoder()

In [5]:
# 3. Fit and Transform the Categorical Column
# The 'fit' step learns the unique labels (Small, Medium, Large, etc.)
# The 'transform' step converts them to integers (0, 1, 2, etc.)
df['Size_Encoded'] = le.fit_transform(df['Size'])

In [6]:
# 4. Display the Result
print("Data after Label Encoding:")
print(df)
print("-" * 30)

Data after Label Encoding:
          Size  Price  Size_Encoded
0        Small     10             3
1       Medium     20             2
2        Large     30             1
3       Medium     25             2
4  Extra Large     40             0
5        Small     15             3
------------------------------


In [7]:
# 5. Accessing the learned classes (optional)
print("Learned Classes (Original Labels):")
print(le.classes_)

Learned Classes (Original Labels):
['Extra Large' 'Large' 'Medium' 'Small']


## Inverse Transform

In [8]:
# Use the fitted encoder 'le' from the previous step
encoded_values = df['Size_Encoded']

original_labels = le.inverse_transform(encoded_values)

print("Original Labels recovered from Encoded Values:")
print(original_labels)

Original Labels recovered from Encoded Values:
['Small' 'Medium' 'Large' 'Medium' 'Extra Large' 'Small']


# Ordinal Encoding

In [9]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# 1. Prepare Sample Data
data = {
    'Education_Level': ['High School', 'Master\'s', 'Bachelor\'s', 'PhD', 'High School'],
    'Salary': [40000, 100000, 60000, 150000, 35000]
}
df = pd.DataFrame(data)

print("Original Data:")
print(df)
print("-" * 50)

Original Data:
  Education_Level  Salary
0     High School   40000
1        Master's  100000
2      Bachelor's   60000
3             PhD  150000
4     High School   35000
--------------------------------------------------


In [10]:
# 2. Define the Intrinsic Order
# The order is defined from lowest rank to highest rank (0 to N-1).
# We must include ALL unique categories present in the data.
education_categories = ['High School', 'Bachelor\'s', 'Master\'s', 'PhD']


In [11]:
# 3. Initialize the OrdinalEncoder
# The 'categories' parameter takes a list of lists.
# Since we are only encoding one column, we pass [education_categories].
encoder = OrdinalEncoder(categories=[education_categories])


In [12]:
# 4. Fit and Transform the Data
# The encoder learns the mapping: High School=0, Bachelor's=1, Master's=2, PhD=3
df['Education_Encoded'] = encoder.fit_transform(df[['Education_Level']])

print("Data after Ordinal Encoding (Correct Order Maintained):")
print(df)
print("-" * 50)

Data after Ordinal Encoding (Correct Order Maintained):
  Education_Level  Salary  Education_Encoded
0     High School   40000                0.0
1        Master's  100000                2.0
2      Bachelor's   60000                1.0
3             PhD  150000                3.0
4     High School   35000                0.0
--------------------------------------------------


# OneHotEncoder

## Using pandas.get_dummies() (Simple and Fast)

In [13]:
# 1. Prepare Sample Data
data = {
    'City': ['New York', 'London', 'Paris', 'London', 'New York'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Age': [30, 25, 40, 35, 22]
}
df = pd.DataFrame(data)

In [14]:
print("Original Data:")
print(df)
print("-" * 50)

Original Data:
       City  Gender  Age
0  New York    Male   30
1    London  Female   25
2     Paris  Female   40
3    London    Male   35
4  New York  Female   22
--------------------------------------------------


In [15]:
# 2. Apply One Hot Encoding
# The 'columns' argument specifies which columns to encode.
# The 'drop_first=True' argument is optional but highly recommended to avoid multicollinearity.
df_encoded = pd.get_dummies(df, columns=['City', 'Gender'], drop_first=True)

In [16]:
print("Data after One Hot Encoding (using pandas.get_dummies):")
print(df_encoded)

Data after One Hot Encoding (using pandas.get_dummies):
   Age  City_New York  City_Paris  Gender_Male
0   30           True       False         True
1   25          False       False        False
2   40          False        True        False
3   35          False       False         True
4   22           True       False        False


##  Using sklearn.preprocessing.OneHotEncoder (Best for Pipelines)

In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample Data
data = pd.DataFrame({
    'City': ['New York', 'London', 'Paris', 'London', 'New York'],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Age': [30, 25, 40, 35, 22]
})

# Identify categorical features to encode
categorical_features = ['City', 'Gender']

In [18]:
# 1. Initialize the Encoder
# drop='first' is used to avoid multicollinearity.
# sparse_output=False ensures a dense NumPy array is returned.
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

In [19]:
# 2. Fit and Transform the Categorical Data
# We fit only on the categorical columns
encoded_array = ohe.fit_transform(data[categorical_features])


In [20]:
# 3. Get the new feature names
# Use get_feature_names_out to create column names like 'City_London', 'Gender_Male'
feature_names = ohe.get_feature_names_out(categorical_features)

In [21]:
# 4. Create a DataFrame from the encoded array
df_encoded_cats = pd.DataFrame(encoded_array, columns=feature_names, index=data.index)

In [22]:
# 5. Concatenate with the original numerical data
# Drop the original categorical columns before concatenating
df_final = pd.concat([data.drop(columns=categorical_features), df_encoded_cats], axis=1)

print("Final DataFrame after OHE:")
print(df_final)

Final DataFrame after OHE:
   Age  City_New York  City_Paris  Gender_Male
0   30            1.0         0.0          1.0
1   25            0.0         0.0          0.0
2   40            0.0         1.0          0.0
3   35            0.0         0.0          1.0
4   22            1.0         0.0          0.0


# Target Encoding

In [23]:
# !pip install category-encoders --run this 

## Simple Implementation (Without Cross-Validation)

In [24]:
from category_encoders import TargetEncoder

# 1. Prepare Sample Data
# 'City' is the high-cardinality categorical feature.
# 'Default' is the binary target variable (0 or 1).
data = {
    'City': ['London', 'Paris', 'New York', 'London', 'Berlin', 'Paris', 'London', 'Berlin'],
    'Annual_Income': [50000, 60000, 70000, 45000, 80000, 55000, 65000, 75000],
    'Default': [0, 1, 0, 0, 1, 1, 0, 1]  # Target variable (e.g., loan default)
}
df = pd.DataFrame(data)

In [25]:
# Separate features (X) and target (y)
X = df[['City', 'Annual_Income']]
y = df['Default']
print("Original Data:")
print(df)
print("-" * 60)

Original Data:
       City  Annual_Income  Default
0    London          50000        0
1     Paris          60000        1
2  New York          70000        0
3    London          45000        0
4    Berlin          80000        1
5     Paris          55000        1
6    London          65000        0
7    Berlin          75000        1
------------------------------------------------------------


In [26]:
# 2. Initialize the TargetEncoder
# 'smoothing' adds a regularization term to mitigate overfitting.
# The default smoothing parameter is often a good starting point.
encoder = TargetEncoder(cols=['City'], smoothing=1.0)


In [27]:
# 3. Fit and Transform the Data
# The encoder learns the mean of 'Default' for each 'City'.
X_encoded = encoder.fit_transform(X, y)

print("Data after Target Encoding:")
print(X_encoded)
print("-" * 60)

Data after Target Encoding:
   City  Annual_Income
0   0.5          50000
1   0.5          60000
2   0.5          70000
3   0.5          45000
4   0.5          80000
5   0.5          55000
6   0.5          65000
7   0.5          75000
------------------------------------------------------------


In [28]:
# 4. Understanding the Encoding (Mean Values)
# Calculate the mean of 'Default' for each 'City' manually to confirm:
# London: (0+0+0)/3 = 0.0
# Paris: (1+1)/2 = 1.0
# New York: 0/1 = 0.0
# Berlin: (1+1)/2 = 1.0

In [29]:
# The encoded values (e.g., for City) will be slightly different from the exact mean
# due to the 'smoothing' parameter, which pulls the mean slightly towards the global mean.
print("Learned Target Means (Approximated due to Smoothing):")
for category, encoded_value in zip(df['City'].unique(), X_encoded['City'].unique()):
    print(f"{category}: {encoded_value:.4f}")

Learned Target Means (Approximated due to Smoothing):
London: 0.5000
Paris: 0.5000
New York: 0.5000


## Implementation within Cross-Validation (Best Practice)

In [30]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# 2. Define the Pipeline
# Step 1: Target Encode the 'City' column
# Step 2: Train a Logistic Regression model
pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['City'], smoothing=2.0)),
    ('model', LogisticRegression())
])

In [31]:
# 3. Use Cross-Validation to evaluate performance safely
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = []

for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Crucial Step: The encoder is fitted ONLY on the training data (X_train, y_train)
    pipeline.fit(X_train, y_train)

    # The transformation is applied to the test data using the means learned from the training data
    score = pipeline.score(X_test, y_test)
    scores.append(score)

    print(f"Fold {fold+1} Accuracy: {score:.4f}")

print("-" * 60)
print(f"Average Cross-Validation Accuracy: {np.mean(scores):.4f}")

Fold 1 Accuracy: 0.6667
Fold 2 Accuracy: 0.6667
Fold 3 Accuracy: 0.5000
------------------------------------------------------------
Average Cross-Validation Accuracy: 0.6111
