In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
# Toy dataset: five people, one list per column (all lists have equal length).
data = dict(
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    Education=['Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters'],
    Gender=['Male', 'Female', 'Female', 'Male', 'Male'],
    Experience=[5, 10, 15, 20, 25],
    Salary=[50000, 60000, 80000, 90000, 120000],
)

In [3]:
# Build the working DataFrame: each key of `data` becomes a column.
df = pd.DataFrame(data=data)

In [None]:
# Separate the predictors (X) from the value we want to predict (y).
feature_columns = ['City', 'Education', 'Gender', 'Experience']
target_column = 'Salary'

X = df[feature_columns]  # feature matrix
y = df[target_column]    # target vector

In [None]:
# Encoding plan for the categorical features:
#   * 'City'      -> OneHotEncoder  (nominal: the categories have no order)
#   * 'Education' -> OrdinalEncoder (ordinal: Bachelors < Masters < PhD)
#   * 'Gender'    -> LabelEncoder, applied separately later on
# NOTE(review): 'Gender' and 'Experience' are not listed below, so this
# transformer does not handle them -- presumably they are dropped from its
# output under ColumnTransformer's default `remainder` setting; confirm.
city_encoder = OneHotEncoder()
education_encoder = OrdinalEncoder(categories=[['Bachelors', 'Masters', 'PhD']])

# Map each encoder to the column(s) it is responsible for.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', city_encoder, ['City']),
        ('ordinal', education_encoder, ['Education']),
    ]
)

In [None]:
# Wrap the column-wise preprocessor in a single-step pipeline.
pipe = Pipeline([('preprocessor', preprocessor)])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the encodings from the training rows and apply them in one call.
X_train_transformed = pipe.fit_transform(X_train)

train_header = "Transformed Training Data (after encoding):\n"
print(train_header, X_train_transformed)

In [None]:
# Transform the test data.
# This call is expected to fail: the OneHotEncoder in this pipeline was fitted
# on X_train only and has no rule for categories it did not see during fit.
# The exception is caught and displayed so the demonstration stays visible
# while the notebook can still be executed top to bottom (Restart & Run All).
try:
    X_test_transformed = pipe.transform(X_test)
except ValueError as err:
    print("Transforming the test data failed as expected:\n", err)
else:
    print("Transformed Test Data (after encoding):\n", X_test_transformed)

# The error "ValueError: Found unknown categories ['Los Angeles'] in column 0
# during transform" occurs because OneHotEncoder is asked to transform the test
# data (X_test) and encounters a category ('Los Angeles') that was not seen
# during the training phase (fit_transform on X_train).

# To prevent this, pass handle_unknown='ignore' to OneHotEncoder.

In [None]:
# Rebuild the preprocessor; the only change is handle_unknown='ignore' on the
# one-hot encoder, so categories unseen during fit no longer raise in transform.
encoding_steps = [
    # Nominal column: one-hot, tolerant of unknown categories in the test set.
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['City']),
    # Ordinal column: explicit category order Bachelors < Masters < PhD.
    ('ordinal', OrdinalEncoder(categories=[['Bachelors', 'Masters', 'PhD']]), ['Education']),
]
preprocessor = ColumnTransformer(transformers=encoding_steps)

In [None]:
# Rebuild the pipeline so that it picks up the current `preprocessor`.
pipeline_steps = [('preprocessor', preprocessor)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Re-create the train/test split (20% test rows, seed 42).
split_kwargs = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [None]:
# Fit the pipeline on the training rows and encode them in the same call.
X_train_transformed = pipe.fit_transform(X_train)
print(
    "Transformed Training Data (after encoding):\n",
    X_train_transformed,
)

In [None]:
# Transform the test data with the encoders that were fitted on the training split.
X_test_transformed = pipe.transform(X_test)

# No error is expected here: this pipeline's OneHotEncoder was created with
# handle_unknown='ignore', so a city that was absent from the training split is
# encoded as an all-zero one-hot row instead of raising ValueError.
print("Transformed Test Data (after encoding):\n", X_test_transformed)


In [None]:
# Encode the 'Gender' column separately with LabelEncoder.
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values (y);
# for feature columns an OrdinalEncoder/OneHotEncoder entry in the
# ColumnTransformer is the usual choice. It is kept here for demonstration.
le = LabelEncoder()

# Build new frames with .assign() instead of writing into the frames returned by
# train_test_split. Depending on the pandas/scikit-learn versions, in-place
# column assignment on those frames can emit SettingWithCopyWarning; .assign()
# always returns a fresh DataFrame, so the write is unambiguous.
X_train = X_train.assign(Gender=le.fit_transform(X_train['Gender']))
# Use `transform` (not `fit_transform`) so the test rows reuse the mapping
# learned from the training rows.
X_test = X_test.assign(Gender=le.transform(X_test['Gender']))

In [None]:
# Show the label-encoded 'Gender' column of each split.
# (This cell only prints the columns; it does not merge them into the
# transformed arrays.)
gender_train = X_train['Gender']
gender_test = X_test['Gender']

print("\nEncoded Gender Column for Training:\n", gender_train)
print("\nEncoded Gender Column for Test:\n", gender_test)