In [2]:
# ============================================================
# DATA PREPROCESSING
# One-Hot Encoding (Features) + Label Encoding (Target)
# BEGINNER FRIENDLY
# ============================================================

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder



# ------------------------------------------------------------
# 1. Load the Dataset
# ------------------------------------------------------------
# The CSV path is relative, so it is resolved against the current
# working directory.
df = pd.read_csv("customer_purchase_data.csv")

print("Original Dataset", df, sep="\n")

# ------------------------------------------------------------
# 2. Separate Features (X) and Target (y)
# ------------------------------------------------------------
# "Purchased" is the column to predict; every other column is a feature.
y = df["Purchased"]
X = df.drop(columns=["Purchased"])

# ------------------------------------------------------------
# 3. Label Encode the Target Column
# ------------------------------------------------------------
# The target holds text labels; they are replaced by integer codes here.
# LabelEncoder numbers the classes in sorted order, so:
#   "No"  -> 0
#   "Yes" -> 1
# The fitted encoder is kept so the codes can be mapped back to labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("\nEncoded Target (y)", y_encoded, sep="\n")

# ------------------------------------------------------------
# 4. Identify Categorical and Numerical Columns
# ------------------------------------------------------------
# Numerical columns are passed through unchanged; categorical columns
# are one-hot encoded in the next step.
numerical_columns = ["Age"]
categorical_columns = ["Gender", "City", "Product", "Education"]

# ------------------------------------------------------------
# 5. One-Hot Encode Categorical Feature Columns
# ------------------------------------------------------------
# drop="first" leaves out the first category of every feature, which
# avoids perfectly collinear dummy columns.
# sparse_output=False asks for a dense array instead of a sparse matrix.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_features = encoder.fit_transform(X[categorical_columns])

# Wrap the encoded array in a DataFrame whose column names come from the
# fitted encoder.
encoded_df = pd.DataFrame(
    data=encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# ------------------------------------------------------------
# 6. Combine Encoded Features with Numerical Columns
# ------------------------------------------------------------
# encoded_df carries a fresh 0..n-1 index, so the numerical columns get
# their index reset too; the two frames then line up row by row.
X_final = pd.concat(
    objs=[encoded_df, X[numerical_columns].reset_index(drop=True)],
    axis="columns",
)

# ------------------------------------------------------------
# 7. Final Data Ready for Modeling
# ------------------------------------------------------------
print("\nFinal Feature Matrix (X_final)", X_final, sep="\n")
print("\nFinal Target Vector (y_encoded)", y_encoded, sep="\n")


Original Dataset
     Gender       City Product    Education  Age Purchased
0      Male     Mumbai  Laptop     Bachelor   34        No
1    Female    Chennai  Tablet  High School   31        No
2      Male    Chennai   Phone     Bachelor   38        No
3      Male  Bangalore   Phone       Master   32       Yes
4      Male     Mumbai   Phone  High School   43        No
..      ...        ...     ...          ...  ...       ...
495    Male    Chennai  Tablet     Bachelor   40        No
496    Male    Chennai  Laptop       Master   38       Yes
497  Female  Bangalore  Tablet     Bachelor   49        No
498    Male  Bangalore  Tablet     Bachelor   26        No
499  Female     Mumbai  Tablet     Bachelor   52        No

[500 rows x 6 columns]

Encoded Target (y)
[0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0
 0 0 0