# ==================================
# SIMPLE ETL PIPELINE - RETAIL DATA
# ==================================


In [7]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer



## EXTRACT

In [6]:
# Read the customer report CSV into a DataFrame, decoding the file as
# Latin-1 instead of the pandas default (UTF-8).
df = pd.read_csv(
    filepath_or_buffer="Customer_report_cleaned_data.csv",
    encoding="latin1",
)

In [9]:
# Confirm the extract step and report the (rows, columns) of the loaded frame.
initial_shape = df.shape
print("Data extracted successfully")
print("Initial shape:", initial_shape)

Data extracted successfully
Initial shape: (18482, 14)



## TRANSFORM



In [11]:
# Split the columns by dtype family so each group gets its own pipeline.
# Selecting by family ("number", and object/string/category) instead of the
# exact names "int64"/"float64"/"object" matters because ColumnTransformer
# drops, by default, every column that is not listed for some transformer:
# an int32, nullable-integer, string-dtype or category column would otherwise
# vanish from the output without any warning.
numerical_cols = df.select_dtypes(include="number").columns
categorical_cols = df.select_dtypes(
    include=["object", "string", "category"]
).columns

In [12]:

# Preprocessing chain applied to the numeric columns, in order:
#   "imputer" - replaces missing values with the column mean
#   "scaler"  - standardizes the imputed values with StandardScaler
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

In [13]:

# Preprocessing chain applied to the categorical columns, in order:
#   "imputer" - replaces missing values with the most frequent value
#   "encoder" - one-hot encodes the categories; categories not seen during
#               fit are ignored at transform time instead of raising
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [14]:
# Route each column group through its own pipeline and stack the results:
# "num" handles the numeric columns, "cat" handles the categorical ones.
column_transformers = [
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


In [15]:
# Fit the combined preprocessor on the extracted frame and transform that
# same frame in a single call.
processed_data = preprocessor.fit_transform(X=df)

print("Data transformation completed")

Data transformation completed


## LOAD

In [16]:
# Persist the transformed feature matrix.
#
# ColumnTransformer may hand back a SciPy sparse matrix rather than a NumPy
# array (the one-hot encoded block is sparse by default). np.save cannot
# store that natively: it pickles the object into a 0-d object array, which
# np.load then refuses to read unless allow_pickle=True. Sparse results are
# therefore written with scipy.sparse.save_npz (to a .npz file, readable with
# scipy.sparse.load_npz); dense results keep the original .npy output.
from scipy import sparse

if sparse.issparse(processed_data):
    sparse.save_npz("final_customer_data.npz", processed_data)
else:
    np.save("final_customer_data.npy", processed_data)

print("Processed data saved successfully")
print("Final dataset ready for analysis or ML")


Processed data saved successfully
Final dataset ready for analysis or ML
