In [4]:
# --- Step 1: Imports ---
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# --- Step 2: Load raw data ---
# Read the raw transaction log; adjust the path to wherever the CSV lives.
df_raw = pd.read_csv("/content/data.csv")
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
df_raw["TransactionStartTime"] = pd.to_datetime(
    df_raw["TransactionStartTime"],
    errors="coerce",
)

# --- Step 3: Aggregate features per customer ---
# Reference point for recency: one day after the latest transaction seen.
snapshot_date = df_raw["TransactionStartTime"].max() + pd.Timedelta(days=1)

# Output column -> (source column, aggregation) for the per-customer rollup.
agg_spec = {
    "TotalAmount": ("Amount", "sum"),
    "AvgAmount": ("Amount", "mean"),
    "TxnCount": ("Amount", "count"),
    "StdAmount": ("Amount", "std"),
    "FirstTxn": ("TransactionStartTime", "min"),
    "LastTxn": ("TransactionStartTime", "max"),
    "UniqueProducts": ("ProductCategory", "nunique"),
    "UniqueChannels": ("ChannelId", "nunique"),
    "UniqueProviders": ("ProviderId", "nunique"),
}
# One row per CustomerId; reset_index() turns the group key back into a column.
features = df_raw.groupby("CustomerId").agg(**agg_spec).reset_index()

# Derived features
# Recency: whole days between the customer's last transaction and the snapshot.
time_since_last = snapshot_date - features["LastTxn"]
features["Recency_days"] = time_since_last.dt.days
# Lifetime: whole days between the customer's first and last transaction.
active_span = features["LastTxn"] - features["FirstTxn"]
features["Lifetime_days"] = active_span.dt.days

# Extract time features
# Calendar month and year of each customer's first transaction.
first_txn = features["FirstTxn"]
features["FirstTxn_month"] = first_txn.dt.month
features["FirstTxn_year"] = first_txn.dt.year

# --- Step 4: Define feature groups ---
# Columns that are median-imputed and standardized.
numeric_cols = [
    "TotalAmount",
    "AvgAmount",
    "TxnCount",
    "StdAmount",
    "UniqueProducts",
    "UniqueChannels",
    "UniqueProviders",
    "Recency_days",
    "Lifetime_days",
]
# Columns that are mode-imputed and one-hot encoded.
categorical_cols = [
    "FirstTxn_month",
    "FirstTxn_year",
]

# --- Step 5: Build preprocessing pipeline ---
# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" makes unseen categories encode as all zeros.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch. Output names are prefixed with the
# branch name ("num__", "cat__"); untouched columns get "remainder__".
column_branches = [
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    # Keeps CustomerId. NOTE: every other unlisted column of the input frame
    # (e.g. FirstTxn / LastTxn) is passed through unchanged as well.
    remainder="passthrough",
)

# --- Step 6: Apply pipeline ---
# Fit the transformers on the per-customer table and transform it in one pass.
X_processed = preprocessor.fit_transform(features)

# pd.DataFrame(..., columns=...) below needs a dense 2-D array; densify in
# case the transformer handed back a sparse matrix.
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

# Get all feature names automatically
feature_names = preprocessor.get_feature_names_out()

# Build DataFrame
processed_df = pd.DataFrame(X_processed, columns=feature_names)

# The passthrough remainder carries non-numeric columns (the CustomerId plus
# the raw FirstTxn / LastTxn timestamps). Stacking them next to the numeric
# output yields an object-dtype array, so every column of the frame would be
# object-typed. Cast the transformed columns back to float so they behave
# numerically for anything that uses processed_df in memory.
model_cols = [
    col for col in processed_df.columns if col.startswith(("num__", "cat__"))
]
processed_df = processed_df.astype({col: float for col in model_cols})

# Rename passthrough CustomerId (rename ignores the key if it is absent).
processed_df = processed_df.rename(columns={"remainder__CustomerId": "CustomerId"})

# --- Step 7: Save Task 3 dataset ---
processed_df.to_csv("/content/processed_task3.csv", index=False)
print(processed_df.head())


  num__TotalAmount num__AvgAmount num__TxnCount num__StdAmount  \
0        -0.066891      -0.153364     -0.253459      -0.095504   
1        -0.066891      -0.153364     -0.253459      -0.095504   
2        -0.055849       -0.06987     -0.212186      -0.083421   
3        -0.061655      -0.091435     -0.150278      -0.145414   
4        -0.055849      -0.073846     -0.201868      -0.088882   

  num__UniqueProducts num__UniqueChannels num__UniqueProviders  \
0           -1.153977           -1.404749            -1.382737   
1           -1.153977           -1.404749            -1.382737   
2           -0.114953            0.450075             0.392594   
3           -0.114953            0.450075            -0.495072   
4           -0.114953            0.450075             0.392594   

  num__Recency_days num__Lifetime_days cat__FirstTxn_month_1  \
0          1.937605          -0.705687                   0.0   
1          1.937605          -0.705687                   0.0   
2          2.1