In [167]:
# E-commerce Return Rate Reduction Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load data
df = pd.read_csv("/content/ecommerce_customer_data_large.csv")

In [None]:
print("Rows, cols:", df.shape)
print("Columns:", list(df.columns)[:40])

Rows, cols: (250000, 13)
Columns: ['Customer ID', 'Purchase Date', 'Product Category', 'Product Price', 'Quantity', 'Total Purchase Amount', 'Payment Method', 'Customer Age', 'Returns', 'Customer Name', 'Age', 'Gender', 'Churn']


In [None]:
df.head()

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn
0,44605,2023-05-03 21:30:02,Home,177,1,2427,PayPal,31,1.0,John Rivera,31,Female,0
1,44605,2021-05-16 13:57:44,Electronics,174,3,2448,PayPal,31,1.0,John Rivera,31,Female,0
2,44605,2020-07-13 06:16:57,Books,413,1,2345,Credit Card,31,1.0,John Rivera,31,Female,0
3,44605,2023-01-17 13:14:36,Electronics,396,3,937,Cash,31,0.0,John Rivera,31,Female,0
4,44605,2021-05-01 11:29:27,Books,259,4,2598,PayPal,31,1.0,John Rivera,31,Female,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Customer ID            250000 non-null  int64  
 1   Purchase Date          250000 non-null  object 
 2   Product Category       250000 non-null  object 
 3   Product Price          250000 non-null  int64  
 4   Quantity               250000 non-null  int64  
 5   Total Purchase Amount  250000 non-null  int64  
 6   Payment Method         250000 non-null  object 
 7   Customer Age           250000 non-null  int64  
 8   Returns                202618 non-null  float64
 9   Customer Name          250000 non-null  object 
 10  Age                    250000 non-null  int64  
 11  Gender                 250000 non-null  object 
 12  Churn                  250000 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 24.8+ MB


In [None]:
# Normalize Returns -> binary 0/1
df['returns_flag'] = df['Returns'].astype(str).str.lower().isin(['1','true','yes','y']).astype(int)
print("Shape of df after normalizing returns:", df.shape)

Shape of df after normalizing returns: (250000, 14)


In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (200000, 7) Test shape: (50000, 7)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Identify categorical and numerical columns
categorical_cols = ['Product Category', 'Payment Method', 'Gender']
numeric_cols = ['Product Price', 'Quantity', 'Total Purchase Amount', 'Customer Age']

In [None]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),   # scale numeric
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # encode categorical
    ]
)


In [162]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [163]:
# Create pipeline (preprocessing + model)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])


In [164]:
model.fit(X_train, y_train)


In [165]:
y_pred = model.predict(X_test)

In [166]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8005

Confusion Matrix:
 [[40025     0]
 [ 9975     0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89     40025
           1       0.00      0.00      0.00      9975

    accuracy                           0.80     50000
   macro avg       0.40      0.50      0.44     50000
weighted avg       0.64      0.80      0.71     50000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
