In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load data
df = pd.read_csv("Amazon.csv")
print("Initial rows:", df.shape[0])

# Check missing values
print("\nMissing values per column:\n", df.isnull().sum())

# Map categorical values
category_map = {"Electronics": 0, "Clothing": 1, "Home": 2, "Books": 3, "Beauty": 4}
tier_map = {"Bronze": 0, "Silver": 1, "Gold": 2, "Platinum": 3}
location_map = {"North": 0, "South": 1, "East": 2, "West": 3, "Central": 4}
reason_map = {"None": 0, "Defective": 1, "Wrong Item": 2, "Changed Mind": 3, "Late Delivery": 4, "Other": 5}
is_cod_map = {"Yes": 1, "No": 0}

# Apply mappings
df['product_category'] = df['product_category'].map(category_map)
df['customer_tier'] = df['customer_tier'].map(tier_map)
df['customer_location'] = df['customer_location'].map(location_map)
df['return_reason'] = df['return_reason'].map(reason_map)
df['is_cod'] = df['is_cod'].map(is_cod_map)

# Fill missing values instead of dropping
df.fillna({
    'product_rating': df['product_rating'].mean(),
    'price': df['price'].mean(),
    'delivery_days': df['delivery_days'].mode()[0],
    'customer_tier': 0,
    'return_reason': 0,
    'product_weight_grams': df['product_weight_grams'].mean(),
    'days_to_return': 0,
    'is_cod': 0,
    'product_category': 0,
    'customer_location': 0
}, inplace=True)

# Drop rows only if 'is_returned' is missing (target column)
df.dropna(subset=['is_returned'], inplace=True)

print("Rows after cleaning:", df.shape[0])

# Features and target
X = df[[
    'product_category',
    'price',
    'delivery_days',
    'customer_tier',
    'is_cod',
    'product_rating',
    'customer_location',
    'return_reason',
    'product_weight_grams',
    'days_to_return'
]]
y = df['is_returned']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

print("✅ Training Accuracy:", model.score(X_train_scaled, y_train))
print("✅ Testing Accuracy:", model.score(X_test_scaled, y_test))

# Save
joblib.dump(model, 'customer_return_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("🎉 Model and scaler saved successfully.")


Initial rows: 500

Missing values per column:
 product_category          0
price                     0
delivery_days             0
customer_tier             0
is_cod                    0
product_rating            0
is_returned               0
customer_location         0
return_reason           345
product_weight_grams      0
days_to_return            0
dtype: int64
Rows after cleaning: 500
✅ Training Accuracy: 1.0
✅ Testing Accuracy: 1.0
🎉 Model and scaler saved successfully.


In [3]:
print(df.columns)


Index(['product_category', 'price', 'delivery_days', 'customer_tier', 'is_cod',
       'product_rating', 'is_returned', 'customer_location', 'return_reason',
       'product_weight_grams', 'days_to_return'],
      dtype='object')


In [5]:
df = df.dropna()


In [6]:
df = df.dropna(subset=[
    'product_category', 'price', 'delivery_days', 'customer_tier',
    'is_cod', 'product_rating', 'customer_location', 'return_reason',
    'product_weight_grams', 'days_to_return', 'is_returned'
])
