In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("shopping_trends.csv")

In [3]:
print(df.columns)

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Payment Method',
       'Shipping Type', 'Discount Applied', 'Promo Code Used',
       'Previous Purchases', 'Preferred Payment Method',
       'Frequency of Purchases'],
      dtype='object')


In [35]:
df.fillna(df.median(numeric_only=True), inplace=True)

In [36]:
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [45]:
important_features = ["Category", "Purchase Amount (USD)", "Location", "Season", "Previous Purchases", "Frequency of Purchases"]
X = df[important_features]

y = df["Purchase Amount (USD)"]

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [48]:
# sales_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)
# sales_model.fit(X_train, y_train)
sales_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,      # More trees for better learning
    learning_rate=0.05,    # Lower learning rate for stability
    max_depth=6,           # Control tree depth to prevent overfitting
    subsample=0.8,         # Use 80% of data per tree for regularization
    colsample_bytree=0.8,  # Use 80% of features per tree
    random_state=42
)


In [49]:
sales_model.fit(X_train, y_train)


In [50]:
y_pred_sales = sales_model.predict(X_test)

In [51]:
print("\n🔹 Sales Trends Forecasting Performance:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_sales)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_sales)}")
print(f"R² Score: {r2_score(y_test, y_pred_sales)}")


🔹 Sales Trends Forecasting Performance:
MAE: 0.7324298222859701
MSE: 1.0514177287092306
R² Score: 0.9981210827827454


In [52]:
sales_pred = sales_model.predict(X_test)
print("Sales Prediction MSE:", mean_squared_error(y_test, sales_pred))


Sales Prediction MSE: 1.0514177287092306


In [14]:
print(df['Purchase Amount (USD)'].describe())


count    3900.000000
mean       59.764359
std        23.685392
min        20.000000
25%        39.000000
50%        60.000000
75%        81.000000
max       100.000000
Name: Purchase Amount (USD), dtype: float64


In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Example: Customer data
customer_data = pd.DataFrame({
    'Customer ID': [1234],
    'Age': [30],
    'Gender': ['Male'],
    'Item Purchased': ['Product B'],
    'Category': ['Clothing'],
    'Location': ['Los Angeles'],
    'Size': ['L'],
    'Color': ['Blue'],
    'Season': ['Winter'],
    'Review Rating': [5],
    'Subscription Status': ['Active'],
    'Payment Method': ['PayPal'],
    'Shipping Type': ['Express'],
    'Discount Applied': [False],
    'Promo Code Used': [True],
    'Previous Purchases': [10],
    'Preferred Payment Method': ['Credit Card'],
    'Frequency of Purchases': [5]
})

# One-Hot Encoding for categorical columns
customer_data = pd.get_dummies(customer_data, drop_first=True)

# Step 1: Extract the feature names from the model
model_feature_names = sales_model.get_booster().feature_names

# Step 2: Align the input data with the model's expected feature names

# Create a DataFrame with columns from the model's feature names
aligned_data = pd.DataFrame(columns=model_feature_names)

# Fill the aligned data with values from the customer data where possible
for col in customer_data.columns:
    if col in aligned_data.columns:
        aligned_data[col] = customer_data[col]

# Step 3: Ensure all required features are present in the aligned data
# Fill missing columns with zeros (or use NaN if your model can handle it)
aligned_data = aligned_data.fillna(0)

# Step 4: Predict customer's future purchase amount
y_customer_pred = sales_model.predict(aligned_data)

print(f"Predicted Purchase Amount for Customer 1234: ${y_customer_pred[0]:.2f}")


Predicted Purchase Amount for Customer 1234: $24.33


  aligned_data = aligned_data.fillna(0)


In [69]:
df["Predicted_Amount"] = sales_model.predict(X)

In [81]:
# Compute Sales Per Season
seasonal_sales = df.groupby("Season")["Predicted_Amount"].sum().sort_values(ascending=False)

# Display Seasonal Trends
print("Sales by Season:")
print(seasonal_sales)


Sales by Season:
Season
Fall      60031.578125
Spring    58714.769531
Winter    58625.097656
Summer    55772.769531
Name: Predicted_Amount, dtype: float32


In [73]:
df["Total_Purchase"] = df.groupby("Customer ID")["Predicted_Amount"].transform("sum")


In [74]:

# Recreate the encoder and fit it on the original dataset
encoder = LabelEncoder()
df["Item Purchased"] = encoder.fit_transform(df["Item Purchased"])  # Encoding step

# Get the mapping back
item_mapping = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))

# Replace numbers with original names
df["Item Purchased"] = df["Item Purchased"].map(item_mapping)

# Check the result
print(df[["Item Purchased"]].head())

  Item Purchased
0         Blouse
1        Sweater
2          Jeans
3        Sandals
4         Blouse


In [75]:
# Compute Sales Per Item
popular_items = df.groupby("Item Purchased")["Predicted_Amount"].sum().sort_values(ascending=False)

# Display Top 5 Selling Items
print("Top 5 Popular Items:")
print(popular_items.head(5))


Top 5 Popular Items:
Item Purchased
Blouse     10426.342773
Shirt      10337.475586
Dress      10312.915039
Pants      10087.345703
Jewelry    10009.652344
Name: Predicted_Amount, dtype: float32


In [61]:
# Check unique values in 'Item Purchased'
print(df["Item Purchased"].unique())


[ 2 23 11 14 20 16 18  4  7 17  5 19 22 13 10  9 12 24 15  8 21  0  1  3
  6]


In [64]:
dfs = pd.read_csv("shopping_trends.csv")

In [65]:
print(dfs["Item Purchased"].unique())


['Blouse' 'Sweater' 'Jeans' 'Sandals' 'Sneakers' 'Shirt' 'Shorts' 'Coat'
 'Handbag' 'Shoes' 'Dress' 'Skirt' 'Sunglasses' 'Pants' 'Jacket' 'Hoodie'
 'Jewelry' 'T-shirt' 'Scarf' 'Hat' 'Socks' 'Backpack' 'Belt' 'Boots'
 'Gloves']


In [66]:

# Recreate the encoder and fit it on the original dataset
encoder = LabelEncoder()
df["Item Purchased"] = encoder.fit_transform(df["Item Purchased"])  # Encoding step

# Get the mapping back
item_mapping = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))

# Replace numbers with original names
df["Item Purchased"] = df["Item Purchased"].map(item_mapping)

# Check the result
print(df[["Item Purchased"]].head())

  Item Purchased
0         Blouse
1        Sweater
2          Jeans
3        Sandals
4         Blouse


Customer Segmentation using XGBoost