SHOPPING TRENDS

In [2]:
pip install pandas xgboost scikit-learn flask


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Werkzeug>=3.1 (from flask)
  Using cached werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting Jinja2>=3.1.2 (from flask)
  Downloading jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting click>=8.1.3 (from flask)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting blinker>=1.9 (from flask)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting MarkupSafe>=2.0 (from Jinja2>=3.1.2->flask)
  Using cached MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? et

In [33]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error


In [26]:
df = pd.read_csv("shopping.csv")

In [5]:
df.drop(["Customer ID"], axis=1, inplace=True)

In [6]:
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [7]:
# 🔹 Define Features & Target
X = df.drop("Purchase Amount (USD)", axis=1)  # Features
y = df["Purchase Amount (USD)"]  # Target variable

In [8]:
# 🔹 Split the dataset into 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# 🔹 Train an XGBoost Regressor
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 594.90


In [30]:
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

In [29]:
# 🔹 Save Label Encoders for later use
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

print("✅ Model training complete! Model saved as 'model.pkl'.")

✅ Model training complete! Model saved as 'model.pkl'.


In [14]:
# Encode categorical variables
label_encoders = {}
for col in ['Gender', 'Item Purchased', 'Category', 'Location', 'Size', 'Color', 'Season', 'Shipping Type']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [15]:
features = ['Age', 'Gender', 'Item Purchased', 'Category', 'Purchase Amount (USD)', 
            'Location', 'Size', 'Color', 'Season', 'Review Rating', 'Subscription Status', 'Shipping Type']

In [17]:
X = df[features]

In [18]:
y_sales = df['Purchase Amount (USD)']
X_train, X_test, y_train, y_test = train_test_split(X, y_sales, test_size=0.2, random_state=42)
sales_model = xgb.XGBRegressor(objective='reg:squarederror')
sales_model.fit(X_train, y_train)
sales_pred = sales_model.predict(X_test)
print("Sales Prediction MSE:", mean_squared_error(y_test, sales_pred))

Sales Prediction MSE: 6.749417735818497e-08


In [28]:
# 🎯 2. Inventory Demand Prediction (Regression)
y_inv = df['Purchase Amount (USD)']
X_train, X_test, y_train, y_test = train_test_split(X, y_inv, test_size=0.2, random_state=42)
inv_model = xgb.XGBRegressor(objective='reg:squarederror')
inv_model.fit(X_train, y_train)
inv_pred = inv_model.predict(X_test)
print("Inventory Demand MSE:", mean_squared_error(y_test, inv_pred))

Inventory Demand MSE: 6.749417735818497e-08


In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Encode the target variable
label_encoder = LabelEncoder()
df['Category_encoded'] = label_encoder.fit_transform(df['Category'])  # Convert categories to numbers
y_cust = df['Category_encoded']  # Use encoded labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_cust, test_size=0.2, random_state=42)

# Train the model
cust_model = xgb.XGBClassifier(eval_metric='mlogloss')  # Removed use_label_encoder
cust_model.fit(X_train, y_train)

# Predictions
cust_pred = cust_model.predict(X_test)

# Accuracy score
print("Customer Segmentation Accuracy:", accuracy_score(y_test, cust_pred))


Customer Segmentation Accuracy: 1.0


In [24]:
# 🎯 4. Best-Selling Product Prediction (Classification)
y_best = df['Item Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y_best, test_size=0.2, random_state=42)
best_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
best_model.fit(X_train, y_train)
best_pred = best_model.predict(X_test)
print("Best-Selling Product Classification Accuracy:", accuracy_score(y_test, best_pred))


Parameters: { "use_label_encoder" } are not used.



Best-Selling Product Classification Accuracy: 1.0


In [43]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Encode the target variable
label_encoder = LabelEncoder()
df['Subscription_Status_encoded'] = label_encoder.fit_transform(df['Subscription Status'])  # Convert 'Yes'/'No' to 1/0
y_marketing = df['Subscription_Status_encoded']  # Use encoded labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_marketing, test_size=0.2, random_state=42)

# Train the model
marketing_model = xgb.XGBClassifier(eval_metric='logloss')  # Removed use_label_encoder
marketing_model.fit(X_train, y_train)

# Predictions
marketing_pred = marketing_model.predict(X_test)

# Accuracy score
print("Targeted Marketing Accuracy:", accuracy_score(y_test, marketing_pred))


Targeted Marketing Accuracy: 1.0


In [44]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np


In [45]:

# Sales Model Evaluation
mse_sales = mean_squared_error(y_test, sales_pred)
rmse_sales = np.sqrt(mse_sales)
mae_sales = mean_absolute_error(y_test, sales_pred)
r2_sales = r2_score(y_test, sales_pred)


In [46]:
print("\n🔹 Sales Model Evaluation:")
print(f"MSE: {mse_sales:.4f}")
print(f"RMSE: {rmse_sales:.4f}")
print(f"MAE: {mae_sales:.4f}")
print(f"R² Score: {r2_sales:.4f}")



🔹 Sales Model Evaluation:
MSE: 3920.9048
RMSE: 62.6171
MAE: 57.9654
R² Score: -19255.9883


In [47]:
# Inventory Demand Model Evaluation
mse_inv = mean_squared_error(y_test, inv_pred)
rmse_inv = np.sqrt(mse_inv)
mae_inv = mean_absolute_error(y_test, inv_pred)
r2_inv = r2_score(y_test, inv_pred)


In [48]:
print("\n🔹 Inventory Demand Model Evaluation:")
print(f"MSE: {mse_inv:.4f}")
print(f"RMSE: {rmse_inv:.4f}")
print(f"MAE: {mae_inv:.4f}")
print(f"R² Score: {r2_inv:.4f}")


🔹 Inventory Demand Model Evaluation:
MSE: 3920.9048
RMSE: 62.6171
MAE: 57.9654
R² Score: -19255.9883


In [60]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_classification(y_true, y_pred, model_name):
    # Compute classification metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)

    # Format confusion matrix as a DataFrame
    cm_df = pd.DataFrame(cm, 
                         index=[f"Actual {i}" for i in range(cm.shape[0])], 
                         columns=[f"Pred {i}" for i in range(cm.shape[1])])

    # Print evaluation results
    print(f"\n🔹 {model_name} Model Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}\n")
    
    print("Confusion Matrix:")
    print(cm_df.to_string())  # Display matrix in a readable format

# Evaluate models
evaluate_classification(y_test, cust_pred, "Customer Segmentation")
evaluate_classification(y_test, best_pred, "Best-Selling Products")
evaluate_classification(y_test, marketing_pred, "Targeted Marketing")



🔹 Customer Segmentation Model Evaluation:
Accuracy: 0.3513
Precision: 0.5904
Recall: 0.3513
F1 Score: 0.4118

Confusion Matrix:
          Pred 0  Pred 1  Pred 2  Pred 3
Actual 0     178     250      85      45
Actual 1      71      96      37      18
Actual 2       0       0       0       0
Actual 3       0       0       0       0

🔹 Best-Selling Products Model Evaluation:
Accuracy: 0.0308
Precision: 0.5380
Recall: 0.0308
F1 Score: 0.0571

Confusion Matrix:
           Pred 0  Pred 1  Pred 2  Pred 3  Pred 4  Pred 5  Pred 6  Pred 7  Pred 8  Pred 9  Pred 10  Pred 11  Pred 12  Pred 13  Pred 14  Pred 15  Pred 16  Pred 17  Pred 18  Pred 19  Pred 20  Pred 21  Pred 22  Pred 23  Pred 24
Actual 0       18      34      29      25      21      23      21      23      24      20       24       21       16       23       20       16       25       19       16       20       21       26       26       24       23
Actual 1        8       6       4      11      10       8       6      11      12      