In [1]:
# Product Category Prediction

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset

df = pd.read_csv(r"C:\Users\vishw\OneDrive\Desktop\Machine learning\Project automated product category classification\product_sales_dataset_final.csv")


In [3]:
# 1. CHECK BALANCE

print(df['Category'].value_counts())

Category
Clothing & Apparel    62298
Electronics           51230
Home & Furniture      50564
Accessories           35908
Name: count, dtype: int64


In [4]:
# 2. BALANCE DATASET

df_ele   = df[df['Category'] == 'Electronics'].sample(20000, random_state=42)
df_cloth = df[df['Category'] == 'Clothing & Apparel'].sample(20000, random_state=42)
df_home  = df[df['Category'] == 'Home & Furniture'].sample(20000, random_state=42)
df_acc   = df[df['Category'] == 'Accessories'].sample(20000, random_state=42)

df_new = pd.concat([df_ele, df_cloth, df_home, df_acc])
df_new = df_new.sample(frac=1, random_state=42)

print(df_new['Category'].value_counts())

Category
Home & Furniture      20000
Accessories           20000
Electronics           20000
Clothing & Apparel    20000
Name: count, dtype: int64


In [5]:
df_new.columns = df_new.columns.str.strip().str.lower()

In [6]:
df_new

Unnamed: 0,order_id,order_date,customer_name,city,state,region,country,category,sub_category,product_name,quantity,unit_price,revenue,profit
25444,25445,11-09-23,Michael Wilson,Chandler,Arizona,West,United States,Home & Furniture,Furniture,Office Chair,1,368.11,368.11,79.30
16100,16101,05-14-24,Max Franklin,Lincoln,Nebraska,Centre,United States,Home & Furniture,Home Decor,Throw Pillows,3,312.29,936.87,211.26
181814,181815,06-29-23,Melanie Simpson,Dallas,Texas,South,United States,Accessories,Bags,Wallet,2,70.10,140.20,26.20
82760,82761,11-13-23,Jeffrey Hill,Mesa,Arizona,West,United States,Accessories,Bags,Wallet,1,200.72,200.72,38.85
27394,27395,06-30-23,Luis Gordon,Colorado Springs,Colorado,West,United States,Home & Furniture,Kitchenware,Cookware Set,5,853.00,4265.00,672.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111967,111968,01-22-23,Tammy Mcdowell,Austin,Texas,South,United States,Electronics,Wearables,Apple Watch,6,934.78,5608.68,614.91
123396,123397,05-04-24,Julie Christensen,Chandler,Arizona,West,United States,Home & Furniture,Bedding,Tempur-Pedic Mattress,4,880.13,3520.52,1042.17
89201,89202,05-29-24,Laura Turner,Albuquerque,New Mexico,West,United States,Accessories,Bags,Tote Bag,2,95.76,191.52,55.88
103621,103622,06-29-23,Ashley Lewis,Pittsburgh,Pennsylvania,East,United States,Electronics,Laptops,Dell XPS 13,1,694.87,694.87,69.25


In [7]:
df_new['unit_price'] = df_new['revenue'] / df_new['quantity']

In [8]:
margin_map = {
    "Electronics": 0.15,          # 15%
    "Clothing & Apparel": 0.60,   # 60%
    "Home & Furniture": 0.35,     # 35%
    "Accessories": 0.50           # 50%
}


In [9]:
df_new["purchase_price"] = df_new.apply(
    lambda row: row["unit_price"] * (1 - margin_map[row["category"]]),
    axis=1
)

In [10]:
df_new["revenue"] = df_new["quantity"] * df_new["unit_price"]
df_new["profit"] = df_new["revenue"] - (df_new["quantity"] * df_new["purchase_price"])

In [11]:
df_new.head()

Unnamed: 0,order_id,order_date,customer_name,city,state,region,country,category,sub_category,product_name,quantity,unit_price,revenue,profit,purchase_price
25444,25445,11-09-23,Michael Wilson,Chandler,Arizona,West,United States,Home & Furniture,Furniture,Office Chair,1,368.11,368.11,128.8385,239.2715
16100,16101,05-14-24,Max Franklin,Lincoln,Nebraska,Centre,United States,Home & Furniture,Home Decor,Throw Pillows,3,312.29,936.87,327.9045,202.9885
181814,181815,06-29-23,Melanie Simpson,Dallas,Texas,South,United States,Accessories,Bags,Wallet,2,70.1,140.2,70.1,35.05
82760,82761,11-13-23,Jeffrey Hill,Mesa,Arizona,West,United States,Accessories,Bags,Wallet,1,200.72,200.72,100.36,100.36
27394,27395,06-30-23,Luis Gordon,Colorado Springs,Colorado,West,United States,Home & Furniture,Kitchenware,Cookware Set,5,853.0,4265.0,1492.75,554.45


In [12]:
# 3. FEATURES & TARGET

X = df_new[['quantity', 'unit_price', 'purchase_price' ,'revenue', 'profit']]
y = df_new['category']

In [13]:
# 4. ENCODING

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(y)

In [14]:
# 5. SCALING (MANDATORY FOR SVM)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
# 6. TRAIN TEST SPLIT

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_enc, test_size=0.2, random_state=42
)


In [16]:
# Feature Selection using RFECV
# RFECV (Recursive Feature Elimination with Cross-Validation) is a powerful technique for selecting the most relevant features for a model. It works by recursively removing features and evaluating the model's performance using cross-validation to determine the optimal number of features.

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# enable probability so scorer can use predict_proba (optional but safe)
svc = SVC(
    kernel='rbf',
    C=10,
    gamma=0.1,
    class_weight='balanced',
    probability=True
)

rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(5),
    scoring='roc_auc_ovr',   # use multiclass-compatible scorer ('ovr' or 'ovo')
    n_jobs=-1
)


# RFECV needs an estimator that exposes coef_ or feature_importances_
linear_svc = LinearSVC(C=1.0, class_weight='balanced', max_iter=10000)
rfecv.estimator = linear_svc


# LinearSVC has no predict_proba — use a scorer that doesn't require probabilities
rfecv.scoring = 'f1_weighted'   # alternatives: 'accuracy', 'balanced_accuracy'
rfecv.fit(X_train, y_train)

print("Optimal number of features:", rfecv.n_features_)


Optimal number of features: 2


In [17]:
X_train_selected = rfecv.transform(X_train)
X_test_selected = rfecv.transform(X_test)

print("Before:", X_train.shape)
print("After :", X_train_selected.shape)


Before: (64000, 5)
After : (64000, 2)


In [18]:
# 7. MODEL TRAINING

from sklearn.svm import SVC

model = SVC(
    kernel='rbf',
    C=10,
    gamma=0.1,
    class_weight='balanced',
    probability=True
)

model.fit(X_train_selected, y_train)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",10
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",0.1
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",True
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",'balanced'


In [19]:
# 8. EVALUATION

from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test_selected)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.99975
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4008
           1       1.00      1.00      1.00      3934
           2       1.00      1.00      1.00      3982
           3       1.00      1.00      1.00      4076

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000



In [20]:
# 9. SAVE MODEL

# joblib already imported at the top of the notebook
joblib.dump(rfecv, "rfecv_selector.pkl")
joblib.dump(model, "svm_model.pkl")
joblib.dump(scaler, "scaler.pkl")
# use the fitted LabelEncoder instance `le`
joblib.dump(le, "label_encoder.pkl")

print("Model, scaler, encoder saved successfully")

Model, scaler, encoder saved successfully


check krne ke liye  : 

* **Quantity = 2**
* **Unit_price = 203**
* **purchase_price = 110**

* **output : accessories**

# --------------------------------------------------------

* **Quantity = 4**
* **Unit_price = 377**
* **purchase_price = 322**

* **output : different category**