# Categorisation Ecommerce Text: Using Transfer learning (pre-trained model + classification model) 

## Load data and tidy up

Dataset taken from: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)

In [5]:
# Define the file path
data_path = "../data/ecommerceDataset.csv"

# Load the Excel file into a Pandas dataframe
data_df = pd.read_csv(data_path,header=None)

In [6]:
data_df.head()

Unnamed: 0,0,1
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [8]:
# Rename the columns
data_df = data_df.rename(columns={0: 'category', 1: 'product_description'})
data_df.head()

Unnamed: 0,category,product_description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [27]:
data_df.dtypes

category               object
product_description    object
dtype: object

In [9]:
# Check the number of categories is 4 as stated on Kaggle
data_df.category.nunique()

4

In [11]:
# List the categories
data_df.category.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [12]:
# How many data points
data_df.shape

(50425, 2)

In [46]:
from sklearn.model_selection import train_test_split

# split the data into train and test sets while ensuring a proportional distribution of categories
train_set, test_set = train_test_split(data_df, test_size=0.5, stratify=data_df['category'], random_state=42)

# use only a subset of the training set to speed up the training
train_set = train_set.sample(n=2000, random_state=42)
test_set = test_set.sample(n=2000, random_state=42)


In [47]:
train_set.shape

(2000, 2)

In [48]:
test_set.shape

(2000, 2)

## Clean Data

In [49]:
import re

pattern = r"(?:\b\w+\b|['\"“”‘’])"
url_pattern = re.compile(r'http\S+|www\S+')
punct_pattern = re.compile(r'[^\w\s]')
digit_pattern = re.compile(r'\d+')
non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')

def clean_text(text):
    
    # Ensure string type (just in case)
    text = str(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = url_pattern.sub('', text)
    
    # Remove punctuation
    text = punct_pattern.sub(' ', text)
    
    # Remove digits
    text = digit_pattern.sub('', text)
    
    # Remove non-ASCII characters
    text = non_ascii_pattern.sub('', text)   
    
    # Tokenize words and remove single characters
    words = [word for word in re.findall(pattern, text) if len(word) > 1]
    return ' '.join(words)

In [50]:
text = "This is an example text with a URL https://www.example.com, some #hashtag."
cleaned_text = clean_text(text)
print(cleaned_text)

this is an example text with url some hashtag


In [51]:
# Apply it both train and test dataset
train_set["clean_desc"] = train_set["product_description"].apply(lambda x: clean_text(x))
test_set["clean_desc"] = test_set["product_description"].apply(lambda x: clean_text(x))

## Use Pre-Trained Transformers to get embeddings

In [52]:
from sentence_transformers import SentenceTransformer

# Load SentenceTransformer model
model = SentenceTransformer('paraphrase-mpnet-base-v2')

In [53]:
# example encoding:
sentence = ['This is a sample sentence for encoding.']
embedding = model.encode(sentence)

In [54]:
print(embedding)

[[ 4.99693975e-02 -1.26025528e-01 -9.15094614e-02  1.19477045e-02
   9.89145786e-02  9.02947485e-02  1.74566925e-01  1.84450839e-02
  -1.04984418e-01  9.41362581e-04  2.15029195e-01 -2.24005133e-02
  -5.11866026e-02 -7.63527304e-02  2.01650299e-02 -1.40251443e-01
   7.42609948e-02 -1.23938002e-01  8.18222240e-02  1.42685948e-02
   6.02980256e-02 -1.12931551e-02  2.31014867e-03 -3.89685109e-02
  -1.72300726e-01  8.78935121e-03  8.23219214e-03 -2.21334491e-02
   1.09611750e-01  1.55119881e-01  4.06741947e-02  5.67847863e-02
  -3.65059003e-02 -2.21612006e-01  1.61704160e-02 -1.37415916e-01
   1.55395195e-02 -3.26561332e-02 -3.26523185e-01  9.87549722e-02
  -8.87650102e-02  6.88517168e-02 -4.42437492e-02  1.34186819e-02
  -9.72367451e-02  1.23327538e-01  3.90452951e-01 -1.41173169e-01
  -1.36965215e-01  7.42332786e-02 -9.65975672e-02  1.11825109e-01
  -2.45796114e-01  6.72860593e-02  1.88730732e-01  1.16232140e-02
  -4.27793786e-02 -5.41380756e-02  7.80701414e-02  1.07632458e-01
   2.51457

In [55]:
# Generate embeddings for the training data
train_desc = train_set['clean_desc'].tolist()
train_embeddings = model.encode(train_desc)
X_train = np.array(train_embeddings)

## Train the Classifier Model

In [56]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [57]:
# Encode the target variable as categorical with one-hot encoding
le = LabelEncoder()
y_train = le.fit_transform(train_set['category'])

In [60]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the parameter grid for grid search cross-validation
param_grid = {
    'max_depth': [7],
    'n_estimators': [300],
    'learning_rate': [ 0.2, 0.5],
}

# Train an XGBoost model with grid search cross-validation
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
grid_search = GridSearchCV(xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

In [61]:
# Print the best hyperparameters found by grid search
print("Best hyperparameters:", grid_search.best_params_)

# Make predictions on the validation data using the best model
y_pred = grid_search.predict(X_val)

# Decode the predicted target variable
y_pred = le.inverse_transform(y_pred)

# Evaluate the performance of the model
accuracy = np.mean(y_pred == le.inverse_transform(y_val))
print(f"Accuracy: {accuracy:.2f}")

Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
Accuracy: 0.94


## Run Predictions on the Test Data

In [62]:
test_set.head()

Unnamed: 0,category,product_description,clean_desc
39506,Clothing & Accessories,Iuhan Nowborn 2Pcs Infant Baby Girls Fresh Sty...,iuhan nowborn pcs infant baby girls fresh styl...
45224,Electronics,AmazonBasics Apple Certified Lightning to USB ...,amazonbasics apple certified lightning to usb ...
33965,Clothing & Accessories,Yovi Empire Girl's Tapeta Silk Semi-Stitched G...,yovi empire girl tapeta silk semi stitched gow...
31958,Clothing & Accessories,"MSGH Kids Foldable Sunglasses, Multicolour(Pac...",msgh kids foldable sunglasses multicolour pack...
3835,Household,"All Time Plastic Waste Paper Basket, Granite B...",all time plastic waste paper basket granite bi...


In [63]:
import pickle
# Save the trained model to a file
filename = 'xgb_model.sav'
pickle.dump(grid_search, open(filename, 'wb'))


# Load the model from file
loaded_model = pickle.load(open(filename, 'rb'))

# Define function to make predictions on new data
def predict_gl_code(text):
    # Preprocess the text data
    text = clean_text(text)
    
    # Generate embeddings for the text data
    embeddings = model.encode([text], show_progress_bar=False)
    X = np.array(embeddings)
    
    # Make predictions using the loaded model
    y_pred = loaded_model.predict(X)
    gl_code = le.inverse_transform(y_pred)[0]
    return gl_code

# Add a new column for predictions to the test dataframe
test_set['preds'] = test_set['clean_desc'].apply(predict_gl_code)

# Compute the accuracy of the model on the test data
accuracy = np.mean(test_set['preds'] == test_set['category'])
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.91


In [64]:
test_set.head(5)

Unnamed: 0,category,product_description,clean_desc,preds
39506,Clothing & Accessories,Iuhan Nowborn 2Pcs Infant Baby Girls Fresh Sty...,iuhan nowborn pcs infant baby girls fresh styl...,Clothing & Accessories
45224,Electronics,AmazonBasics Apple Certified Lightning to USB ...,amazonbasics apple certified lightning to usb ...,Electronics
33965,Clothing & Accessories,Yovi Empire Girl's Tapeta Silk Semi-Stitched G...,yovi empire girl tapeta silk semi stitched gow...,Clothing & Accessories
31958,Clothing & Accessories,"MSGH Kids Foldable Sunglasses, Multicolour(Pac...",msgh kids foldable sunglasses multicolour pack...,Clothing & Accessories
3835,Household,"All Time Plastic Waste Paper Basket, Granite B...",all time plastic waste paper basket granite bi...,Household


In [65]:
# How many did we get wrong
test_set[test_set['preds'] != test_set['category']].shape

(174, 4)

In [66]:
# Let's take a look
test_set[test_set['preds'] != test_set['category']].head(5)

Unnamed: 0,category,product_description,clean_desc,preds
23745,Books,Lifestyle-You® Adjustable Shoe Slots Organizer...,lifestyle you adjustable shoe slots organizer ...,Household
48643,Electronics,Xodi Eagle/National Roti maker Eagle Made Life...,xodi eagle national roti maker eagle made life...,Household
25677,Books,Generic 1pc 3 Finger Pool Shooters Billiard Gl...,generic pc finger pool shooters billiard glove...,Electronics
25102,Books,"Rocksport CLIROCL003 Climbing Holds, Pack of 5...",rocksport clirocl climbing holds pack of multi...,Household
25548,Books,Ariel Matic Top Load Detergent Washing Powder ...,ariel matic top load detergent washing powder ...,Household


### Libraries Required

In [69]:
# !pip install jupyter 
# !pip install pandas  
# !pip install scikit-learn
# !pip install torch
# !pip install sentence-transformers
# !pip install transformers
# !pip install xgboost