In [1]:
from itertools import chain

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import scipy


from scipy.sparse.csgraph import laplacian

from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity
from sklearn.cluster import SpectralClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, make_scorer, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import train_test_split

import feature, utils

In [2]:
# Load data
df_data = pd.read_csv('../data/ice-cat-office-products.csv.gz', dtype=str, index_col=0)

category_counts = df_data["category_name"].value_counts()
large_enough_categories = category_counts[category_counts > 20].index.tolist()

df_data = df_data[df_data.category_name.isin(large_enough_categories)]

# Filter out small product categories
# top_categories = utils.find_top_n_categories(df_data, top_n=3)
top_categories = [
    'Folders',                     # N=645
    'Self-Adhesive Labels',        # N=324
    'Multimedia Carts & Stands',   # N=317
    'Calculators',                 # N=305
    'Writing Notebooks'            # N=300
    'Ring Binders',                # N=298
    'Printer Cabinets & Stands',   # N=252
    'Whiteboards',                 # N=232
    'Fax Machines',                # N=210
    'File Storage Boxes'           # N=192
    'Binding Covers',              # N=190
    'Self-Adhesive Note Paper',    # N=172
    'Staplers',                    # N=162
    'Desk Trays/Organizers',       # N=159
    'Board Accessories',           # N=154
    'Seals',                       # N=145
    'Computer Desks',              # N=144
    'Hole Punches',                # N=136
    'Hanging Folders',             # N=132
    'Interactive Whiteboards',     # N=131
    'Laminator Pouches',           # N=128
    'Stationery Tapes',            # N=124
    'Bulletin Boards',             # N=123
    'Tab Indexes',                 # N=114
    'Folder Binding Accessories',  # N=109
    'Non-Adhesive Labels',         # N=103
]
df_data = df_data[df_data.category_name.isin(top_categories)]
print(f"Number of categories: {len(df_data.category_name.unique())}")

Number of categories: 22


In [3]:
df_train, df_test = utils.split_train_test(df_data)

In [4]:
feateure_transformer = feature.BasicIceCatFeatureTransformer(output_size=128)

In [5]:
feateure_transformer.fit(df_train)

Number of sparsely populated columns: 847
Pseudo-key Columns: 
set()
Number of valid columns: 339
Number of columns: 1358


BasicIceCatFeatureTransformer()

In [6]:
X_train = feateure_transformer.transform(df_train)

In [7]:
X_train.shape

(3456, 128)

In [8]:
X_test = feateure_transformer.transform(df_test)



In [9]:
X_test.shape

(865, 128)

In [10]:
label_encoder = LabelEncoder()

In [11]:
label_encoder.fit(df_train.category_name)

LabelEncoder()

In [12]:
y_train = label_encoder.transform(df_train.category_name)
y_test = label_encoder.transform(df_test.category_name)

In [13]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [14]:
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_encoder.classes_))

                            precision    recall  f1-score   support

         Board Accessories       0.90      0.87      0.89        31
           Bulletin Boards       0.86      0.96      0.91        25
               Calculators       1.00      0.98      0.99        61
            Computer Desks       0.87      0.90      0.88        29
     Desk Trays/Organizers       0.92      0.72      0.81        32
              Fax Machines       0.93      0.95      0.94        42
Folder Binding Accessories       0.89      0.77      0.83        22
                   Folders       0.93      0.86      0.90       129
           Hanging Folders       0.83      0.96      0.89        26
              Hole Punches       0.96      0.93      0.94        27
   Interactive Whiteboards       0.96      1.00      0.98        26
         Laminator Pouches       0.81      0.96      0.88        26
 Multimedia Carts & Stands       0.98      0.95      0.97        64
       Non-Adhesive Labels       0.95      0.95

In [15]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_encoder.classes_))

                            precision    recall  f1-score   support

         Board Accessories       0.83      0.97      0.90        31
           Bulletin Boards       0.89      0.96      0.92        25
               Calculators       1.00      0.93      0.97        61
            Computer Desks       0.83      0.86      0.85        29
     Desk Trays/Organizers       1.00      0.88      0.93        32
              Fax Machines       1.00      0.95      0.98        42
Folder Binding Accessories       0.92      0.55      0.69        22
                   Folders       0.86      0.96      0.91       129
           Hanging Folders       0.96      0.92      0.94        26
              Hole Punches       1.00      1.00      1.00        27
   Interactive Whiteboards       1.00      1.00      1.00        26
         Laminator Pouches       0.93      1.00      0.96        26
 Multimedia Carts & Stands       0.97      0.97      0.97        64
       Non-Adhesive Labels       1.00      1.00