In [1]:
import warnings

from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import cleanlab


from cleanlab.latent_estimation import compute_confident_joint, estimate_latent
from cleanlab.noise_generation import generate_noise_matrix_from_trace, noise_matrix_is_valid
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, make_scorer, f1_score
from sklearn.metrics import classification_report, pairwise_distances
from sklearn.model_selection import train_test_split
from tqdm.notebook import trange, tqdm

import feature, utils

In [2]:
# Load data
df_data = pd.read_csv('../data/ice-cat-office-products.csv.gz', dtype=str, index_col=0)

category_counts = df_data["category_name"].value_counts()
large_enough_categories = category_counts[category_counts > 20].index.tolist()

df_data = df_data[df_data.category_name.isin(large_enough_categories)]

# Filter out small product categories
# top_categories = utils.find_top_n_categories(df_data, top_n=3)
top_categories = [
    'Folders',                                           # N=645
#     'Self-Adhesive Labels',                              # N=324
#     'Multimedia Carts & Stands',                         # N=317
#     'Calculators',                                       # N=305
#     'Writing Notebooks',                                 # N=300
#     'Ring Binders',                                      # N=298
#     'Printer Cabinets & Stands',                         # N=252
#     'Whiteboards',                                       # N=232
#     'Fax Machines',                                      # N=210
#     'File Storage Boxes',                                # N=192
    'Binding Covers',                                    # N=190
#     'Self-Adhesive Note Paper',                          # N=172
#     'Staplers',                                          # N=162
#     'Desk Trays/Organizers',                             # N=159
#     'Board Accessories',                                 # N=154
#     'Seals',                                             # N=145
#     'Computer Desks',                                    # N=144
#     'Hole Punches',                                      # N=136
#     'Hanging Folders',                                   # N=132
#     'Interactive Whiteboards',                           # N=131
#     'Laminator Pouches',                                 # N=128
#     'Stationery Tapes',                                  # N=124
#     'Bulletin Boards',                                   # N=123
#     'Tab Indexes',                                       # N=114
#     'Folder Binding Accessories',                        # N=109
#     'Non-Adhesive Labels',                               # N=103
#     'Office & Computer Chairs',                          # N=91
#     'Pencil Cases',                                      # N=88
#     'Staples',                                           # N=87
#     'Document Holders',                                  # N=85
#     'Markers',                                           # N=82
#     'Planning Boards',                                   # N=77
#     'Laminators',                                        # N=76
#     'Smart Card Readers',                                # N=75
#     'Stationery & Craft Scissors',                       # N=63
#     'Foot Rests',                                        # N=60
#     'Paper Shredders',                                   # N=60
#     'Felt Pens',                                         # N=56
#     'Lamination Films',                                  # N=55
#     'Sheet Protectors',                                  # N=53
#     'Lecterns',                                          # N=51
#     'Tape Dispensers',                                   # N=50
#     'Pencil Sharpeners',                                 # N=46
#     'Desk Drawer Organizers',                            # N=44
#     'Paper Cutters',                                     # N=42
#     'Ink Pads',                                          # N=42
#     'Dividers',                                          # N=41
#     'Packages',                                          # N=39
#     'Colour Pencils',                                    # N=38
#     'Dry Erase Boards',                                  # N=36
#     'Showcases',                                         # N=36
#     'Erasers',                                           # N=34
#     'Report Covers',                                     # N=34
#     'Computer Furniture Parts',                          # N=33
#     'Workspace Dividers',                                # N=32
#     'Charging Station Organizers',                       # N=31
#     'Rulers',                                            # N=31
#     'Multimedia Cart Accessories',                       # N=30
#     'Magnetic Boards',                                   # N=30
#     'Easels',                                            # N=30
#     'Fax Supplies',                                      # N=29
#     'Crayons',                                           # N=27
#     'Typewriters',                                       # N=24
#     'Interactive Whiteboard Accessories',                # N=24
#     'Paint Markers',                                     # N=22
#     'Pen & Pencil Holders',                              # N=21
#     'Graphite Pencils',                                  # N=21
#     'Ballpoint Pens',                                    # N=21
#     'Pen Refills',                                       # N=21
]
# df_data = df_data[df_data.category_name.isin(top_categories)]
print(f"Number of categories: {len(df_data.category_name.unique())}")

Number of categories: 69


In [3]:
label_encoder = LabelEncoder().fit(df_data["category_name"])
y_given = label_encoder.transform(df_data["category_name"])

In [4]:
n_classes = label_encoder.classes_.shape[0]

In [5]:
class1_index = np.where(label_encoder.classes_ == 'Folders')[0][0]
class2_index = np.where(label_encoder.classes_ == 'Binding Covers')[0][0]

In [6]:
noise_matrix = np.zeros(shape=(n_classes,n_classes))

In [7]:
noise_matrix[class1_index, class2_index] = 0.6
# noise_matrix[class2_index, class1_index] = 0.6

In [8]:
noise_matrix = noise_matrix + np.diag(1 - noise_matrix.sum(axis=1))

In [9]:
py = (df_data.category_name.value_counts() / df_data.shape[0]).to_numpy()
noise_matrix_is_valid(noise_matrix, py)

False