In [1]:
import json, warnings
from typing import List

from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import cleanlab


from cleanlab.latent_estimation import compute_confident_joint, estimate_latent
from cleanlab.noise_generation import generate_noise_matrix_from_trace, noise_matrix_is_valid, generate_noisy_labels
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, make_scorer, f1_score
from sklearn.metrics import classification_report, pairwise_distances
from sklearn.model_selection import train_test_split
from tqdm.notebook import trange, tqdm

import feature, utils

In [2]:
import sys
from os import path

# Put the parent of the current working directory at the front of the import
# path (once), so that packages living there can be imported below.
project_dir = path.abspath("..")
if sys.path.count(project_dir) == 0:
    sys.path.insert(0, project_dir)
    
from app.data.datasets import DataSet, get_dataset

In [3]:
   
def _class_index(classes: np.ndarray, name: str) -> int:
    """Return the position of the first entry of ``classes`` equal to ``name``."""
    matches = np.flatnonzero(classes == name)
    if matches.size == 0:
        # Same exception type a bare ``np.where(...)[0][0]`` lookup raises,
        # but with a message that names the offending class.
        raise IndexError(f"Class {name!r} from the noise configuration is not in `classes`")
    return int(matches[0])


def load_noise_matrix_from_file(file_path: str, classes: List[str]) -> np.ndarray:
    """Build a class-conditional noise matrix from a JSON noise configuration.

    The JSON document must be an object with ``"type": "fixed"`` and a
    ``"transitions"`` list. Every transition has the keys ``"from"`` (true
    class name), ``"to"`` (noisy class name) and ``"rate"`` (probability of
    that flip), plus an optional boolean ``"symmetric"`` (default ``False``)
    that applies the same rate in the opposite direction as well.

    Args:
        file_path: Path of the JSON configuration file.
        classes: Class names; their order defines the row/column order of
            the returned matrix.

    Returns:
        A ``(len(classes), len(classes))`` array in which entry ``[i, j]`` is
        the probability that an example whose true class is ``classes[j]``
        receives the label ``classes[i]``. The diagonal holds the remaining
        probability mass, so every column sums to 1.

    Raises:
        ValueError: If the configuration ``"type"`` is not ``"fixed"``.
        IndexError: If a transition names a class that is not in ``classes``.
    """
    classes = np.array(classes)
    n_classes = classes.shape[0]

    with open(file_path, mode="r", encoding="utf-8") as fp:
        noise_config = json.load(fp)

    if noise_config["type"] != "fixed":
        raise ValueError("Unknown type: " + str(noise_config["type"]))

    noise_matrix = np.zeros(shape=(n_classes, n_classes))

    for transition in noise_config["transitions"]:
        from_index = _class_index(classes, transition["from"])
        to_index = _class_index(classes, transition["to"])
        # Column = true class, row = noisy label: P(label = to | true = from).
        noise_matrix[to_index, from_index] = transition["rate"]
        if transition.get("symmetric", False):
            noise_matrix[from_index, to_index] = transition["rate"]

    # Ensure that the noise matrix is a column stochastic matrix, i.e. column
    # entries sum to 1: put each column's left-over probability on the diagonal.
    noise_matrix = noise_matrix + np.diag(1 - noise_matrix.sum(axis=0))

    return noise_matrix


In [4]:
# Read the product table with every column as a string; the first CSV column
# becomes the index.
df_data = pd.read_csv(
    '../data/ice-cat-office-products.csv.gz',
    dtype=str,
    index_col=0,
)

# Keep only the rows whose category occurs at least 20 times.
category_counts = df_data["category_name"].value_counts()
large_enough_categories = category_counts.loc[category_counts >= 20].index.tolist()

df_data = df_data.loc[df_data["category_name"].isin(large_enough_categories)]

# Hand-picked category whitelist. Only the un-commented names are part of the
# list; the trailing "N=..." notes are presumably per-category row counts
# (TODO confirm against `category_counts`).
# NOTE: the whitelist is NOT applied in this cell -- the filter line after the
# list is commented out, so `df_data` still holds every category that passed
# the size filter above.
# top_categories = utils.find_top_n_categories(df_data, top_n=3)
top_categories = [
    'Folders',                                           # N=645
#     'Self-Adhesive Labels',                              # N=324
#     'Multimedia Carts & Stands',                         # N=317
#     'Calculators',                                       # N=305
#     'Writing Notebooks',                                 # N=300
#     'Ring Binders',                                      # N=298
#     'Printer Cabinets & Stands',                         # N=252
#     'Whiteboards',                                       # N=232
#     'Fax Machines',                                      # N=210
#     'File Storage Boxes',                                # N=192
    'Binding Covers',                                    # N=190
#     'Self-Adhesive Note Paper',                          # N=172
#     'Staplers',                                          # N=162
#     'Desk Trays/Organizers',                             # N=159
#     'Board Accessories',                                 # N=154
#     'Seals',                                             # N=145
#     'Computer Desks',                                    # N=144
#     'Hole Punches',                                      # N=136
#     'Hanging Folders',                                   # N=132
#     'Interactive Whiteboards',                           # N=131
#     'Laminator Pouches',                                 # N=128
#     'Stationery Tapes',                                  # N=124
#     'Bulletin Boards',                                   # N=123
#     'Tab Indexes',                                       # N=114
#     'Folder Binding Accessories',                        # N=109
#     'Non-Adhesive Labels',                               # N=103
#     'Office & Computer Chairs',                          # N=91
#     'Pencil Cases',                                      # N=88
#     'Staples',                                           # N=87
#     'Document Holders',                                  # N=85
#     'Markers',                                           # N=82
#     'Planning Boards',                                   # N=77
#     'Laminators',                                        # N=76
#     'Smart Card Readers',                                # N=75
#     'Stationery & Craft Scissors',                       # N=63
#     'Foot Rests',                                        # N=60
#     'Paper Shredders',                                   # N=60
#     'Felt Pens',                                         # N=56
#     'Lamination Films',                                  # N=55
#     'Sheet Protectors',                                  # N=53
#     'Lecterns',                                          # N=51
#     'Tape Dispensers',                                   # N=50
#     'Pencil Sharpeners',                                 # N=46
#     'Desk Drawer Organizers',                            # N=44
#     'Paper Cutters',                                     # N=42
#     'Ink Pads',                                          # N=42
#     'Dividers',                                          # N=41
#     'Packages',                                          # N=39
#     'Colour Pencils',                                    # N=38
#     'Dry Erase Boards',                                  # N=36
#     'Showcases',                                         # N=36
#     'Erasers',                                           # N=34
#     'Report Covers',                                     # N=34
#     'Computer Furniture Parts',                          # N=33
#     'Workspace Dividers',                                # N=32
#     'Charging Station Organizers',                       # N=31
#     'Rulers',                                            # N=31
#     'Multimedia Cart Accessories',                       # N=30
#     'Magnetic Boards',                                   # N=30
#     'Easels',                                            # N=30
#     'Fax Supplies',                                      # N=29
#     'Crayons',                                           # N=27
#     'Typewriters',                                       # N=24
#     'Interactive Whiteboard Accessories',                # N=24
#     'Paint Markers',                                     # N=22
#     'Pen & Pencil Holders',                              # N=21
#     'Graphite Pencils',                                  # N=21
#     'Ballpoint Pens',                                    # N=21
#     'Pen Refills',                                       # N=21
]
# Disabled: restrict the data to the whitelist above.
# df_data = df_data[df_data.category_name.isin(top_categories)]
# Report how many distinct categories remain in `df_data`.
print(f"Number of categories: {len(df_data.category_name.unique())}")

Number of categories: 72


## Load Noise Matrix from File

In [5]:
# Fit an encoder that maps each category name to an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(df_data["category_name"])

In [6]:
# Integer-encoded category of every row, used as the "given" labels that the
# noise is injected into further down.
y_given = label_encoder.transform(df_data["category_name"])

In [7]:
!cat ../config/noise/ice-cat-office/folder-binding-covers-sym-0.1.json

{
    "type": "fixed",
    "transitions": [
        {
            "from": "Folders",
            "to": "Binding Covers",
            "symmetric": true,
            "rate": 0.1
        }
    ]
}


In [8]:
# Build the noise matrix from the JSON config, with rows/columns ordered like
# the encoder's classes.
noise_matrix = load_noise_matrix_from_file(
    file_path="../config/noise/ice-cat-office/folder-binding-covers-sym-0.1.json",
    classes=label_encoder.classes_,
)

In [9]:
# Show the matrix loaded from the config file (NumPy abbreviates large arrays).
noise_matrix

array([[1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0.9, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 1. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ]])

In [10]:
# Empirical class prior of the given labels, followed by cleanlab's validity
# check of the file-based noise matrix against that prior.
unique, counts = np.unique(y_given, return_counts=True)
py = counts / y_given.size
noise_matrix_is_valid(noise_matrix, py)

True

## Generate Noise Matrix Manually

In [11]:
# Number of distinct classes known to the encoder.
n_classes = len(label_encoder.classes_)

In [12]:
# Encoded positions of the two classes whose labels will be swapped.
class1_index = np.flatnonzero(label_encoder.classes_ == 'Folders')[0]
class2_index = np.flatnonzero(label_encoder.classes_ == 'Binding Covers')[0]

In [13]:
# Start from an all-zero matrix and set a 0.1 flip rate in both directions
# between the two selected classes.
noise_matrix = np.zeros((n_classes, n_classes))
noise_matrix[[class1_index, class2_index], [class2_index, class1_index]] = 0.1

In [14]:
# cleanlab expects every COLUMN of the noise matrix to sum to 1, so put each
# column's left-over probability mass on the diagonal (column sums: axis=0).
# For the symmetric matrix built above this equals the row-sum variant, but
# axis=0 stays correct if an asymmetric rate is ever added.
noise_matrix = noise_matrix + np.diag(1 - noise_matrix.sum(axis=0))

In [15]:
# Recompute the empirical class prior and validate the manually built noise
# matrix against it.
unique, counts = np.unique(y_given, return_counts=True)
py = counts / y_given.size
noise_matrix_is_valid(noise_matrix, py)

True

## Generate Noisy Labels

In [16]:
# generate_noisy_labels picks the examples to flip at random (it relies on
# NumPy's global random state), so seed it first: without a fixed seed every
# run of the notebook flips a different set of labels.
np.random.seed(0)
y_modified = generate_noisy_labels(y=y_given, noise_matrix=noise_matrix)

In [17]:
# Shape of the noisy label array.
y_modified.shape

(7329,)

In [18]:
# Positions at which the noisy label differs from the given label.
(d,) = np.nonzero(y_modified != y_given)

In [19]:
# Indices of the labels that were changed by the noise injection.
d

array([ 117,  168,  181,  210,  263,  496,  593,  630,  760,  805,  845,
        891,  894,  919,  921,  927,  929,  960,  972,  976,  990, 1092,
       1093, 1217, 1218, 1219, 1368, 1513, 1765, 1812, 1816, 1874, 2180,
       2184, 2395, 2429, 2433, 2539, 2789, 2885, 3114, 3134, 3136, 3154,
       3162, 3332, 3336, 3337, 3384, 3386, 3406, 3483, 3485, 3654, 3666,
       3677, 3778, 4288, 4406, 4446, 4447, 4455, 4549, 4594, 4595, 4616,
       4787, 5147, 5693, 5708, 5829, 5981, 6024, 6055, 6079, 6090, 6224,
       6509, 6537, 7190, 7237, 7276, 7281])

In [20]:
# Map every noisy label id back to its class name and count the results.
len(label_encoder.classes_[y_modified])

7329

In [21]:
# Walk both label arrays in lockstep and report each position whose label was
# altered by the noise injection.
for i, (given, modified) in enumerate(zip(y_given, y_modified)):
    if given != modified:
        print(f"[{i}] Changed from {given} to {modified}")

[117] Changed from 25 to 1
[168] Changed from 25 to 1
[181] Changed from 25 to 1
[210] Changed from 25 to 1
[263] Changed from 25 to 1
[496] Changed from 25 to 1
[593] Changed from 25 to 1
[630] Changed from 25 to 1
[760] Changed from 25 to 1
[805] Changed from 1 to 25
[845] Changed from 1 to 25
[891] Changed from 1 to 25
[894] Changed from 1 to 25
[919] Changed from 1 to 25
[921] Changed from 1 to 25
[927] Changed from 1 to 25
[929] Changed from 1 to 25
[960] Changed from 1 to 25
[972] Changed from 1 to 25
[976] Changed from 1 to 25
[990] Changed from 1 to 25
[1092] Changed from 25 to 1
[1093] Changed from 25 to 1
[1217] Changed from 25 to 1
[1218] Changed from 25 to 1
[1219] Changed from 25 to 1
[1368] Changed from 25 to 1
[1513] Changed from 25 to 1
[1765] Changed from 25 to 1
[1812] Changed from 25 to 1
[1816] Changed from 25 to 1
[1874] Changed from 25 to 1
[2180] Changed from 25 to 1
[2184] Changed from 25 to 1
[2395] Changed from 25 to 1
[2429] Changed from 25 to 1
[2433] Change