In [1]:
import utils

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics.pairwise import rbf_kernel

In [2]:
# Load data. Every column is read as a string (dtype=str); the first CSV
# column becomes the index.
df_data = pd.read_csv('../data/ice-cat-office-products.csv.gz', dtype=str, index_col=0)

# Filter out small product categories: keep only rows whose category is among
# those returned by utils.find_top_n_categories (top_n=10), ordered by
# category name.
# The filter and the sort are chained into a single new frame instead of
# calling sort_values(inplace=True) on the filtered result, which mutates a
# frame derived from a boolean-mask selection and can emit
# SettingWithCopyWarning.
top_categories = utils.find_top_n_categories(df_data, top_n=10)
df_data = (
    df_data[df_data.category_name.isin(top_categories)]
    .sort_values('category_name')
)

# Remove attributes with very few records
df_cleaned_data = utils.filter_columns(df_data)

In [3]:
# Apply dtype for each column
# NOTE(review): this rebinds df_cleaned_data to its own converted version, so
# re-running this cell without re-running the loading cell converts the frame
# a second time.
df_cleaned_data = utils.detect_and_fix_column_types(df_cleaned_data)

# Determine categorical and numerical features.
# Comparing dtypes against the Python builtins (`dt == int`, `dt == float`)
# only matches the single dtype NumPy maps them to, which is platform
# dependent (e.g. int32 on Windows with NumPy < 2) and ignores other widths
# such as int32/float32. The pandas dtype predicates match every integer and
# float dtype, so no numeric column is silently left out of the transformer.
# Integer columns are still listed before float columns, as before.
dt = df_cleaned_data.dtypes
integer_features = [
    column for column, column_dtype in dt.items()
    if pd.api.types.is_integer_dtype(column_dtype)
]
float_features = [
    column for column, column_dtype in dt.items()
    if pd.api.types.is_float_dtype(column_dtype)
]
numeric_features = integer_features + float_features
categorical_features = list(dt[dt == "category"].index)

# Create data preprocessor: numeric columns are standardised, categorical
# columns are one-hot encoded (categories unseen at fit time are ignored at
# transform time). Columns in neither list are dropped, because
# ColumnTransformer is left at its default remainder='drop'.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess data: fit the transformers on the full table and encode it.
X = preprocessor.fit_transform(df_cleaned_data)

In [4]:
# Sanity check: (number of products, number of encoded feature columns).
X.shape

(3075, 875)

In [5]:
# Compute the RBF kernel between every pair of rows of X (gamma left at its
# default).
# NOTE(review): despite the variable name, rbf_kernel returns similarities,
# not distances: identical rows score 1.0 and the value decreases as rows get
# farther apart. Larger means closer.
pairwise_distances = rbf_kernel(X)

In [6]:
# Sanity check: the kernel matrix should be square, one row/column per product.
pairwise_distances.shape

(3075, 3075)

In [7]:
# Preview the kernel matrix (NumPy abbreviates the repr of large arrays).
pairwise_distances

array([[1.        , 0.95387048, 0.98184264, ..., 0.94631572, 0.95357669,
        0.97144086],
       [0.95387048, 1.        , 0.97156239, ..., 0.9616567 , 0.95509077,
        0.95676042],
       [0.98184264, 0.97156239, 1.        , ..., 0.95938595, 0.96672868,
        0.98481732],
       ...,
       [0.94631572, 0.9616567 , 0.95938595, ..., 1.        , 0.98882883,
        0.96663075],
       [0.95357669, 0.95509077, 0.96672868, ..., 0.98882883, 1.        ,
        0.97723842],
       [0.97144086, 0.95676042, 0.98481732, ..., 0.96663075, 0.97723842,
        1.        ]])

In [8]:
# Position of the smallest kernel value in row 0.
# NOTE(review): the matrix comes from rbf_kernel, so the smallest value marks
# the product LEAST similar to product 0. If the nearest neighbour was
# intended, use the largest off-diagonal value instead -- confirm intent.
pairwise_distances[0].argmin()

2454

In [9]:
# Average kernel value between product 0 and all products (itself included).
pairwise_distances[0].mean()

0.9431649230528735

In [10]:
# Copy of the pre-encoding table with a fresh 0..n-1 index, so that label i
# refers to the same product as row i of X and of the kernel matrix (X was
# produced from df_cleaned_data in the same row order).
df_X_before_transform = df_cleaned_data.reset_index(drop=True)

In [11]:
# Preview the re-indexed, pre-encoding table (pandas truncates the display).
df_X_before_transform

Unnamed: 0,supplier_name,AC input frequency,AC input voltage,Adhesive type,Answering machine,Anti-scratch coating,Auto document feeder (ADF) input capacity,Auto power off,Auto power off after,Auto reduction,...,Type,USB port,Vertical resolution fine (lines/mm),Vertical resolution standard,VESA mounting,Volume,Weight,Weight (without accessories),Width,Zoom capability
0,Sharp,(N/A),(N/A),(N/A),False,False,0,True,0,False,...,Financial,False,(N/A),(N/A),False,0.0,0.00,0.0,0.0,(N/A)
1,Sharp,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,Printing,False,(N/A),(N/A),False,0.0,997.70,0.0,0.0,(N/A)
2,Sharp,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,Printing,False,(N/A),(N/A),False,0.0,1.32,0.0,0.0,(N/A)
3,Sharp,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,Basic,False,(N/A),(N/A),False,0.0,65.00,0.0,0.0,(N/A)
4,Sharp,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,Basic,False,(N/A),(N/A),False,0.0,81.00,0.0,87.0,(N/A)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070,Leitz,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,(N/A),False,(N/A),(N/A),False,0.0,540.00,0.0,240.0,(N/A)
3071,Leitz,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,(N/A),False,(N/A),(N/A),False,0.0,540.00,0.0,230.0,(N/A)
3072,Leitz,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,(N/A),False,(N/A),(N/A),False,0.0,540.00,0.0,230.0,(N/A)
3073,Leitz,(N/A),(N/A),(N/A),False,False,0,False,0,False,...,(N/A),False,(N/A),(N/A),False,0.0,290.00,0.0,181.0,(N/A)


In [12]:
# Inspect the raw attributes of the product at index label 1260.
# NOTE(review): 1260 is hard-coded and does not match the argmin result shown
# earlier in this notebook; presumably it came from a separate exploration --
# TODO confirm or derive it from a computed index.
df_X_before_transform.loc[1260]

supplier_name                   Leitz
AC input frequency              (N/A)
AC input voltage                (N/A)
Adhesive type                   (N/A)
Answering machine               False
                                ...  
Volume                            0.0
Weight                          540.0
Weight (without accessories)      0.0
Width                           240.0
Zoom capability                 (N/A)
Name: 1260, Length: 255, dtype: object