# Preprocessing the OttoProduct dataset

In [1]:
import os

import numpy as np

Path to data folder.

In [2]:
# Root directory of the experiment. It defaults to the original location; set the
# IMC_EXP_PATH environment variable to run the notebook against another copy of
# the experiment tree without editing this cell.
PATH_TO_EXP = os.environ.get(
    'IMC_EXP_PATH',
    '/cobrain/groups/ml_group/experiments/dustpelt/imc_exp/',
)
# Folder that holds the raw `train.csv` and receives the staged dataset.
PATH_DATA = os.path.join(PATH_TO_EXP, 'data/otto_product')

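Locate the raw training data (`train.csv`) of the Kaggle Otto Group Product Classification Challenge. The automatic download is commented out, so the file must already be present in the data folder.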

In [3]:
# Expected location of the raw Kaggle training file inside the data folder.
filename_raw_train = os.path.join(PATH_DATA, "train.csv")

# Automatic download is disabled: the attempt below did not download and unzip
# the data correctly, so `train.csv` has to be placed at `filename_raw_train`
# by hand. (Presumably the Kaggle URL needs an authenticated session -- TODO confirm.)
# if not os.path.exists(filename_raw_train):
#     !wget -O {filename_raw_train} -t inf \
#         https://www.kaggle.com/c/otto-group-product-classification-challenge/download/train.csv

The data is a plain CSV file with an `id` column, so we load it with `pandas` and use `id` as the index.

In [8]:
import pandas as pd

# Load the raw training table, using the `id` column as the row index.
ds = pd.read_csv(filename_raw_train, index_col=['id'])
# Preview the first rows: the feature columns plus the string `target` label.
ds.head()

Unnamed: 0_level_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
4,1,0,0,1,6,1,5,0,0,1,...,0,1,2,0,0,0,0,0,0,Class_1
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


Data info:
* number of classes: 9
* number of objects: 61878
* number of features: 93

In [23]:
y_str = ds['target'].values

# Targets are strings of the form 'Class_<k>'. The prefix is removed by slicing:
# `str.strip('Class_')` would treat its argument as a *set of characters* to trim
# from both ends, which yields the right answer only by accident.
label_prefix = 'Class_'

y = []
for _y in y_str:
    if not _y.startswith(label_prefix):
        raise ValueError("Unexpected target label: %r" % (_y,))
    y.append(int(_y[len(label_prefix):]))
y = np.array(y)

# Distinct class ids and the number of objects in each class.
classes, sizes = np.unique(y, return_counts=True)

In [24]:
# Feature matrix: every column except the label, in the original column order.
X_full = ds.loc[:, ds.columns != 'target'].values

In [26]:
# Number of objects to keep per class, and the seed that makes the draw repeatable.
n_elem = 1000
SUBSAMPLE_SEED = 0

# This cell overwrites `y` with the subsampled labels. Re-running it without
# first re-running the label-parsing cell would mask `X_full` with the wrong
# labels, so fail early with a clear message instead.
assert len(y) == len(X_full), \
    "`y` no longer matches `X_full`; re-run the label-parsing cell first."

# Every class must contain at least `n_elem` objects to draw from.
for n in sizes:
    assert n_elem <= n

# Local generator: does not disturb the global numpy random state.
rng = np.random.RandomState(SUBSAMPLE_SEED)

Xs_grouped = []
y_grouped = []
for cls, szs in zip(classes, sizes):
    X_group = X_full[y == cls]

    # Draw `n_elem` distinct rows of this class uniformly at random. The
    # permutation is actually applied here; previously it was computed but the
    # unshuffled indices were used, so the first `n_elem` rows were always taken.
    rnd_idxs = rng.permutation(szs)[:n_elem]

    Xs_grouped.append(X_group[rnd_idxs])
    y_grouped.append(np.full(n_elem, cls))

# Stack the per-class samples; rows stay grouped by class, in `classes` order.
X = np.concatenate(Xs_grouped)
y = np.concatenate(y_grouped)

In [27]:
# Expected shape of the subsampled dataset: `n_elem` objects for each class,
# each described by 93 features.
n_features = 93
n_objects = len(classes) * n_elem

assert len(y) == n_objects, """Unexpected dimensions."""
assert X.shape == (n_objects, n_features), """Unexpected dimensions."""

Create the target dataset for supervised clustering:
$$ R_{ij}
    = \begin{cases}
        +1 & \text{ if } y_i = y_j\,, \\
        -1 & \text{ otherwise.}
\end{cases}$$
We initialise $R$ with `+1` everywhere and then overwrite only the entries of the negative class with `-1`.

In [28]:
import tqdm

# Start from "every pair shares a class" (+1 everywhere), then go row by row
# and flip to -1 the columns whose label differs from the row's label.
R = np.ones(shape=(n_objects, n_objects))
for row, row_label in enumerate(tqdm.tqdm(y)):
    differs = y != row_label
    R[row, differs] = -1

100%|██████████| 9000/9000 [00:00<00:00, 26110.86it/s]


The row side-features matrix is a dense array at this point, so we convert it to a sparse matrix in COO format.

In [29]:
from scipy.sparse import coo_matrix
# Replace the row side-features `X` with its sparse COO representation
# (the name `X` is rebound, so the previous object is no longer reachable via `X`).
X = coo_matrix(X)

The column side-features are an identity matrix.

In [30]:
from scipy.sparse import dia_matrix

# Identity matrix of size n_objects: ones on the main diagonal (offset 0),
# assembled in DIA format and converted to CSR in one go.
unit_diagonal = np.ones(n_objects)
Y = dia_matrix((unit_diagonal, 0), shape=(n_objects, n_objects)).tocsr()

Save the dataset into a gzipped pickle.

In [31]:
import gzip
import pickle

# Destination of the staged dataset inside the data folder.
filename_staged = os.path.join(PATH_DATA, "staged_dataset.gz")

# Serialise the (row features, column features, target) triple into a gzip
# stream written with compression level 4.
staged_dataset = (X, Y, R)
with gzip.open(filename_staged, "wb+", compresslevel=4) as gz_file:
    pickle.dump(staged_dataset, gz_file)