# Preprocessing the CovType dataset

In [1]:
import os

import numpy as np

Path to data folder.

In [2]:
# Root of the experiment directory. Defaults to the original cluster location;
# set the IMC_EXP_PATH environment variable to run the notebook elsewhere.
PATH_TO_EXP = os.environ.get(
    'IMC_EXP_PATH', '/cobrain/groups/ml_group/experiments/dustpelt/imc_exp/')

# Folder that holds the CovType files used by this notebook.
PATH_DATA = os.path.join(PATH_TO_EXP, 'data/covtype')

Download the `covtype` dataset (`covtype.scale01`) from the `libsvm` dataset collection.

In [10]:
filename_zip_train = os.path.join(PATH_DATA, "dataset_train.bz2")
filename_unzip_train = filename_zip_train.strip('.bz2')
filename_raw_train = filename_unzip_train + '.libsvm'

if not os.path.exists(filename_zip_train):
    !wget -O {filename_zip_train} -t inf \
        https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/covtype.scale01.bz2
            
!bzip2 -d {filename_zip_train}
!mv {filename_unzip_train} {filename_raw_train}

The data is in `libsvm` input file format, therefore we use `sklearn`'s interface.

In [18]:
from sklearn.datasets import load_svmlight_file

# Parse the libsvm-format file into a sparse feature matrix and a label vector.
X_train, y_train = load_svmlight_file(
    filename_raw_train,
    dtype=np.float64,
    query_id=False,
)

# Dense copy of the features.
X_full = X_train.toarray()

# Distinct class labels together with the number of objects in each class.
classes, sizes = np.unique(y_train, return_counts=True)

Data info:
* number of classes: 7
* number of data points: 581012
* number of features: 54

In [25]:
# Number of objects to keep from every class.
n_elem = 1000
assert all(n_elem <= n for n in sizes), "Every class needs at least `n_elem` objects."

# Local, seeded generator so that the subsample is reproducible across runs.
rng = np.random.RandomState(0)

Xs_grouped = []
y = []
for cls, szs in zip(classes, sizes):
    X_group = X_full[y_train == cls]

    # Draw `n_elem` distinct rows of this class uniformly at random.
    # (Previously the permutation was computed but never used, so the first
    # `n_elem` rows of every class were always taken.)
    rnd_idxs = rng.permutation(szs)[:n_elem]

    Xs_grouped.append(X_group[rnd_idxs])
    y.append(np.full(n_elem, cls, dtype=y_train.dtype))

# Stack the per-class samples: classes appear in the order of `classes`,
# `n_elem` consecutive rows each.
X = np.concatenate(Xs_grouped, axis=0)
y = np.concatenate(y)

In [26]:
# Expected size of the subsample: `n_elem` objects per class, 54 features each.
n_features = 54
n_objects = n_elem * len(classes)

# Sanity-check the label vector and the feature matrix against those sizes.
assert len(y) == n_objects, "Unexpected dimensions."
assert X.shape == (n_objects, n_features), "Unexpected dimensions."

Create the target dataset for supervised clustering:
$$ R_{ij}
    = \begin{cases}
        +1 & \text{ if } y_i = y_j\,, \\
        -1 & \text{ otherwise.}
\end{cases}$$
`R` is initialised with `+1` everywhere, so only the negative entries (`-1`) need to be filled in.

In [27]:
import tqdm

# Start with +1 everywhere, then, row by row, set the entries of all objects
# whose label differs from the row's label to -1.
R = np.ones((n_objects, n_objects))
for row, label in enumerate(tqdm.tqdm(y)):
    differs = y != label
    R[row, differs] = -1

100%|██████████| 7000/7000 [00:00<00:00, 36346.06it/s]


The row side-features matrix is dense at this point; convert it to a sparse matrix in COO format.

In [29]:
from scipy.sparse import coo_matrix

# Rebind `X` to a sparse COO representation of the same feature matrix.
X = coo_matrix(X)

The column side-features are an identity matrix.

In [30]:
from scipy.sparse import dia_matrix

# Identity matrix of size `n_objects`: a single main diagonal (offset 0) of
# ones, built in DIA format and then converted to CSR.
diag_ones = np.ones(n_objects)
Y = dia_matrix((diag_ones, 0), shape=(n_objects, n_objects)).tocsr()

Save the dataset into a gzipped pickle.

In [31]:
import gzip
import pickle

filename_staged = os.path.join(PATH_DATA, "staged_dataset.gz")

# Bundle the row features, column features and target matrix into one tuple
# and pickle it through gzip at compression level 4.
staged = (X, Y, R)
with gzip.open(filename_staged, mode="wb+", compresslevel=4) as fout:
    pickle.dump(staged, fout)