In [None]:
# bzip2 -d covtype.scale01.bz2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
data_path = '/home/dustpelt/Desktop/imc_exp_data/covtype/'
data = pd.read_csv(data_path + 'covtype.scale01', sep = ' ', header=None)
data.drop(columns=[data.columns[-1]], inplace=True)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,5,1:0.368684,2:0.141667,3:0.0454545,4:0.184681,5:0.223514,6:0.0716594,7:0.870079,8:0.913386,9:0.582677,10:0.875366,11:1,43:1
1,5,1:0.365683,2:0.155556,3:0.030303,4:0.151754,5:0.215762,6:0.0547984,7:0.866142,8:0.925197,9:0.594488,10:0.867838,11:1,43:1
2,2,1:0.472736,2:0.386111,3:0.136364,4:0.19184,5:0.307494,6:0.446817,7:0.92126,8:0.937008,9:0.531496,10:0.853339,11:1,26:1
3,2,1:0.463232,2:0.430556,3:0.272727,4:0.173228,5:0.375969,6:0.434172,7:0.937008,8:0.937008,9:0.480315,10:0.865886,11:1,44:1
4,5,1:0.368184,2:0.125,3:0.030303,4:0.10952,5:0.222222,6:0.0549389,7:0.866142,8:0.92126,9:0.590551,10:0.860449,11:1,43:1


In [4]:
def pars_elem(obj):
    if obj is None:
        return None
    elif type(obj) is float:
        if np.isnan(obj):
            return None
        else:
            assert False
    else:
        for i in range(len(obj)):
            if obj[i] == ':':
                return i

In [5]:
# initial data
data_arr = data.as_matrix()

features_arr = data_arr[:,1:]
last_ftrs = data_arr[:,-1]

atr_arr = data_arr[:,0].astype(dtype='int32')

In [6]:
# deleting string with nan(s)

In [7]:
# subsampling
uniq_atr, uniq_counts = np.unique(atr_arr, return_counts=True)

part_of_elem = 0.4
num_of_elem = int(np.min(uniq_counts) * part_of_elem)

data_by_atribs = []
for atr in uniq_atr:
    atr_data = data_arr[data_arr[:,0] == atr,:]
    data_by_atribs.append(atr_data)
data_by_atribs = np.array(data_by_atribs)

subsampled_data = np.zeros((1,data_arr.shape[1]))
for atr_data in data_by_atribs:
    ids_to_subsmpl = np.unique(np.random.choice(len(atr_data), size=num_of_elem))
    subsampled_data = np.concatenate((subsampled_data, atr_data[ids_to_subsmpl,:]), axis=0)
    
subsampled_data = np.delete(subsampled_data, 0, axis=0)

In [8]:
# subsampled data
data_arr = subsampled_data

features_arr = data_arr[:,1:]
last_ftrs = data_arr[:,-1]

atr_arr = data_arr[:,0].astype(dtype='int32')

In [9]:
# find the maximal number of feature
max_ftr = 0
for elem in last_ftrs:
    div_idx = pars_elem(elem)
    if not div_idx is None:
        ftr_idx = int(elem[:div_idx])
        if ftr_idx >= max_ftr:
            max_ftr = ftr_idx

In [10]:
# creating matrix of features
prep_features_arr = np.zeros((features_arr.shape[0], max_ftr), dtype='float32')

for i in range(features_arr.shape[0]):
    for j in range(features_arr.shape[1]):
        elem = features_arr[i,j]
        div_idx = pars_elem(elem)
        if not div_idx is None:
            ftr_idx = int(elem[:div_idx])
            ftr_val = float(elem[div_idx+1:])
            prep_features_arr[i, ftr_idx-1] = ftr_val
            
X = prep_features_arr
R = np.zeros((len(data_arr), len(data_arr)))

# creating matrix for semi-supervised clustering
for i in range(len(atr_arr)):
    for j in range(len(atr_arr)):
        if atr_arr[i] == atr_arr[j]: R[i,j] = 1
        else: R[i,j] = -1

In [11]:
R.shape

(7370, 7370)

In [12]:
X = np.array(X, dtype='float64')

np.save(data_path + 'R.npy', R)
np.save(data_path + 'X.npy', X)