In [1]:
from scipy import sparse as sp
import numpy as np

def spcategorical(n_cat_ids):
    '''
    Returns a dummy matrix given an array of categorical variables.
    Parameters
    ----------
    n_cat_ids    : array
                   A 1d vector of the categorical labels for n observations.

    Returns
    --------
    dummy        : array
                   A sparse matrix of dummy (indicator/binary) variables for the
                   categorical data.  

    '''
    if np.squeeze(n_cat_ids).ndim == 1:
        cat_set = np.unique(n_cat_ids)
        n = len(n_cat_ids)
        row_map = dict((id, np.where(cat_set == id)[0]) for id in
                n_cat_ids)
        index = np.array([row_map[row] for row in n_cat_ids]).flatten()
        indptr = np.arange(n+1, dtype=int) 
        return sp.csr_matrix((np.ones(n), index, indptr))
    else:
        raise IndexError("The index %s is not understood" % col)

def spcategorical2(n_cat_ids):
    '''
    Returns a dummy matrix given an array of categorical variables.
    Parameters
    ----------
    n_cat_ids    : array
                   A 1d vector of the categorical labels for n observations.

    Returns
    --------
    dummy        : array
                   A sparse matrix of dummy (indicator/binary) variables for the
                   categorical data.  

    '''
    if np.squeeze(n_cat_ids).ndim == 1:
        cat_set = np.unique(n_cat_ids)
        n = len(n_cat_ids)
        index = [np.where(cat_set == id)[0].tolist()[0] for id in n_cat_ids]
        indptr = np.arange(n+1, dtype=int) 
        return sp.csr_matrix((np.ones(n), index, indptr))
    else:
        raise IndexError("The index %s is not understood" % col)



In [27]:
#x = np.random.randint(1,10, 25)
x = ['a', 'b', 'a', 'c', 'a', 'a', 'b', 'a', 'c', 'a']

In [28]:
%timeit a = spcategorical(x)

10000 loops, best of 3: 107 µs per loop


In [29]:
%timeit b = spcategorical2(x)

10000 loops, best of 3: 98.9 µs per loop


In [12]:
np.allclose(a.toarray(), b.toarray())

True

In [13]:
a

<5x3 sparse matrix of type '<type 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [14]:
b

<5x3 sparse matrix of type '<type 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>