In [1]:
sampleOne = [(0, 'mouse'), (1, 'black')]
sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
sampleThree =  [(0, 'bear'), (1, 'black'), (2, 'salmon')]
sampleDataRDD = sc.parallelize([sampleOne, sampleTwo, sampleThree])

In [2]:
sampleOHEDictManual = {}
sampleOHEDictManual[sampleOne[0]] = 0
sampleOHEDictManual[sampleOne[1]] = 1
sampleOHEDictManual[sampleTwo[0]] = 2
sampleOHEDictManual[sampleTwo[1]] = 3
sampleOHEDictManual[sampleTwo[2]] = 4
sampleOHEDictManual[sampleThree[0]] = 5
sampleOHEDictManual[sampleThree[1]] = 6
sampleOHEDictManual[sampleThree[2]] = 7

In [3]:
assert (0, 'mouse') in sampleOHEDictManual, "(0, 'mouse') not in sampleOHEDictManual"
assert (0, 'cat') in sampleOHEDictManual, "(0, 'cat') not in sampleOHEDictManual"
assert (0, 'bear') in sampleOHEDictManual, "(0, 'bear') not in sampleOHEDictManual"
assert (1, 'black') in sampleOHEDictManual, "(1, 'black') not in sampleOHEDictManual"
assert (1, 'tabby') in sampleOHEDictManual, "(1, 'tabby') not in sampleOHEDictManual"
assert (2, 'mouse') in sampleOHEDictManual, "(2, 'mouse') not in sampleOHEDictManual"
assert (2, 'salmon') in sampleOHEDictManual, "(2, 'salmon') not in sampleOHEDictManual"

In [4]:
import numpy as np
from pyspark.mllib.linalg import SparseVector

In [5]:
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(aDense.size, aDense.argpartition(0), aDense)

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(bDense.size, bDense.argpartition(0), bDense)

w = np.array([0.4, 3.1, -1.4, -.5])
print aDense.dot(w)
print aSparse.dot(w)
print bDense.dot(w)
print bSparse.dot(w)

7.300000000000001
7.300000000000001
-0.5
-0.5


In [6]:
assert isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector'
assert isinstance(bSparse, SparseVector), 'aSparse needs to be an instance of SparseVector'
assert aDense.dot(w) == aSparse.dot(w), 'dot product of aDense and w should equal dot product of aSparse and w'
assert bDense.dot(w) == bSparse.dot(w), 'dot product of bDense and w should equal dot product of bSparse and w'

In [7]:
sampleOneOHEFeatManual = SparseVector(len(sampleOHEDictManual), [sampleOHEDictManual.get(x) for x in sampleOne if x in sampleOHEDictManual], [1.0 for x in sampleOne if x in sampleOHEDictManual])
sampleTwoOHEFeatManual = SparseVector(len(sampleOHEDictManual), [sampleOHEDictManual.get(x) for x in sampleTwo if x in sampleOHEDictManual], [1.0 for x in sampleTwo if x in sampleOHEDictManual])
sampleThreeOHEFeatManual = SparseVector(len(sampleOHEDictManual), [sampleOHEDictManual.get(x) for x in sampleThree if x in sampleOHEDictManual], [1.0 for x in sampleThree if x in sampleOHEDictManual])

In [8]:
assert isinstance(sampleOneOHEFeatManual, SparseVector), 'sampleOneOHEFeatManual needs to be a SparseVector'
assert isinstance(sampleTwoOHEFeatManual, SparseVector), 'sampleTwoOHEFeatManual needs to be a SparseVector'
assert isinstance(sampleThreeOHEFeatManual, SparseVector), 'sampleThreeOHEFeatManual needs to be a SparseVector'

In [9]:
def oneHotEncoding(rawFeats, OHEDict, numOHEFeats):
    """Produce a one-hot-encoding from a list of features and an OHE dictionary.

    Note:
        You should ensure that the indices used to create a SparseVector are sorted.

    Args:
        rawFeats (list of (int, str)): The features corresponding to a single observation.  Each
            feature consists of a tuple of featureID and the feature's value. (e.g. sampleOne)
        OHEDict (dict): A mapping of (featureID, value) to unique integer.
        numOHEFeats (int): The total number of unique OHE features (combinations of featureID and
            value).

    Returns:
        SparseVector: A SparseVector of length numOHEFeats with indicies equal to the unique
            identifiers for the (featureID, value) combinations that occur in the observation and
            with values equal to 1.0.
    """
    return SparseVector(numOHEFeats, [(OHEDict[x], 1) for x in rawFeats if x in OHEDict])

# Calculate the number of features in sampleOHEDictManual
numSampleOHEFeats = len(sampleOHEDictManual)

# Run oneHotEnoding on sampleOne
sampleOneOHEFeat = oneHotEncoding(sampleOne, sampleOHEDictManual, numSampleOHEFeats)

print (sampleOneOHEFeat)

(7,[0,6],[1.0,1.0])


In [10]:
assert sampleOneOHEFeat == sampleOneOHEFeatManual, 'sampleOneOHEFeat should equal sampleOneOHEFeatManual'

In [13]:
sampleOHEData = sampleDataRDD.map(lambda x:oneHotEncoding(x, sampleOHEDictManual, len(sampleOHEDictManual)))
print sampleOHEData.collect()

[SparseVector(7, {0: 1.0, 6: 1.0}), SparseVector(7, {2: 1.0, 3: 1.0, 4: 1.0}), SparseVector(7, {5: 1.0, 6: 1.0, 7: 1.0})]


In [15]:
sampleOHEDataValues = sampleOHEData.collect()
assert len(sampleOHEDataValues) == 3, 'sampleOHEData should have three elements'

In [16]:
sampleDistinctFeats = (sampleDataRDD
                       .flatMap(lambda x:x)
                       .distinct()
                      )

In [18]:
assert sorted(sampleDistinctFeats.collect()) == [(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'),(1, 'tabby'), (2, 'mouse'), (2, 'salmon')], 'incorrect value for sampleDistinctFeats'

In [19]:
sampleOHEDict = (sampleDistinctFeats
                .zipWithIndex()
                .collectAsMap())
print(sampleOHEDict)

{(2, 'mouse'): 2, (0, 'cat'): 5, (0, 'bear'): 1, (2, 'salmon'): 4, (1, 'tabby'): 3, (1, 'black'): 6, (0, 'mouse'): 0}


In [20]:
assert sorted(sampleOHEDict.keys()) == [(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'),(1, 'tabby'), (2, 'mouse'), (2, 'salmon')], 'sampleOHEDict has unexpected keys'
assert sorted(sampleOHEDict.values()) == list(range(7)), 'sampleOHEDict has unexpected values'

In [21]:
def createOneHotDict(inputData):
    """Creates a one-hot-encoder dictionary based on the input data.

    Args:
        inputData (RDD of lists of (int, str)): An RDD of observations where each observation is
            made up of a list of (featureID, value) tuples.

    Returns:
        dict: A dictionary where the keys are (featureID, value) tuples and map to values that are
            unique integers.
    """
    return (inputData
             .flatMap(lambda x:x)
             .distinct()
             .zipWithIndex()
             .collectAsMap()
             )

sampleOHEDictAuto = createOneHotDict(sampleDataRDD)
print (sampleOHEDictAuto)

{(2, 'mouse'): 2, (0, 'cat'): 5, (0, 'bear'): 1, (2, 'salmon'): 4, (1, 'tabby'): 3, (1, 'black'): 6, (0, 'mouse'): 0}


In [22]:
assert sorted(sampleOHEDictAuto.keys()) == [(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'), (1, 'tabby'), (2, 'mouse'), (2, 'salmon')], 'sampleOHEDictAuto has unexpected keys'
assert sorted(sampleOHEDictAuto.values()) == list(range(7)), 'sampleOHEDictAuto has unexpected values'