In [2]:
import os
import numpy as np

def parseRDD(point):
    """ Parse one tab-separated line of the dataset into an (id, text) pair.
    Args:
        point (str): input line, with fields separated by tab characters
    Returns:
        tuple: (first field converted to int, third field as a string)
    """
    fields = point.split('\t')
    key = int(fields[0])
    text = fields[2]
    return (key, text)

def notempty(point):
    """ Tell whether the second element of a record has a non-zero length
    Args:
        point (tuple): record whose element at index 1 supports len()
    Returns:
        bool: True if point[1] is not empty
    """
    return len(point[1]) != 0

# Tab-separated reviews file; its first line is treated as a header row.
filename = os.path.join("/home/rafael/","MovieReviews2.tsv")
rawRDD = sc.textFile(filename,100)  # 100 = minimum number of partitions requested
header = rawRDD.take(1)[0]

# Pipeline: discard the header line, parse every remaining line into an
# (id, text) pair, then keep only the pairs whose text is non-empty.
# (Insert .sample(False, 0.1, seed=42) into the chain to work on a 10% subset.)
dataRDD = rawRDD.filter(lambda line: line != header).map(parseRDD).filter(notempty)

print ('Read {} lines'.format(dataRDD.count()))
print ('Sample line: {}'.format(dataRDD.takeSample(False, 1)[0]))

Read 8528 lines
Sample line: (98521, u"Maid in Manhattan might not look so appealing on third or fourth viewing down the road ... But as a high concept vehicle for two bright stars of the moment who can rise to fans ' lofty expectations , the movie passes inspection .")


In [3]:
import re

# Tokens are delimited by runs of non-word characters.
split_regex = r'\W+'

# The stopword file is read through Spark, one entry per line, and collected
# into a driver-side set used by tokenize() for membership tests.
stopfile = os.path.join("/home/rafael/","stopwords.txt")
stopwordLines = sc.textFile(stopfile).collect()
stopwords = set(stopwordLines)

def tokenize(string):
    """ Split a string into lowercase tokens, dropping stopwords and empty pieces
    Args:
        string (str): input string
    Returns:
        list: lowercase tokens, in order of appearance, without stopwords
    """
    tokens = []
    for piece in re.split(split_regex, string):
        lowered = piece.lower()
        # re.split can yield empty strings at the edges of the input; skip them.
        if lowered not in stopwords and len(piece) > 0:
            tokens.append(lowered)
    return tokens

# Tokenize the text (second element) of every (id, text) record.
# The result is cached because several later actions consume it (take,
# Word2Vec.fit, the word count and the vector lookup); without caching each
# of them would re-read the input file and re-run the parsing and tokenizing.
wordsRDD = dataRDD.map(lambda x: tokenize(x[1])).cache()

print (wordsRDD.take(1)[0])


[u'quiet', u'introspective', u'entertaining', u'independent', u'worth', u'seeking']


In [4]:
# Check: the first tokenized record must match the reference token list.
expectedTokens = [u'quiet', u'introspective', u'entertaining', u'independent', u'worth', u'seeking']
assert wordsRDD.take(1)[0] == expectedTokens, 'lista incorreta!'

In [5]:
from pyspark.mllib.feature import Word2Vec

# Train 5-dimensional word embeddings on the tokenized reviews with a fixed seed.
model = (Word2Vec()
         .setVectorSize(5)
         .setSeed(42)
         .fit(wordsRDD)
         )

# Show the embedding of one word and its two closest words.
print (model.transform(u'entertaining'))
print (list(model.findSynonyms(u'entertaining', 2)))

[0.013683137483894825,0.003714576829224825,-0.1357858031988144,0.04758540168404579,0.04148530960083008]
[(u'god', 0.9973623156547546), (u'shows', 0.9930634498596191)]


In [6]:
# Check: the embedding of 'entertaining' must match the reference vector
# (mean absolute difference below 1e-6) and its closest word must be 'god'.
expectedEntertaining = np.array([0.0136831374839,0.00371457682922,-0.135785803199,0.047585401684,0.0414853096008])
dist = np.abs(model.transform(u'entertaining')-expectedEntertaining).mean()
assert dist<1e-6, 'valores incorretos'
topSynonym = list(model.findSynonyms(u'entertaining', 1))[0]
assert topSynonym[0] == 'god', 'valores incorretos'

In [95]:
# Vocabulary: count every token across all reviews and keep the tokens that
# occur at least 5 times. The result is collected to the driver as a list.
uniqueWords = (wordsRDD
               .flatMap(lambda l:[(x,1) for x in l])
               .reduceByKey(lambda x,y:x+y)
               .filter(lambda x:x[1] >= 5)
               .map(lambda x:x[0])
               .collect()
               )

print ('{} tokens únicos'.format(len(uniqueWords)))

# Driver-side lookup table: token -> embedding, for the vocabulary only.
w2v = {w: model.transform(w) for w in uniqueWords}
w2vb = sc.broadcast(w2v)  # access as w2vb.value[w]
print ('Vetor entertaining: {}'.format( w2v[u'entertaining']))

# One matrix per review, with one row per in-vocabulary token.
# Membership is tested against the broadcast dict, whose keys are exactly the
# entries of uniqueWords: this is a hash lookup instead of a linear scan of
# the list, and it keeps the list out of the task closure.
vectorsRDD = (wordsRDD
              .map(lambda xs:np.matrix([w2vb.value[x] for x in xs if x in w2vb.value]))
             )
recs = vectorsRDD.take(2)
firstRec, secondRec = recs[0], recs[1]
print (firstRec.shape, secondRec.shape)

3388 tokens únicos
Vetor entertaining: [0.013683137483894825,0.003714576829224825,-0.1357858031988144,0.04758540168404579,0.04148530960083008]
((5, 5), (10, 5))


In [96]:
# Check: vocabulary size, the stored embedding of 'entertaining' and the
# shape of the second review's matrix must match the reference values.
expectedVector = [0.0136831374839,0.00371457682922,-0.135785803199,0.047585401684,0.0414853096008]
assert len(uniqueWords) == 3388,  'valor incorreto'
assert np.mean(np.abs(w2v[u'entertaining']-expectedVector))<1e-6,'valor incorreto'
assert secondRec.shape == (10,5)

In [107]:
from  pyspark.mllib.clustering import KMeans

# One row per vocabulary word, placed in a single partition.
# Row order follows the iteration order of the w2v dict, so the per-position
# cluster ids printed below depend on that order.
vectors2RDD = sc.parallelize(np.array(list(w2v.values())),1)
print ('Sample vector: {}'.format(vectors2RDD.take(1)))

# Group the word vectors into 200 clusters with a fixed seed.
modelK = KMeans.train(rdd=vectors2RDD, k=200, seed=42)

# Cluster index assigned to each word vector.
clustersRDD = vectors2RDD.map(lambda vec:modelK.predict(vec))
print ('10 first clusters allocation: {}'.format(clustersRDD.take(10)))

Sample vector: [array([ 0.01593647,  0.05436355, -0.0969881 ,  0.00835642,  0.06551303])]
10 first clusters allocation: [38, 81, 7, 149, 87, 43, 57, 17, 153, 85]


In [None]:
# Check: the first ten cluster assignments against the reference values.
# NOTE(review): the output recorded for the previous cell shows a different
# allocation ([38, 81, 7, ...]) and this cell carries no execution count, so
# this assertion presumably fails in the recorded environment - confirm which
# side is the intended reference.
expectedClusters = [142, 83, 42, 0, 87, 52, 190, 17, 56, 0]
assert clustersRDD.take(10)==expectedClusters, 'valor incorreto'

In [110]:
def quantizador(point, model, k, w2v):
    """ Turn one (key, text) record into a histogram of cluster assignments
    Args:
        point (tuple): record whose element 0 is the key and element 1 the text
        model: object whose predict(vector) returns a cluster index
        k (int): length of the returned feature vector
        w2v (dict): mapping from token to its vector
    Returns:
        tuple: (key, numpy array of length k counting, for each cluster index,
                how many tokens of the text present in w2v were assigned to it)
    """
    doc_id = point[0]
    # Only tokens that have a vector in w2v contribute to the histogram.
    known_vectors = np.array([w2v[tok] for tok in tokenize(point[1]) if tok in w2v])
    histogram = np.zeros(k)
    for vec in known_vectors:
        histogram[model.predict(vec)] += 1
    return (doc_id, histogram)
    
# Build the (key, cluster histogram) representation of every review.
# The word-vector table is read from the broadcast variable w2vb (same content
# as w2v) so the dict is not captured inside the task closure.
# NOTE(review): the feature length passed here is 500 while modelK was trained
# with k=200, so positions 200..499 can never be incremented - confirm whether
# 500 is intended before changing it.
quantRDD = dataRDD.map(lambda x: quantizador(x, modelK, 500, w2vb.value))

# Parenthesized form, consistent with the other print calls in this notebook;
# it prints the same text under Python 2 and is also valid Python 3.
print (quantRDD.take(1))

[(64, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 

In [111]:
# Check: the histogram of the first record must account for exactly 5 tokens.
firstFeatures = quantRDD.take(1)[0][1]
assert firstFeatures.sum() == 5, 'valores incorretos'