In [4]:
# SPARK_HOME="/Users/ssv/spark-2.2.0-bin-hadoop2.7"
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='python3'
# os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["SPARK_HOME"]='/Users/ssv/spark-2.2.0-bin-hadoop2.7'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.4-src.zip'))
os.environ["PYSPARK_PYTHON"] = 'python3'
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.0
      /_/

Using Python version 3.6.2 (default, Sep 21 2017 18:29:43)
SparkSession available as 'spark'.


In [5]:
sc

## Vectors, Matrices

https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.linalg

https://spark.apache.org/docs/latest/mllib-data-types.html

** Local Vectors **: 
- stored on a single machine
- integer-typed and 0-based indices
- double-typed values

** dense **: 
- backed by a double array representing its entry values

** sparse **
- backed by two parallel arrays: indices and values


** Example **
- vector (1.0, 0.0, 3.0)
- [1.0, 0.0, 3.0] in dense format
- (3, [0, 2], [1.0, 3.0]) in sparse format 

### dense vectors

In [6]:
from pyspark.mllib.linalg import Vectors, Matrices

In [7]:
v = Vectors.dense([1.0, 2.0])
u = Vectors.dense([3.0, 4.0])

In [8]:
u

DenseVector([3.0, 4.0])

In [9]:
v.dot(u)

11.0

In [10]:
v.toArray()

array([ 1.,  2.])

### sparse vectors

In [8]:
vs = Vectors.sparse(10, [1,5], [12.0, -7])

In [9]:
vs

SparseVector(10, {1: 12.0, 5: -7.0})

In [10]:
us = Vectors.sparse(10, [2,5], [5, 3])
us

SparseVector(10, {2: 5.0, 5: 3.0})

In [11]:
vs.dot(us)

-21.0

In [12]:
vv = Vectors.dense([1,2,3,4,5,6,7,8,9,10])

In [13]:
vs.dot(vv)

-18.0

In [14]:
vs.toArray()

array([  0.,  12.,   0.,   0.,   0.,  -7.,   0.,   0.,   0.,   0.])

** more on vectors **

In [15]:
Vectors.parse("[1,5,3,2,0,0,7]")

DenseVector([1.0, 5.0, 3.0, 2.0, 0.0, 0.0, 7.0])

In [16]:
Vectors.parse("(100, [1,5,25], [1,2,2])")

SparseVector(100, {1: 1.0, 5: 2.0, 25: 2.0})

In [17]:
v.norm(2)

2.2360679774997898

In [18]:
v.squared_distance(u)

8.0

## matrices

In [19]:
m = Matrices.dense(2, 2, range(4))
m

DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)

In [20]:
m.toArray()

array([[ 0.,  2.],
       [ 1.,  3.]])

In [21]:
m.values

array([ 0.,  1.,  2.,  3.])

** [1.4] added sparse matrices ** https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseMatrix

** [1.5] added distributed matrices ** https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.linalg.distributed

In [22]:
m.toSparse()

SparseMatrix(2, 2, [0, 1, 3], [1, 0, 1], [1.0, 2.0, 3.0], False)

In [23]:
ms = Matrices.dense(2,3,[1.0,2.0,0.0,0.0,3.0,0.0])
ms

DenseMatrix(2, 3, [1.0, 2.0, 0.0, 0.0, 3.0, 0.0], False)

In [24]:
ms.toArray()

array([[ 1.,  0.,  3.],
       [ 2.,  0.,  0.]])

CSC format 

http://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html

http://www.scipy-lectures.org/advanced/scipy_sparse/csc_matrix.html


In [25]:
Matrices.sparse(2, 3, [0,2,2,3], [0,1,0], [1.0,2.0,3.0]).toArray()

array([[ 1.,  0.,  3.],
       [ 2.,  0.,  0.]])

In [26]:
Matrices.sparse(4, 4, [0,1,2,3,4], [0, 1, 2, 3], [1,2,3,4]).toArray()

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  2.,  0.,  0.],
       [ 0.,  0.,  3.,  0.],
       [ 0.,  0.,  0.,  4.]])

In [27]:
Matrices.sparse(4, 4, [0,2,2,3,4], [0, 1, 0, 3], [1,2,3,4]).toArray()

array([[ 1.,  0.,  3.,  0.],
       [ 2.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  4.]])

In [28]:
Matrices.sparse(3,3, [0,1,2,4], [2,1,2,0], [3.0,2.0,2.0,12.0]).toArray()

array([[  0.,   0.,  12.],
       [  0.,   2.,   0.],
       [  3.,   0.,   2.]])

## TF\*IDF

https://spark.apache.org/docs/latest/mllib-feature-extraction.html

https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.feature

HashingTF takes the hash code of each word modulo a desired vector size, S, and thus maps each word to a number between 0 and S–1. This always yields an S-dimensional vector, and in practice is quite robust even if multiple
words map to the same hash code. The MLlib developers recommend setting S between 2^18 and 2^20

** Toy example **

In [29]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

In [30]:
tf = HashingTF(100)
doc1 = "dog dog cat dog bird cat"
doc1_words = doc1.split()

In [31]:
doc1_words

['dog', 'dog', 'cat', 'dog', 'bird', 'cat']

In [32]:
doc2 = "dog ate cat"
doc2_words = doc2.split()

In [33]:
doc2_words

['dog', 'ate', 'cat']

In [34]:
doc_tf = tf.transform(sc.parallelize([doc1_words, doc2_words]))

In [35]:
doc_tf.collect()

[SparseVector(100, {7: 3.0, 56: 1.0, 84: 2.0}),
 SparseVector(100, {7: 1.0, 75: 1.0, 84: 1.0})]

In [36]:
idf = IDF()
model = idf.fit(doc_tf)

In [37]:
tfidf = model.transform(doc_tf)

In [38]:
tfidf.collect()

[SparseVector(100, {7: 0.0, 56: 0.4055, 84: 0.0}),
 SparseVector(100, {7: 0.0, 75: 0.4055, 84: 0.0})]

** More real-life example **

In [39]:
doc1 = """
An information retrieval process begins when a user enters a query into the system. Queries are formal statements of information needs, for example search strings in web search engines. In information retrieval a query does not uniquely identify a single object in the collection. Instead, several objects may match the query, perhaps with different degrees of relevancy.

An object is an entity that is represented by information in a database. User queries are matched against the database information. Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos. Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.

Most IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value. The top ranking objects are then shown to the user. The process may then be iterated if the user wishes to refine the query.[4]
"""

doc2 = """
The idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.[5] The first automated information retrieval systems were introduced in the 1950s and 1960s. By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.

In 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program. The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection. This catalyzed research on methods that scale to huge corpora. The introduction of web search engines has boosted the need for very large scale retrieval systems even further.
"""

doc3 = """
Information retrieval is a wide, often loosely-defined term but in these pages I shall be concerned only with automatic information retrieval systems. Automatic as opposed to manual and information as opposed to data or fact. Unfortunately the word information can be very misleading. In the context of information retrieval (IR), information, in the technical meaning given in Shannon's theory of communication, is not readily measured (Shannon and Weaver[1]). In fact, in many cases one can adequately describe the kind of retrieval by simply substituting 'document' for 'information'. Nevertheless, 'information retrieval' has become accepted as a description of the kind of work published by Cleverdon, Salton, Sparck Jones, Lancaster and others. A perfectly straightforward definition along these lines is given by Lancaster[2]: 'Information retrieval is the term conventionally, though somewhat inaccurately, applied to the type of activity discussed in this volume. An information retrieval system does not inform (i.e. change the knowledge of) the user on the subject of his inquiry. It merely informs on the existence (or non-existence) and whereabouts of documents relating to his request.' This specifically excludes Question-Answering systems as typified by Winograd[3] and those described by Minsky[4]]. It also excludes data retrieval systems such as used by, say, the stock exchange for on-line quotations.
"""

In [40]:
docs = sc.parallelize([doc1, doc2, doc3])
docs.collect()

['\nAn information retrieval process begins when a user enters a query into the system. Queries are formal statements of information needs, for example search strings in web search engines. In information retrieval a query does not uniquely identify a single object in the collection. Instead, several objects may match the query, perhaps with different degrees of relevancy.\n\nAn object is an entity that is represented by information in a database. User queries are matched against the database information. Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos. Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.\n\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value. The top ranking objects are then shown to the user. Th

In [41]:
bag_of_words = docs.map(lambda l: l.split(" "))
bag_of_words.collect()

[['\nAn',
  'information',
  'retrieval',
  'process',
  'begins',
  'when',
  'a',
  'user',
  'enters',
  'a',
  'query',
  'into',
  'the',
  'system.',
  'Queries',
  'are',
  'formal',
  'statements',
  'of',
  'information',
  'needs,',
  'for',
  'example',
  'search',
  'strings',
  'in',
  'web',
  'search',
  'engines.',
  'In',
  'information',
  'retrieval',
  'a',
  'query',
  'does',
  'not',
  'uniquely',
  'identify',
  'a',
  'single',
  'object',
  'in',
  'the',
  'collection.',
  'Instead,',
  'several',
  'objects',
  'may',
  'match',
  'the',
  'query,',
  'perhaps',
  'with',
  'different',
  'degrees',
  'of',
  'relevancy.\n\nAn',
  'object',
  'is',
  'an',
  'entity',
  'that',
  'is',
  'represented',
  'by',
  'information',
  'in',
  'a',
  'database.',
  'User',
  'queries',
  'are',
  'matched',
  'against',
  'the',
  'database',
  'information.',
  'Depending',
  'on',
  'the',
  'application',
  'the',
  'data',
  'objects',
  'may',
  'be,',
  'for'

In [42]:
tfs = tf.transform(bag_of_words)
tfs.collect()

[SparseVector(100, {0: 3.0, 1: 2.0, 2: 1.0, 3: 6.0, 4: 2.0, 6: 1.0, 7: 1.0, 10: 2.0, 11: 1.0, 12: 2.0, 15: 1.0, 16: 1.0, 17: 15.0, 18: 1.0, 19: 3.0, 20: 5.0, 21: 1.0, 24: 7.0, 25: 1.0, 26: 3.0, 27: 4.0, 30: 1.0, 32: 1.0, 33: 1.0, 34: 2.0, 35: 1.0, 37: 2.0, 38: 1.0, 39: 1.0, 40: 1.0, 41: 4.0, 42: 1.0, 46: 1.0, 51: 2.0, 52: 7.0, 54: 1.0, 56: 2.0, 57: 8.0, 58: 1.0, 60: 1.0, 61: 1.0, 62: 3.0, 63: 1.0, 64: 2.0, 65: 4.0, 66: 1.0, 68: 1.0, 69: 4.0, 70: 3.0, 71: 1.0, 72: 1.0, 73: 1.0, 77: 1.0, 79: 1.0, 80: 3.0, 83: 5.0, 84: 5.0, 86: 2.0, 87: 4.0, 89: 7.0, 91: 4.0, 92: 1.0, 94: 3.0, 95: 7.0, 96: 1.0, 97: 2.0}),
 SparseVector(100, {0: 1.0, 1: 4.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 8: 1.0, 9: 1.0, 10: 3.0, 11: 3.0, 13: 3.0, 14: 2.0, 15: 1.0, 16: 1.0, 17: 12.0, 18: 2.0, 19: 1.0, 21: 2.0, 25: 3.0, 26: 2.0, 27: 10.0, 28: 1.0, 30: 1.0, 31: 1.0, 33: 2.0, 34: 3.0, 37: 5.0, 38: 1.0, 40: 4.0, 41: 2.0, 42: 2.0, 46: 1.0, 47: 1.0, 48: 1.0, 49: 2.0, 50: 2.0, 51: 1.0, 52: 5.0, 55: 1.0, 56: 3.0, 57: 1.0, 58: 1.0

In [43]:
idf = IDF()

In [44]:
idf

<pyspark.mllib.feature.IDF at 0x7f8004084470>

In [45]:
idf_model = idf.fit(tfs)

In [46]:
idf_model

<pyspark.mllib.feature.IDFModel at 0x7f7ff6d006a0>

In [47]:
tf_idf_vectors = idf_model.transform(tfs)

In [48]:
tf_idf_vectors.collect()

[SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 6: 0.2877, 7: 0.2877, 10: 0.0, 11: 0.0, 12: 0.5754, 15: 0.2877, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 1.4384, 21: 0.0, 24: 4.852, 25: 0.0, 26: 0.0, 27: 0.0, 30: 0.0, 32: 0.2877, 33: 0.0, 34: 0.0, 35: 0.2877, 37: 0.5754, 38: 0.2877, 39: 0.2877, 40: 0.2877, 41: 0.0, 42: 0.0, 46: 0.0, 51: 0.0, 52: 0.0, 54: 0.2877, 56: 0.0, 57: 2.3015, 58: 0.0, 60: 0.0, 61: 0.2877, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.6931, 68: 0.2877, 69: 0.0, 70: 0.0, 71: 0.2877, 72: 0.2877, 73: 0.0, 77: 0.2877, 79: 0.2877, 80: 0.0, 83: 0.0, 84: 0.0, 86: 0.5754, 87: 0.0, 89: 0.0, 91: 2.7726, 92: 0.2877, 94: 0.863, 95: 0.0, 96: 0.0, 97: 0.5754}),
 SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.2877, 8: 0.2877, 9: 0.2877, 10: 0.0, 11: 0.0, 13: 0.863, 14: 0.5754, 15: 0.2877, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 21: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.2877, 30: 0.0, 31: 0.2877, 33: 0.0, 34: 0.0, 37: 1.4384, 38: 0.2877, 40: 1.1507, 41: 0.0, 4

## Scaling

https://spark.apache.org/docs/latest/mllib-feature-extraction.html#standardscaler

In [49]:
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler().fit(tf_idf_vectors)

In [50]:
scaled_tf_idf_vectors = scaler.transform(tf_idf_vectors)

In [51]:
scaled_tf_idf_vectors.collect()

[SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 6: 1.7321, 7: 1.0, 10: 0.0, 11: 0.0, 12: 2.0, 15: 1.7321, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 1.8898, 21: 0.0, 24: 1.7321, 25: 0.0, 26: 0.0, 27: 0.0, 30: 0.0, 32: 1.7321, 33: 0.0, 34: 0.0, 35: 0.6547, 37: 0.7947, 38: 1.7321, 39: 1.0, 40: 0.4804, 41: 0.0, 42: 0.0, 46: 0.0, 51: 0.0, 52: 0.0, 54: 1.0, 56: 0.0, 57: 1.8353, 58: 0.0, 60: 0.0, 61: 1.7321, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 1.7321, 68: 1.0, 69: 0.0, 70: 0.0, 71: 0.6547, 72: 1.7321, 73: 0.0, 77: 0.6547, 79: 1.0, 80: 0.0, 83: 0.0, 84: 0.0, 86: 1.3093, 87: 0.0, 89: 0.0, 91: 1.7321, 92: 1.0, 94: 1.964, 95: 0.0, 96: 0.0, 97: 1.3093}),
 SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 1.7321, 8: 1.7321, 9: 0.6547, 10: 0.0, 11: 0.0, 13: 1.964, 14: 2.0, 15: 1.7321, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 21: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 1.7321, 30: 0.0, 31: 1.7321, 33: 0.0, 34: 0.0, 37: 1.9868, 38: 1.7321, 40: 1.9215, 41: 0.0, 42: 0.0, 46: 0.0, 47: 1.

In [52]:
scaler2 = StandardScaler(withMean=True, withStd=True).fit(tf_idf_vectors)
scaler2.transform(tf_idf_vectors).collect()

[DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, -1.1547, 0.5774, 0.0, -1.1547, -0.8729, 0.0, 0.0, 1.0, -1.0911, -1.0, 0.5774, 0.0, 0.0, 0.0, 0.0, 1.1339, 0.0, -0.5774, 0.0, 1.1547, 0.0, 0.0, 0.0, -1.1547, -0.5774, 0.0, -1.1547, 0.5774, 0.0, 0.0, -0.2182, 0.0, -0.1325, 0.5774, 0.0, -0.3203, 0.0, 0.0, 0.0, 0.0, -0.5774, 0.0, -1.0, -1.1547, -1.0, -0.5774, 0.0, 0.0, -0.5774, 0.0, -0.5774, 0.0, 1.1471, 0.0, -0.5774, 0.0, 0.5774, 0.0, 0.0, 0.0, 0.0, 1.1547, -1.0, 0.0, 0.0, 0.0, -0.2182, 0.5774, 0.0, -0.5774, -0.5774, -1.1547, -0.2182, -1.0, 0.0, 0.0, 0.0, -1.0911, 0.0, 0.0, 0.0, 0.2182, 0.0, -1.0911, 0.0, -1.0, 1.1547, 0.0, -1.0, 0.8729, 0.0, 0.0, 0.2182, -1.1094, -0.8006]),
 DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.5774, -1.1547, -1.0, 0.5774, -0.2182, 0.0, 0.0, -1.0, 0.8729, 1.0, 0.5774, 0.0, 0.0, 0.0, 0.0, -0.7559, 0.0, -0.5774, 0.0, -0.5774, 0.0, 0.0, 0.0, 0.5774, -0.5774, 0.0, 0.5774, -1.1547, 0.0, 0.0, -0.8729, 0.0, 1.0596, 0.5774, -1.0, 1.1209, 0.0, 0.0, 0.0, 0.0, -0.5774, 0.0, 0.0, 0.5774, 1.0

## Normalization

https://spark.apache.org/docs/latest/mllib-feature-extraction.html#normalizer

In [53]:
from pyspark.mllib.feature import Normalizer
normalizer_l2 = Normalizer(p=2)

In [54]:
normalized_tf_idf_vectors = normalizer_l2.transform(tf_idf_vectors)
normalized_tf_idf_vectors.collect()

[SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 6: 0.0441, 7: 0.0441, 10: 0.0, 11: 0.0, 12: 0.0883, 15: 0.0441, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.2207, 21: 0.0, 24: 0.7445, 25: 0.0, 26: 0.0, 27: 0.0, 30: 0.0, 32: 0.0441, 33: 0.0, 34: 0.0, 35: 0.0441, 37: 0.0883, 38: 0.0441, 39: 0.0441, 40: 0.0441, 41: 0.0, 42: 0.0, 46: 0.0, 51: 0.0, 52: 0.0, 54: 0.0441, 56: 0.0, 57: 0.3532, 58: 0.0, 60: 0.0, 61: 0.0441, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.1064, 68: 0.0441, 69: 0.0, 70: 0.0, 71: 0.0441, 72: 0.0441, 73: 0.0, 77: 0.0441, 79: 0.0441, 80: 0.0, 83: 0.0, 84: 0.0, 86: 0.0883, 87: 0.0, 89: 0.0, 91: 0.4255, 92: 0.0441, 94: 0.1324, 95: 0.0, 96: 0.0, 97: 0.0883}),
 SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0767, 8: 0.0767, 9: 0.0767, 10: 0.0, 11: 0.0, 13: 0.2302, 14: 0.1535, 15: 0.0767, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 21: 0.0, 25: 0.0, 26: 0.0, 27: 0.0, 28: 0.0767, 30: 0.0, 31: 0.0767, 33: 0.0, 34: 0.0, 37: 0.3837, 38: 0.0767, 40: 0.3069, 41: 0.0

## Calculating similarity

https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.linalg

In [55]:
chosen_doc = normalized_tf_idf_vectors.take(1)
chosen_doc

[SparseVector(100, {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 6: 0.0441, 7: 0.0441, 10: 0.0, 11: 0.0, 12: 0.0883, 15: 0.0441, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.2207, 21: 0.0, 24: 0.7445, 25: 0.0, 26: 0.0, 27: 0.0, 30: 0.0, 32: 0.0441, 33: 0.0, 34: 0.0, 35: 0.0441, 37: 0.0883, 38: 0.0441, 39: 0.0441, 40: 0.0441, 41: 0.0, 42: 0.0, 46: 0.0, 51: 0.0, 52: 0.0, 54: 0.0441, 56: 0.0, 57: 0.3532, 58: 0.0, 60: 0.0, 61: 0.0441, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.1064, 68: 0.0441, 69: 0.0, 70: 0.0, 71: 0.0441, 72: 0.0441, 73: 0.0, 77: 0.0441, 79: 0.0441, 80: 0.0, 83: 0.0, 84: 0.0, 86: 0.0883, 87: 0.0, 89: 0.0, 91: 0.4255, 92: 0.0441, 94: 0.1324, 95: 0.0, 96: 0.0, 97: 0.0883})]

In [56]:
chosen_doc = chosen_doc[0]

In [57]:
def calc_cosine_similarity(a,b):
    return a.dot(b)

similarities = normalized_tf_idf_vectors.map(
    lambda x: calc_cosine_similarity(x, chosen_doc))

In [58]:
similarities

PythonRDD[39] at RDD at PythonRDD.scala:48

In [59]:
similarities.collect()

[1.0000000000000009, 0.1083972033078154, 0.11710656315774444]

In [60]:
chosen_doc = tf_idf_vectors.take(1)[0]
similarities = tf_idf_vectors.map(
    lambda x: calc_cosine_similarity(x, chosen_doc))
similarities.collect()

[42.468825643921001, 2.6483511939248539, 3.4759609420263704]

## Word2Vec

https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.feature.Word2Vec

In [61]:
from pyspark.mllib.feature import Word2Vec

In [62]:
word2vec = Word2Vec()
model = word2vec.fit(bag_of_words)

In [64]:
list(model.findSynonyms('information',10))

[('by', 0.17507408559322357),
 ('the', 0.13650830090045929),
 ('as', 0.11595361679792404),
 ('The', 0.10526826232671738),
 ('a', 0.095001891255378723),
 ('retrieval', 0.089702315628528595),
 ('in', 0.013783745467662811),
 ('of', 0.0046437471173703671),
 ('for', -0.00085847871378064156),
 ('are', -0.027802722528576851)]

In [65]:
model.findSynonyms('begins',10)

Py4JJavaError: An error occurred while calling o243.findSynonyms.
: java.lang.IllegalStateException: begins not in vocabulary
	at org.apache.spark.mllib.feature.Word2VecModel.transform(Word2Vec.scala:529)
	at org.apache.spark.mllib.feature.Word2VecModel.findSynonyms(Word2Vec.scala:541)
	at org.apache.spark.mllib.api.python.Word2VecModelWrapper.findSynonyms(Word2VecModelWrapper.scala:53)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [66]:
model.transform("information")

DenseVector([-0.0047, 0.0026, 0.0007, -0.0001, -0.0014, -0.0029, 0.0044, -0.003, 0.002, -0.0018, 0.0049, 0.0026, 0.0016, 0.002, -0.0011, 0.0047, -0.0011, -0.0018, -0.0043, -0.0025, 0.0021, -0.0051, 0.0, -0.0013, 0.0023, 0.0034, 0.0012, -0.0033, 0.0041, -0.0042, 0.0049, 0.0045, -0.001, -0.0044, -0.0038, 0.0007, -0.0019, 0.0032, 0.0018, 0.0036, -0.0019, -0.002, -0.0007, 0.0036, -0.0034, 0.0024, -0.0029, 0.0049, 0.0022, 0.0046, 0.0035, 0.002, -0.0032, -0.0028, 0.001, 0.0037, 0.0023, -0.0002, -0.0012, -0.0001, -0.0001, 0.0018, -0.0048, -0.0007, 0.0024, 0.0043, -0.0042, -0.0016, -0.0045, 0.0033, 0.0033, -0.003, 0.0034, 0.0036, -0.0025, -0.0004, 0.0032, 0.0029, 0.0045, -0.0044, 0.0024, 0.0023, -0.0038, 0.0049, -0.0004, -0.0041, 0.0039, -0.0005, -0.0009, -0.0035, 0.0007, -0.0005, 0.0013, 0.0037, -0.0029, 0.0015, 0.0028, -0.0037, 0.0029, -0.0007])

In [67]:
dic = model.getVectors()

In [68]:
len(dic)

17

In [69]:
dic.keys()

KeysView({'for': [0.0047001303, -0.0016854116, 0.0026793554, 0.0031683391, -1.7905342e-05, -0.0004868209, 0.0014373835, 0.0024712747, 0.0031801637, 0.0024905412, -0.0031188347, -0.0021240122, 0.0035675413, 0.0023633398, -0.0034750043, 0.0035335694, -0.0029262525, 0.0031822147, 0.0039775884, 0.0026325579, 0.0011055218, 0.00090653956, 0.004229791, -0.00477651, 0.0043247086, 0.00063684775, 0.0030338576, 0.0015136508, -0.00050576485, 0.0028403357, 0.0024474638, 0.004809735, 0.0028830818, -0.00015406685, 0.0022000598, -0.0042193346, 0.0043043313, 0.0015846329, 0.0017031013, 0.001948534, -0.0013776568, 0.0042162244, -0.0015344793, -0.0040058848, 0.0030701826, 0.004713468, -0.0038863115, 0.0048577823, 0.0008113637, 0.0022230644, -0.0016704818, -0.001964513, -0.0025674098, -0.00012785832, 0.003003507, -0.0031702416, 0.00062483054, 0.002402168, 0.0026150695, 0.0043001724, -0.0032592814, -0.0023695154, -0.0028853742, -0.0017271837, -0.004937374, -0.0010920943, 0.0019595006, -0.0040831347, -0.003

In [70]:
dic['information']

[-0.004723864, 0.002613491, 0.0007446775, -0.0001324323, -0.0013685296, -0.002912594, 0.0043770196, -0.0030009036, 0.001953781, -0.0018130864, 0.004938094, 0.0025866104, 0.0015522182, 0.0019539248, -0.0010751929, 0.0047064302, -0.0010971265, -0.0017515055, -0.004278674, -0.0024672844, 0.0020958942, -0.005072666, 4.1995147e-05, -0.0013467753, 0.0022656163, 0.0033714226, 0.0012424166, -0.0033279967, 0.004119997, -0.004201455, 0.004911716, 0.0044554854, -0.00095554773, -0.0043917913, -0.0037531639, 0.0006852576, -0.0018514385, 0.003164281, 0.0017564661, 0.0035515525, -0.0018740431, -0.0019759967, -0.00070078735, 0.0035555274, -0.0033679341, 0.0023586717, -0.0028801835, 0.0049218824, 0.002194466, 0.0045532663, 0.003534684, 0.0020456326, -0.0031618315, -0.002750446, 0.0010424393, 0.0036532586, 0.0023107734, -0.00016288084, -0.0011699398, -0.0001049456, -0.00012637656, 0.0018391148, -0.004809347, -0.00069596263, 0.002379534, 0.0043083364, -0.0041958946, -0.0015698429, -0.004492121, 0.0032924