# Data Types - RDD-based API

In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("PythonPi")
    .getOrCreate()
)

In [4]:
import numpy as np
import scipy.sparse as sps
# Use the RDD-based (mllib) vector factory: this notebook covers the RDD-based
# API, and pyspark.ml.linalg vectors are a separate class hierarchy that
# pyspark.mllib APIs do not accept.
from pyspark.mllib.linalg import Vectors

# Dense vector given as a NumPy array.
dv1 = np.array([1,0,3])

# Dense vector given as a plain Python list.
dv2 = [1, 0, 3]

# Sparse vector of size 3 with non-zero entries 1.0 at index 0 and 3.0 at index 2.
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# The same vector as a single-column SciPy CSC matrix, built from
# (data, row indices, column pointers) with shape 3x1.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))

In [5]:
dv1

array([1, 0, 3])

In [6]:
dv2

[1, 0, 3]

In [9]:
sv1.values

array([1., 3.])

In [11]:
sv2

<3x1 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Column format>

In [12]:
# Labeled point
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Positive example (label 1.0) whose features are a dense Python list.
pos = LabeledPoint(label=1.0, features=[1.0, 0.0, 3.0])

# Negative example (label 0.0) whose features are a size-3 sparse vector
# holding 1.0 at index 0 and 3.0 at index 2.
neg = LabeledPoint(
    label=0.0,
    features=SparseVector(3, [0, 2], [1.0, 3.0]),
)

In [15]:
pos

LabeledPoint(1.0, [1.0,0.0,3.0])

In [16]:
from pyspark.mllib.linalg import Matrix, Matrices

# Create a 3x2 dense matrix. Values are supplied in column-major order, so the
# first column is (1, 2, 3) and the second is (4, 5, 6), i.e. the matrix
# ((1.0, 4.0), (2.0, 5.0), (3.0, 6.0)).
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])

# Create a 3x2 sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) in CSC form:
# column pointers [0, 1, 3], row indices [0, 2, 1], values [9, 6, 8].
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

In [17]:
dm2

DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], False)

In [18]:
sm

SparseMatrix(3, 2, [0, 1, 3], [0, 2, 1], [9.0, 6.0, 8.0], False)

In [20]:
from pyspark.mllib.linalg.distributed import RowMatrix

# Distribute four 3-element lists as an RDD; each list is one row of the matrix.
sc = spark.sparkContext
rows = sc.parallelize([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])

# Wrap the RDD in a row-oriented distributed matrix.
mat = RowMatrix(rows)

# Matrix dimensions: 4 rows by 3 columns.
m, n = mat.numRows(), mat.numCols()

# Recover the rows as an RDD of vectors.
rowsRDD = mat.rows

In [21]:
mat

<pyspark.mllib.linalg.distributed.RowMatrix at 0x111adcf60>