## PCA and SVD

### LOAD THE DATASET

#### Using the Banknote Authentication dataset from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/banknote+authentication

In [1]:
# Load the banknote-authentication CSV. No header option is passed to the reader, so the
# five columns are named explicitly with toDF(); inferSchema lets Spark pick the column types.
banknoteDataset = spark.read.csv("data_banknote_authentication.csv", sep=',',inferSchema=True)\
        .toDF('variance','skewness','curtosis','entropy','class')
print(banknoteDataset.head())
# printSchema() writes the schema to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
banknoteDataset.printSchema()

Row(variance=3.6216, skewness=8.6661, curtosis=-2.8073, entropy=-0.44699, class=0)
root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- curtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- class: integer (nullable = true)

None


In [2]:
# Preview the first five rows of the loaded dataset.
banknoteDataset.show(n=5)

+--------+--------+--------+--------+-----+
|variance|skewness|curtosis| entropy|class|
+--------+--------+--------+--------+-----+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|
|   3.866| -2.6383|  1.9242| 0.10645|    0|
|  3.4566|  9.5228| -4.0112| -3.5944|    0|
| 0.32924| -4.4552|  4.5718| -0.9888|    0|
+--------+--------+--------+--------+-----+
only showing top 5 rows



In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summaryStats = banknoteDataset.describe()
summaryStats.show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|          variance|          skewness|          curtosis|           entropy|             class|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|              1372|              1372|              1372|              1372|              1372|
|   mean|0.4337352570699707|1.9223531206393603|1.3976271172667651|-1.191656520043731|0.4446064139941691|
| stddev|2.8427625862785577| 5.869046743695513| 4.310030090106595| 2.101013137359609|0.4971032701256608|
|    min|           -7.0421|          -13.7731|           -5.2861|           -8.5482|                 0|
|    max|            6.8248|           12.9516|           17.9274|            2.4495|                 1|
+-------+------------------+------------------+------------------+------------------+------------------+



In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
# Assembler that packs the four measurement columns into one vector column named "features".
featureColumns = ['variance', 'skewness', 'curtosis', 'entropy']
vector_assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")

In [6]:
# Append the assembled "features" vector column to the dataset and preview three rows.
df_temp = vector_assembler.transform(banknoteDataset)
df_temp.show(n=3)

+--------+--------+--------+--------+-----+--------------------+
|variance|skewness|curtosis| entropy|class|            features|
+--------+--------+--------+--------+-----+--------------------+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|[3.6216,8.6661,-2...|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|[4.5459,8.1674,-2...|
|   3.866| -2.6383|  1.9242| 0.10645|    0|[3.866,-2.6383,1....|
+--------+--------+--------+--------+-----+--------------------+
only showing top 3 rows



In [7]:
# Remove the raw measurement columns; their values are already carried by "features".
rawColumns = ('variance', 'skewness', 'curtosis', 'entropy')
df = df_temp.drop(*rawColumns)
df.show(n=3)

+-----+--------------------+
|class|            features|
+-----+--------------------+
|    0|[3.6216,8.6661,-2...|
|    0|[4.5459,8.1674,-2...|
|    0|[3.866,-2.6383,1....|
+-----+--------------------+
only showing top 3 rows



In [8]:
from pyspark.ml.feature import StringIndexer

# Indexer that reads the "class" column and writes its index to "classIndex".
l_indexer = StringIndexer(
    inputCol="class",
    outputCol="classIndex",
)

In [9]:
# Fit the indexer on the data and append the "classIndex" column.
# Because this cell rebinds `df`, a second execution would otherwise fail: the indexer's
# output column would already exist. Dropping any leftover "classIndex" first
# (DataFrame.drop is a no-op when the column is absent) keeps the cell re-runnable
# while leaving the first run unchanged.
indexer_input = df.drop("classIndex")
df = l_indexer.fit(indexer_input).transform(indexer_input)
df.show(5)

+-----+--------------------+----------+
|class|            features|classIndex|
+-----+--------------------+----------+
|    0|[3.6216,8.6661,-2...|       0.0|
|    0|[4.5459,8.1674,-2...|       0.0|
|    0|[3.866,-2.6383,1....|       0.0|
|    0|[3.4566,9.5228,-4...|       0.0|
|    0|[0.32924,-4.4552,...|       0.0|
+-----+--------------------+----------+
only showing top 5 rows



### Perform PCA

In [10]:
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline

In [11]:
# Project the feature vectors onto their top two principal components.
from pyspark.ml.feature import PCA

bankPCA = PCA(
    k=2,
    inputCol="features",
    outputCol="pcaFeatures",
)
pcaModel = bankPCA.fit(df)

# Keep only the label and the projected vector for display.
projected = pcaModel.transform(df)
pcaResult = projected.select("class", "pcaFeatures")
pcaResult.show(truncate=False)

+-----+------------------------------------------+
|class|pcaFeatures                               |
+-----+------------------------------------------+
|0    |[-9.141988498528022,1.2645621038281543]   |
|0    |[-8.824158431686138,1.581502362136377]    |
|0    |[2.666160645333471,3.2646675009851416]    |
|0    |[-10.932646142995985,-0.13002268423205332]|
|0    |[5.93351057287925,-0.3742398697901879]    |
|0    |[-11.091338074506377,0.7384944056852125]  |
|0    |[-2.46530968170101,2.117200228644701]     |
|0    |[9.788932914931099,0.6627925029521695]    |
|0    |[-5.62191772112166,1.0048879097449603]    |
|0    |[-9.00956346569529,-0.8317760598330879]   |
|0    |[-8.620424611259098,-1.0249334460642896]  |
|0    |[3.0726725110414987,3.5767457355450856]   |
|0    |[-6.845357453759397,-1.9247461302046134]  |
|0    |[-7.592075903959261,-6.191420634596226]   |
|0    |[-9.279433134932228,1.073876390553459]    |
|0    |[4.220571981332124,4.251206933513333]     |
|0    |[-2.570789612041451,1.48

In [12]:
# NOTE(review): this cell repeats the code of the preceding PCA cell. It re-fits the same
# k=2 estimator on `df` and rebinds bankPCA, pcaModel and pcaResult.
from pyspark.ml.feature import PCA

bankPCA = PCA(inputCol="features", outputCol="pcaFeatures", k=2)
pcaModel = bankPCA.fit(df)
pcaResult = (
    pcaModel
    .transform(df)
    .select("class", "pcaFeatures")
)
pcaResult.show(truncate=False)

+-----+------------------------------------------+
|class|pcaFeatures                               |
+-----+------------------------------------------+
|0    |[-9.141988498528022,1.2645621038281543]   |
|0    |[-8.824158431686138,1.581502362136377]    |
|0    |[2.666160645333471,3.2646675009851416]    |
|0    |[-10.932646142995985,-0.13002268423205332]|
|0    |[5.93351057287925,-0.3742398697901879]    |
|0    |[-11.091338074506377,0.7384944056852125]  |
|0    |[-2.46530968170101,2.117200228644701]     |
|0    |[9.788932914931099,0.6627925029521695]    |
|0    |[-5.62191772112166,1.0048879097449603]    |
|0    |[-9.00956346569529,-0.8317760598330879]   |
|0    |[-8.620424611259098,-1.0249334460642896]  |
|0    |[3.0726725110414987,3.5767457355450856]   |
|0    |[-6.845357453759397,-1.9247461302046134]  |
|0    |[-7.592075903959261,-6.191420634596226]   |
|0    |[-9.279433134932228,1.073876390553459]    |
|0    |[4.220571981332124,4.251206933513333]     |
|0    |[-2.570789612041451,1.48

### Perform SVD

The example below runs SVD on a small hand-built 3 x 5 matrix, not on the banknote dataset.

In [13]:
# RowMatrix is part of the RDD-based pyspark.mllib API, so the vectors come from
# pyspark.mllib.linalg. This import rebinds the name `Vectors`, which an earlier cell
# imported from pyspark.ml.linalg.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

# Toy 3 x 5 matrix: one sparse row followed by two dense rows.
sampleVectors = [
    Vectors.sparse(5, {1: 1.0, 3: 7.0}),
    Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
    Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
]
rows = sc.parallelize(sampleVectors)
mat = RowMatrix(rows)

# Request the top 5 singular values and vectors, including the left singular vectors (U).
# The matrix has only three rows, so at most three singular values can be nonzero.
svd = mat.computeSVD(5, computeU=True)

U = svd.U  # RowMatrix
s = svd.s  # dense vector
V = svd.V  # dense matrix

print('Dense Vector')
print(s)
print('Dense matrix')
print(V)

Dense Vector
[13.029275535600473,5.368578733451684,2.5330498218813755,6.323166049206486e-08,2.0226934557075942e-08]
Dense matrix
DenseMatrix([[-0.31278534,  0.31167136,  0.30366911,  0.8409913 , -0.07446478],
             [-0.02980145, -0.17133211, -0.02226069,  0.14664984,  0.97352733],
             [-0.12207248,  0.15256471, -0.95070998,  0.23828799, -0.03452092],
             [-0.71847899, -0.68096285, -0.0172245 , -0.02094998, -0.13907533],
             [-0.60841059,  0.62170723,  0.05606596, -0.46260933,  0.16175873]])
