# Boston Housing PCA and SVD

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, PCA
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors

def printDf(sprkDF):
    """Render a Spark DataFrame as an HTML table for display in the notebook.

    Args:
        sprkDF: object exposing ``toPandas()`` (a Spark DataFrame). The whole
            frame is converted to pandas, so pass an already limited
            DataFrame (e.g. ``df.limit(10)``) for large data.

    Returns:
        IPython.display.HTML wrapping the pandas ``to_html()`` markup; when it
        is the last expression of a cell, Jupyter renders it as a table.
    """
    # Only HTML is needed here; the previously imported `display` was unused.
    from IPython.display import HTML

    pandasDf = sprkDF.toPandas()
    return HTML(pandasDf.to_html())

In [2]:
# Path of the input CSV file, relative to the notebook's working directory.
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [3]:
# Get the SparkSession for this notebook under the application name
# "BostonHousingPCA" (getOrCreate returns an existing session if there is one).
spark = SparkSession.builder.appName("BostonHousingPCA").getOrCreate()

DataFrame creation using an inferred schema

In [4]:
# Read the semicolon-delimited CSV with a header row and let Spark infer the
# column types; CATBOOL is the CAT column cast to boolean.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() (that only adds a stray "None" line to the output).
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare the feature vector: select the feature columns and assemble them into a single `features` column.

In [5]:
# Start from all columns and drop MEDV, CAT and CATBOOL; the rest are features.
# TODO remove additional columns to test PCA
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excluded)
print(featureCols)

# Combine the remaining columns into one vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
featureSet = assembler.transform(df)
# TODO test if standardization or normalization helps

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


## PCA Model

In [6]:
# TODO test different Settings and compare the results
# Fit a PCA with k=7 components on the assembled features, then show the
# first 10 projected rows.
pcaEstimator = PCA(k=7, inputCol="features", outputCol="pcaFeatures")
pcaModel = pcaEstimator.fit(featureSet)
result = pcaModel.transform(featureSet).select("pcaFeatures")
printDf(result.limit(10))

Unnamed: 0,pcaFeatures
0,"[-170.48334946511406, 466.4384020319935, 20.078098061799402, 50.75554287623922, -4.604172476281945, 0.25409369238824464, -2.6753499433893433]"
1,"[-121.41203669936277, 450.7621375378928, 47.68729219561534, 46.76072566080398, -2.415764243841794, 3.324343052394645, -2.271392148019748]"
2,"[-120.99048471582843, 446.7978138756289, 33.65903362728322, 35.18555087637617, -2.8788530017161538, 0.6560960663676898, 0.07601164477166011]"
3,"[-100.07154991900181, 442.5758828738239, 23.439600791075318, 25.8190289786975, -1.2720022430665483, -0.3937607731670085, -3.6141234071706276]"
4,"[-100.16874115628491, 444.7803987428149, 30.06392551637743, 31.2858090653454, -1.0244388038616958, 0.8051771362148079, -4.702391257999549]"
5,"[-101.35161562058168, 442.14918362045637, 33.39160897473632, 34.12873826094227, -1.3362917377221881, 0.24862806700494922, -4.787821958363335]"
6,"[-185.9661956215183, 469.61385544198174, 24.286195582317355, 47.31156920974688, -2.944095303219289, 7.507955565687454, -3.0584825440896477]"
7,"[-188.20788381191778, 470.98637572051086, 47.22093164665687, 66.40460805462332, -2.6506021181517943, 10.235192388109189, -6.313805864491016]"
8,"[-191.77392078179986, 461.12625528248145, 50.92494342480178, 69.3332833831602, -0.3879447969794123, 19.347975842809834, -10.343949235446447]"
9,"[-190.27203718475477, 461.1960140442279, 39.05287404025134, 59.778590505440704, -2.8139519419363275, 9.484241160307661, -5.336165973488915]"


## SVD

In [7]:
# RowMatrix comes from the RDD-based pyspark.mllib API, while the "features"
# column holds pyspark.ml vectors. Convert each row's vector explicitly instead
# of passing the whole Row to Vectors.dense and relying on implicit coercion.
featureVector = featureSet.select("features").rdd.map(
    lambda row: Vectors.fromML(row["features"])
)
mat = RowMatrix(featureVector)
# TODO test different settings and compare the result
svd = mat.computeSVD(k=7, computeU=True, rCond=1.0e-9)
U = svd.U  # The U factor is a RowMatrix.
s = svd.s  # The singular values are stored in a local dense vector.
V = svd.V
# Rows of U collected to the driver (one vector per input row).
collectPartitions = U.rows.collect()
print("U factor is:")
for vector in collectPartitions:
    print("\t", vector)
print("Singular values are: ", s)
print("V factor is:\n", V)

790433293139482,0.05769360892118863,-0.04539630539413482]
	 [-0.061564555027213946,0.0321762765153388,0.0058328706086037945,0.016234406118266896,-0.000896423755245436,-0.006699817916781287,-0.023150195105879]
	 [-0.06141625411015892,0.03269093478970866,0.002991951668025839,0.019145915119463106,0.011372484927686093,-0.030242700335467262,-0.01270357737137049]
	 [-0.06048102544791018,0.03618475583464925,-0.013983292286914436,0.03732237411912369,0.018397434041220365,0.0006456296478619672,-0.019581696988372116]
	 [-0.061556524081364215,0.0322109836254435,0.004328907376792418,0.018244893612137478,-0.028525176256698807,0.026654296437603142,-0.03218106693141211]
	 [-0.06149268652653876,0.032103784373807634,-0.0025972995595969304,0.026421425709329354,0.0026994209343611693,-0.04177617226770179,-0.0011264240578731377]
	 [-0.06151833027583473,0.032166876381865384,0.0001857461830718156,0.022981914870904025,-0.02683815434773247,-0.04082539706192437,0.004252175133271231]
	 [-0.06131453416263005,0.033

In [8]:
# Stop the Spark session created earlier in this notebook.
spark.stop()