# Boston Housing PCA and SVD

In [17]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, PCA, StandardScaler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors

# for pretty printing
def printDf(sprkDF):
    """Render a Spark DataFrame as an HTML table for notebook display.

    The frame is converted with ``toPandas()`` and the resulting pandas
    DataFrame is serialised with ``to_html()``.
    NOTE(review): ``toPandas()`` presumably materialises every row locally,
    so callers should pass a small (e.g. ``limit``-ed) frame -- confirm.

    Args:
        sprkDF: Object exposing ``toPandas()`` (a Spark DataFrame).

    Returns:
        An ``IPython.display.HTML`` object wrapping the generated table markup.
    """
    # Local import keeps the helper self-contained; only HTML is needed here.
    from IPython.display import HTML
    return HTML(sprkDF.toPandas().to_html())

In [18]:
# Relative path to the Boston housing CSV file that is loaded into a DataFrame below.
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [19]:
# Obtain the session for this notebook under the app name "BostonHousingPCA";
# getOrCreate() hands back an already-running session when one exists.
spark = SparkSession.builder.appName("BostonHousingPCA").getOrCreate()

DataFrame creation using an inferred schema

In [20]:
# Read the semicolon-delimited CSV (first line is the header), let Spark infer
# the column types, and add CATBOOL as a boolean cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() would only append a stray "None" line to the output.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


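Prepare the feature vectors: assemble the predictor columns into a single vector column and scale them with `StandardScaler` (no train/test split is performed here).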

In [21]:
# Use every column as a predictor except the target (MEDV) and the two
# columns derived from it (CAT, CATBOOL).
featureCols = list(df.columns)
for label in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(label)
print(featureCols)

# Pack the predictor columns into one vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
featureSet = assembler.transform(df)

# Scale by standard deviation only; the features are not mean-centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

# Fitting gathers the summary statistics; transforming applies the scaling.
scalerModel = scaler.fit(featureSet)
scaledFeatureSet = scalerModel.transform(featureSet)


['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


## PCA Model

In [22]:
# Fit a 7-component PCA on the scaled features, then project the same data
# onto those components and keep only the projected column.
pca = PCA(k=7, inputCol="scaledFeatures", outputCol="pcaFeatures")
pcaModel = pca.fit(scaledFeatureSet)
result = pcaModel.transform(scaledFeatureSet).select("pcaFeatures")

# Show the first ten projected rows as an HTML table.
printDf(result.limit(10))

Unnamed: 0,pcaFeatures
0,"[-1.255126323467362, 0.6192087530486635, 2.8492026736310136, -0.12832421852268733, -8.459293518071552, -2.5065838672319654, -0.7738056167657951]"
1,"[-1.8955383642585284, 0.43826027853295, 1.8120869800994324, 0.2755915629629274, -9.077627632677466, -1.9275954957738022, -0.5388176601391962]"
2,"[-1.2788028017577684, 0.44590690480701095, 2.6735553663401452, 0.024094787837397096, -9.815555769746382, -1.7439058622966233, -0.6080504433935084]"
3,"[-0.7424275948416084, -0.1600034995910606, 2.4064140009513912, 0.419186754375297, -9.985809177328413, -1.5275650787063515, -0.4701060596117716]"
4,"[-0.895594581774879, -0.055524327247303974, 2.4313252725308705, 0.33508434649155555, -9.946816072159676, -1.5751203100286044, -0.3877424329870516]"
5,"[-1.1386875105131828, -0.16261730588733675, 1.8348826354771166, 0.5868319355679501, -9.508513177538937, -1.620703213167048, -0.44159174477324614]"
6,"[-1.993811697840592, 0.19638661842065766, 2.134967463297675, 0.36482796622426517, -7.809858898589981, -2.6499732906888775, -1.0723856785552783]"
7,"[-2.5101372119926286, 0.42408881980615676, 1.9885712038195325, 0.22534159166219714, -7.503621451523909, -2.770242671577844, -0.8675218045357369]"
8,"[-3.171598958057647, 0.18903985467541123, 1.1582945709536037, 0.516891286279167, -6.533965468702469, -2.816001315226671, -0.7777027953590024]"
9,"[-2.278227215670247, 0.16274914766596799, 1.9486819364168335, 0.38301153223351836, -7.452828823321953, -2.646983140069571, -0.9870259090540333]"


## SVD

In [23]:
# RowMatrix belongs to the mllib API, so convert each row's ml vector into an
# mllib vector explicitly instead of wrapping the whole Row object.
featureVector = scaledFeatureSet.select("scaledFeatures").rdd.map(
    lambda row: Vectors.fromML(row.scaledFeatures)
)
mat = RowMatrix(featureVector)

# Keep the 7 leading singular values/vectors and request the U factor as well.
svd = mat.computeSVD(k=7, computeU=True, rCond=1.0e-9)
U = svd.U  # The U factor is a RowMatrix.
s = svd.s  # The singular values are stored in a local dense vector.
V = svd.V  # The V factor of the decomposition.

# U is distributed; bring its rows to the driver so they can be printed.
collectPartitions = U.rows.collect()
print("U factor is:")
for vector in collectPartitions:
    print("\t", vector)
print("Singular values are: ", s)
print("V factor is:\n", V)

.028843810593992457,0.03974008341310957,0.03145120279569345,-0.0777548774240385,0.009142658397749512]
	 [-0.04716741136612964,0.05896149311341365,0.011165595524121137,0.02466171050728668,0.014368664419049982,-0.04680947661093531,0.03910524313244254]
	 [-0.047790988149405134,0.05169635701614522,0.0075654974879081835,0.012086114172786391,0.005861770473692601,-0.03092285679780169,0.048097125362523946]
	 [-0.04838744806012327,0.04646440690885231,0.012151696897248234,0.003981009519252465,0.00341648539304165,-0.021362015525638087,0.03489776734932634]
	 [-0.04649352581312513,0.06722945582514323,0.021792609803439663,0.036468029660279226,0.027293998999395813,-0.06684807502391667,0.02990866233396622]
	 [-0.04883308949941825,0.04592242388068702,0.006258487683384388,-0.005341828318454735,-0.003964872111202774,-0.019701675725498295,0.061998749191027276]
	 [-0.04817378366212115,0.05254441763637917,0.014799381200387842,-0.00113184868891196,0.004547072916521816,-0.03148667249792443,0.06814110648601723

In [24]:
# Stop the Spark session now that the analysis is finished.
spark.stop()