# Boston Housing PCA and SVD

In [None]:
import sys
sys.path.append("..")
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string
from pyspark.ml.feature import StringIndexer, VectorAssembler, PCA, StandardScaler
from pyspark.ml.classification import LinearSVC
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql.functions import desc, expr
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType

In [None]:
# Resolve the relative path of the Boston housing CSV through the project helper.
# NOTE(review): presumably returns a path/URI string Spark can read — the helper
# is not visible here; confirm.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Reuse an already running session if one exists; otherwise start a new one
# under the application name "BostonHousingPCA".
spark = SparkSession.builder.appName("BostonHousingPCA").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV (first line is the header) and let Spark
# infer the column types from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    # Keep CAT as-is and add a boolean-typed copy of it as CATBOOL.
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

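Prepare the feature data: assemble the feature columns into a single vector and standardize them (no train/test split is performed here).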

In [None]:
# Start from all columns and drop the ones that must not end up in the
# feature vector. list.remove raises ValueError if a column is missing.
featureCols = list(df.columns)
for excludedCol in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excludedCol)
print(featureCols)

# Combine the remaining columns into the single vector column "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
featureSet = assembler.transform(df)

# Scale every feature to unit standard deviation; the mean is not subtracted.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Fitting gathers the per-feature statistics that transform() then applies.
scalerModel = scaler.fit(featureSet)
scaledFeatureSet = scalerModel.transform(featureSet)


## PCA Model

In [None]:
# Fit a PCA on the scaled features, keeping k=7 principal components.
pcaModel = PCA(
    k=7,
    inputCol="scaledFeatures",
    outputCol="pcaFeatures",
).fit(scaledFeatureSet)

# Project the data onto those components and hand the first 10 projected
# rows to the print_df helper.
result = pcaModel.transform(scaledFeatureSet).select("pcaFeatures")
print_df(result.limit(10))

## SVD

In [None]:
# RowMatrix belongs to the RDD-based pyspark.mllib API, while the scaled
# features are pyspark.ml vectors. Vectors.fromML is the documented conversion
# between the two and handles dense and sparse vectors alike, instead of
# relying on NumPy to coerce a whole Row that wraps a vector.
featureVector = (
    scaledFeatureSet.select("scaledFeatures")
    .rdd
    .map(lambda row: Vectors.fromML(row[0]))
)
mat = RowMatrix(featureVector)

# Compute the top 7 singular values/vectors; the second argument (computeU)
# requests the U factor as well, the third is the reciprocal condition number.
svd = mat.computeSVD(7, True, 1.0e-9)
U = svd.U  # The U factor is a RowMatrix.
s = svd.s  # The singular values are stored in a local dense vector.
V = svd.V  # The V factor is a local dense matrix.

# Bring all rows of U to the driver so they can be printed one per line.
collectPartitions = U.rows.collect()
print("U factor is:")
for vector in collectPartitions:
    print("\t", vector)
print("Singular values are: ", s)
print("V factor is:\n", V)

In [None]:
# Stop the Spark session now that the analysis is finished.
spark.stop()