In [1]:
from pyspark.sql import SparkSession

# Get the active Spark session for this notebook, or start one named
# 'digit-recognizer' if none exists yet.
spark = (
    SparkSession.builder
    .appName('digit-recognizer')
    .getOrCreate()
)

# Read the training CSV, treating the first row as column names and letting
# Spark infer each column's type from the data.
train = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('./train.csv')
)
train.printSchema()


root
 |-- label: integer (nullable = true)
 |-- pixel0: integer (nullable = true)
 |-- pixel1: integer (nullable = true)
 |-- pixel2: integer (nullable = true)
 |-- pixel3: integer (nullable = true)
 |-- pixel4: integer (nullable = true)
 |-- pixel5: integer (nullable = true)
 |-- pixel6: integer (nullable = true)
 |-- pixel7: integer (nullable = true)
 |-- pixel8: integer (nullable = true)
 |-- pixel9: integer (nullable = true)
 |-- pixel10: integer (nullable = true)
 |-- pixel11: integer (nullable = true)
 |-- pixel12: integer (nullable = true)
 |-- pixel13: integer (nullable = true)
 |-- pixel14: integer (nullable = true)
 |-- pixel15: integer (nullable = true)
 |-- pixel16: integer (nullable = true)
 |-- pixel17: integer (nullable = true)
 |-- pixel18: integer (nullable = true)
 |-- pixel19: integer (nullable = true)
 |-- pixel20: integer (nullable = true)
 |-- pixel21: integer (nullable = true)
 |-- pixel22: integer (nullable = true)
 |-- pixel23: integer (nullable = true)
 |-- pi

In [2]:
from pyspark.sql import functions
from pyspark.ml.linalg import Vectors, VectorUDT

# Every column except the 'label' target is converted below.
cols = train.columns[:]
cols.remove('label')

# Wraps a single scalar value in a one-element dense vector so that the
# vector-based ML feature transformers can consume the column.
udf = functions.udf(lambda x : Vectors.dense(x), VectorUDT())

# Build all '<name>Vec' columns in ONE projection. Calling
# withColumn(...).drop(...) once per pixel column adds a projection to the
# query plan on every iteration, which produces a very large plan for a wide
# frame; a single select yields the same schema (label first, then the
# vector columns in their original order) without that overhead.
train = train.select(
    'label',
    *[udf(train[name]).alias(name + 'Vec') for name in cols]
)

train.printSchema()

root
 |-- label: integer (nullable = true)
 |-- pixel0Vec: vector (nullable = true)
 |-- pixel1Vec: vector (nullable = true)
 |-- pixel2Vec: vector (nullable = true)
 |-- pixel3Vec: vector (nullable = true)
 |-- pixel4Vec: vector (nullable = true)
 |-- pixel5Vec: vector (nullable = true)
 |-- pixel6Vec: vector (nullable = true)
 |-- pixel7Vec: vector (nullable = true)
 |-- pixel8Vec: vector (nullable = true)
 |-- pixel9Vec: vector (nullable = true)
 |-- pixel10Vec: vector (nullable = true)
 |-- pixel11Vec: vector (nullable = true)
 |-- pixel12Vec: vector (nullable = true)
 |-- pixel13Vec: vector (nullable = true)
 |-- pixel14Vec: vector (nullable = true)
 |-- pixel15Vec: vector (nullable = true)
 |-- pixel16Vec: vector (nullable = true)
 |-- pixel17Vec: vector (nullable = true)
 |-- pixel18Vec: vector (nullable = true)
 |-- pixel19Vec: vector (nullable = true)
 |-- pixel20Vec: vector (nullable = true)
 |-- pixel21Vec: vector (nullable = true)
 |-- pixel22Vec: vector (nullable = true)
 

In [None]:
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# Feature inputs: every column of `train` except the 'label' target.
# NOTE(review): assumes these are the one-element vector columns produced by
# the previous cell -- confirm if the notebook is run in a different order.
cols = train.columns[:]
cols.remove('label')

# Assemble first, then scale once. StandardScaler standardizes each feature
# of its input vector independently, so scaling the assembled vector gives the
# same per-pixel scaling as one StandardScaler per column, but the pipeline
# fits a single scaler instead of one estimator per pixel column.
assembler = VectorAssembler(inputCols=cols, outputCol="rawFeatures")
scaler = StandardScaler(inputCol="rawFeatures", outputCol="features")

# Project the scaled features onto the first 100 principal components.
pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures")

# PCA is kept as the final stage so the fitted PCA model is model.stages[-1].
pipeline = Pipeline(stages=[assembler, scaler, pca])
model = pipeline.fit(train)

# Adds 'rawFeatures', 'features' and 'pcaFeatures' columns to the frame.
train = model.transform(train)

In [None]:
# `model` is the fitted PipelineModel, which does not expose explainedVariance
# itself; the attribute belongs to the fitted PCA stage, which is the last
# stage of the pipeline.
model.stages[-1].explainedVariance