In [0]:
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import numpy as np
import pandas as pd

# Create Spark session (already available in Databricks)
spark = SparkSession.builder.appName("DimensionalityReduction").getOrCreate()



In [0]:
from pyspark.ml.linalg import Vectors
from sklearn.datasets import load_breast_cancer

# Load breast cancer dataset
bc = load_breast_cancer()
data = [(Vectors.dense(features), str(label)) for features, label in zip(bc.data, bc.target)]

# Create DataFrame
df = spark.createDataFrame(data, ["features", "label"])
print("Dataset loaded with shape:", (df.count(), len(bc.feature_names)))
df.show(5)

Dataset loaded with shape: (569, 30)
+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.99,10.38,122....|    0|
|[20.57,17.77,132....|    0|
|[19.69,21.25,130....|    0|
|[11.42,20.38,77.5...|    0|
|[20.29,14.34,135....|    0|
+--------------------+-----+
only showing top 5 rows



In [0]:
# Vector Assembler is already part of our input, so we can skip that step
# Create scaler
scaler = StandardScaler(
   inputCol="features",
   outputCol="scaled_features",
   withStd=True,
   withMean=True
)

# Create PCA model
pca = PCA(
   k=2,  # reduce to 2 dimensions
   inputCol="scaled_features",
   outputCol="pca_features"
)

# Create and run the pipeline
pipeline = Pipeline(stages=[scaler, pca])

In [0]:
# Fit pipeline
model = pipeline.fit(df)

# Transform data
result = model.transform(df)

# Show results
print("Original Features:", bc.feature_names)
print("\nPrincipal components:")
print(model.stages[-1].pc)  # Get PCA components

# Select and display results
result.select("label", "pca_features").show(5, truncate=False)

# Create a visualization using Databricks' display function
display(result.select("label", "pca_features"))

Original Features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Principal components:
DenseMatrix([[-0.21890244,  0.23385713],
             [-0.10372458,  0.05970609],
             [-0.22753729,  0.21518136],
             [-0.22099499,  0.23107671],
             [-0.14258969, -0.18611302],
             [-0.23928535, -0.15189161],
             [-0.25840048, -0.06016536],
             [-0.26085376,  0.0347675 ],
             [-0.13816696, -0.19034877],
             [-0.06436335, -0.3665

label,pca_features
0,"Map(vectorType -> dense, length -> 2, values -> List(-9.184755209858798, -1.9468700303852997))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.385702628982577, 3.764859062972658))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-5.728855490819108, 1.0742285887048915))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-7.116691259621306, -10.266555635124405))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-3.931842466790753, 1.9463589770798275))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.3781546249740986, -3.9464564299158127))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.236915058609935, 2.6876664141795787))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.1414142815514485, -2.3381866491831156))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-3.172133150308517, -3.3888311376864144))"
0,"Map(vectorType -> dense, length -> 2, values -> List(-6.346162835225129, -7.720380945492959))"


In [0]:
# Convert to pandas for better visualization
pandas_df = result.select("label", "pca_features").toPandas()
pandas_df['PC1'] = pandas_df['pca_features'].apply(lambda x: float(x[0]))
pandas_df['PC2'] = pandas_df['pca_features'].apply(lambda x: float(x[1]))

# Create scatter plot using Databricks' display function
display(spark.createDataFrame(pandas_df), "scatter", {
 "x": "PC1",
 "y": "PC2",
 "color": "label"
})

# Calculate explained variance
explained_variance = model.stages[-1].explainedVariance
print("\nExplained Variance Ratio:")
for i, var in enumerate(explained_variance):
   print(f"PC{i+1}: {var:.3f}")

  Unable to convert the field pca_features. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Unsupported type in conversion to Arrow: VectorUDT()
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
  Could not convert DenseVector([-9.1848, -1.9469]) with type DenseVector: did not recognize Python value type when inferring an Arrow data type
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


label,pca_features,PC1,PC2
0,"Map(vectorType -> dense, length -> 2, values -> List(-9.184755209858798, -1.9468700303852997))",-9.184755209858798,-1.9468700303853
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.385702628982577, 3.764859062972658))",-2.385702628982577,3.764859062972658
0,"Map(vectorType -> dense, length -> 2, values -> List(-5.728855490819108, 1.0742285887048915))",-5.728855490819108,1.0742285887048917
0,"Map(vectorType -> dense, length -> 2, values -> List(-7.116691259621306, -10.266555635124405))",-7.116691259621306,-10.266555635124403
0,"Map(vectorType -> dense, length -> 2, values -> List(-3.931842466790753, 1.9463589770798275))",-3.931842466790753,1.9463589770798275
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.3781546249740986, -3.9464564299158127))",-2.3781546249740986,-3.9464564299158127
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.236915058609935, 2.6876664141795787))",-2.236915058609935,2.6876664141795787
0,"Map(vectorType -> dense, length -> 2, values -> List(-2.1414142815514485, -2.3381866491831156))",-2.1414142815514485,-2.338186649183116
0,"Map(vectorType -> dense, length -> 2, values -> List(-3.172133150308517, -3.3888311376864144))",-3.172133150308517,-3.3888311376864144
0,"Map(vectorType -> dense, length -> 2, values -> List(-6.346162835225129, -7.720380945492959))",-6.346162835225129,-7.720380945492959



Explained Variance Ratio:
PC1: 0.443
PC2: 0.190


In [0]:
from pyspark.ml.stat import Correlation
import time

# Measure transformation time
start_time = time.time()
transformed_data = model.transform(df)
end_time = time.time()

print(f"Transformation time: {end_time - start_time:.2f} seconds")

# Calculate feature correlations with principal components
loadings = model.stages[-1].pc.toArray()
feature_importance = pd.DataFrame(
    loadings,
    columns=[f'PC{i+1}' for i in range(loadings.shape[1])],
    index=bc.feature_names
)

print("\nFeature contributions to principal components:")
display(spark.createDataFrame(feature_importance.reset_index()))

Transformation time: 0.09 seconds

Feature contributions to principal components:


index,PC1,PC2
mean radius,-0.2189024437000036,0.2338571317474297
mean texture,-0.1037245782157058,0.0597060882917296
mean perimeter,-0.2275372930056264,0.2151813613967707
mean area,-0.2209949853859409,0.2310767112838646
mean smoothness,-0.1425896943602374,-0.1861130226705194
mean compactness,-0.2392853539530001,-0.1518916100733257
mean concavity,-0.2584004812487713,-0.0601653627986625
mean concave points,-0.2608537583857405,0.0347675004937464
mean symmetry,-0.1381669593036487,-0.1903487703722512
mean fractal dimension,-0.0643633463717727,-0.3665754713782565


In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Prepare data for clustering
kmeans = KMeans(k=2, featuresCol="pca_features", predictionCol="cluster")
kmeans_model = kmeans.fit(result)
clustered = kmeans_model.transform(result)

# Evaluate clustering
evaluator = ClusteringEvaluator(
   predictionCol="cluster",
   featuresCol="pca_features",
   metricName="silhouette"
)
silhouette = evaluator.evaluate(clustered)

print(f"Silhouette score: {silhouette:.3f}")

# Visualize clusters
display(clustered.select("label", "cluster", "pca_features"))

Silhouette score: 0.671


label,cluster,pca_features
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-9.184755209858798, -1.9468700303852997))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.385702628982577, 3.764859062972658))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-5.728855490819108, 1.0742285887048915))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-7.116691259621306, -10.266555635124405))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-3.931842466790753, 1.9463589770798275))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.3781546249740986, -3.9464564299158127))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.236915058609935, 2.6876664141795787))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.1414142815514485, -2.3381866491831156))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-3.172133150308517, -3.3888311376864144))"
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-6.346162835225129, -7.720380945492959))"


In [0]:
# Compare original vs reduced dimensions
print("Dataset Summary:")
print(f"Original dimensions: {len(bc.feature_names)}")
print(f"Reduced dimensions: 2")
print(f"Data points: {df.count()}")
print("\nDimensionality Reduction Performance:")
print(f"Total explained variance: {sum(explained_variance):.3f}")
print(f"Clustering quality (Silhouette): {silhouette:.3f}")

# Save results to a table if needed
clustered.write.mode("overwrite").saveAsTable("dimensionality_reduction_results")

# Final visualization with both original labels and clusters
final_viz = clustered.select("label", "cluster", "pca_features").toPandas()
final_viz['PC1'] = final_viz['pca_features'].apply(lambda x: float(x[0]))
final_viz['PC2'] = final_viz['pca_features'].apply(lambda x: float(x[1]))

display(spark.createDataFrame(final_viz), "scatter", {
    "x": "PC1",
    "y": "PC2",
    "color": ["label", "cluster"]  # This will create two visualizations
})

Dataset Summary:
Original dimensions: 30
Reduced dimensions: 2
Data points: 569

Dimensionality Reduction Performance:
Total explained variance: 0.632
Clustering quality (Silhouette): 0.671


  Unable to convert the field pca_features. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Unsupported type in conversion to Arrow: VectorUDT()
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
  Could not convert DenseVector([-9.1848, -1.9469]) with type DenseVector: did not recognize Python value type when inferring an Arrow data type
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


label,cluster,pca_features,PC1,PC2
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-9.184755209858798, -1.9468700303852997))",-9.184755209858798,-1.9468700303853
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.385702628982577, 3.764859062972658))",-2.385702628982577,3.764859062972658
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-5.728855490819108, 1.0742285887048915))",-5.728855490819108,1.0742285887048917
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-7.116691259621306, -10.266555635124405))",-7.116691259621306,-10.266555635124403
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-3.931842466790753, 1.9463589770798275))",-3.931842466790753,1.9463589770798275
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.3781546249740986, -3.9464564299158127))",-2.3781546249740986,-3.9464564299158127
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.236915058609935, 2.6876664141795787))",-2.236915058609935,2.6876664141795787
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-2.1414142815514485, -2.3381866491831156))",-2.1414142815514485,-2.338186649183116
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-3.172133150308517, -3.3888311376864144))",-3.172133150308517,-3.3888311376864144
0,1,"Map(vectorType -> dense, length -> 2, values -> List(-6.346162835225129, -7.720380945492959))",-6.346162835225129,-7.720380945492959
