Linear Regression

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from sklearn import datasets

# Obtain the Spark session used by the cells below; getOrCreate() reuses an
# active session if one exists, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("MLlibExamples")
    .getOrCreate()
)


In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Load the scikit-learn diabetes regression dataset into pandas.
# (scikit-learn's Boston housing loader is deprecated, so diabetes is used instead.)
diabetes = datasets.load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=["label"])
df_pd = pd.concat([X, y], axis=1)

# Convert to Spark DataFrame
df = spark.createDataFrame(df_pd)

# Assemble features: name the feature columns explicitly instead of slicing
# df.columns by position, so the "label" column can never be picked up by accident.
assembler = VectorAssembler(inputCols=list(X.columns), outputCol="features")
df = assembler.transform(df)

# Train regression model on the full dataset (no train/test split in this cell)
lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(df)

print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)
# model.summary is the training summary, so this RMSE is measured on the
# same rows the model was fitted on (in-sample error).
print("RMSE:", model.summary.rootMeanSquaredError)


Coefficients: [-10.009866299810275,-239.8156436724212,519.8459200544622,324.38464550232294,-792.1756385521954,476.7390210052281,101.04326793802068,177.06323767134498,751.2736995570891,67.62669218370446]
Intercept: 152.13348416289597
RMSE: 53.47612876402655


Classification (Breast Cancer Dataset)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Build a pandas frame holding the breast-cancer features plus a "label" column
cancer = datasets.load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = pd.DataFrame(cancer.target, columns=["label"])
df_pd = pd.concat([X, y], axis=1)

# Hand the pandas frame over to Spark
df = spark.createDataFrame(df_pd)

# Every column except the trailing "label" goes into the feature vector
feature_cols = df.columns[:-1]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Fit a logistic-regression classifier, capped at 10 iterations
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)
model = lr.fit(df)

# Score the same rows the model was fitted on and preview the first five
predictions = model.transform(df)
(
    predictions
    .select("label", "prediction", "probability")
    .show(5)
)


+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    0|       0.0|[0.99999999999986...|
|    0|       0.0|[0.99999973030752...|
|    0|       0.0|[0.99999999991389...|
|    0|       0.0|[0.99997561772253...|
|    0|       0.0|[0.99999979529657...|
+-----+----------+--------------------+
only showing top 5 rows



Clustering (Iris Dataset)

In [None]:
from pyspark.ml.clustering import KMeans

# Iris measurements as a pandas frame, with the species index kept as "label"
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
df_pd = X.assign(label=iris.target)

# Hand the pandas frame over to Spark
df = spark.createDataFrame(df_pd)

# Only the measurement columns go into the feature vector; "label" is left out
assembler = VectorAssembler(
    inputCols=iris.feature_names,
    outputCol="features",
)
df = assembler.transform(df)

# Fit KMeans with three clusters and a fixed seed
kmeans = KMeans(
    k=3,
    seed=1,
    featuresCol="features",
)
model = kmeans.fit(df)

# Assign each row to a cluster and preview the first five assignments
predictions = model.transform(df)
(
    predictions
    .select("features", "prediction")
    .show(5)
)

print("Cluster Centers:", model.clusterCenters())


+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4,0.2]|         1|
+-----------------+----------+
only showing top 5 rows

Cluster Centers: [array([5.88360656, 2.74098361, 4.38852459, 1.43442623]), array([5.006, 3.428, 1.462, 0.246]), array([6.85384615, 3.07692308, 5.71538462, 2.05384615])]


Advanced MLlib Techniques

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator, ClusteringEvaluator

import pandas as pd
from sklearn import datasets

# Session for the advanced examples; getOrCreate() returns the already-active
# session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("MLlib_Examples")
    .getOrCreate()
)


Regression – Diabetes Dataset

In [None]:
from pyspark.ml.regression import LinearRegression

# Load the diabetes regression dataset and move it into Spark
diabetes = datasets.load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=["label"])
df_pd = pd.concat([X, y], axis=1)
df = spark.createDataFrame(df_pd)

# Assemble features and keep only the two columns the estimator needs
assembler = VectorAssembler(inputCols=list(X.columns), outputCol="features")
df = assembler.transform(df).select("features", "label")

# Train/Test split (seeded so the split is repeatable)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Model
lr = LinearRegression(featuresCol="features", labelCol="label")

# Hyperparameter tuning: 3 x 3 grid over regularisation strength and L1/L2 mix
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Cross-validate on the training split only; the test split stays unseen
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

# Evaluation
rmse = evaluator.evaluate(predictions)
# Read the winning hyperparameters through the public Params API rather than
# the private Py4J handle (`_java_obj`), which is an implementation detail.
best_params = {param.name: value
               for param, value in cvModel.bestModel.extractParamMap().items()}
print("Best Model Params:", best_params)
print("Test RMSE:", rmse)


Best Model Params: {
	LinearRegression_cab2b80362de-aggregationDepth: 2,
	LinearRegression_cab2b80362de-elasticNetParam: 1.0,
	LinearRegression_cab2b80362de-epsilon: 1.35,
	LinearRegression_cab2b80362de-featuresCol: features,
	LinearRegression_cab2b80362de-fitIntercept: true,
	LinearRegression_cab2b80362de-labelCol: label,
	LinearRegression_cab2b80362de-loss: squaredError,
	LinearRegression_cab2b80362de-maxBlockSizeInMB: 0.0,
	LinearRegression_cab2b80362de-maxIter: 100,
	LinearRegression_cab2b80362de-predictionCol: prediction,
	LinearRegression_cab2b80362de-regParam: 0.01,
	LinearRegression_cab2b80362de-solver: auto,
	LinearRegression_cab2b80362de-standardization: true,
	LinearRegression_cab2b80362de-tol: 1.0E-6
}
Test RMSE: 59.96741272186755


Classification – Breast Cancer Dataset

In [None]:
from pyspark.ml.classification import LogisticRegression

# Load the breast-cancer classification dataset and move it into Spark
cancer = datasets.load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = pd.DataFrame(cancer.target, columns=["label"])
df_pd = pd.concat([X, y], axis=1)
df = spark.createDataFrame(df_pd)

# Assemble features and keep only the two columns the estimator needs
assembler = VectorAssembler(inputCols=list(X.columns), outputCol="features")
df = assembler.transform(df).select("features", "label")

# Train/Test split (seeded so the split is repeatable)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Model
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20)

# Hyperparameter tuning: 2 x 3 grid over regularisation strength and L1/L2 mix
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Cross-validate on the training split only; the test split stays unseen
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

# Evaluation
accuracy = evaluator.evaluate(predictions)
# Read the winning hyperparameters through the public Params API rather than
# the private Py4J handle (`_java_obj`), which is an implementation detail.
best_params = {param.name: value
               for param, value in cvModel.bestModel.extractParamMap().items()}
print("Best Model Params:", best_params)
print("Test Accuracy:", accuracy)


Best Model Params: {
	LogisticRegression_9cdfd2648582-aggregationDepth: 2,
	LogisticRegression_9cdfd2648582-elasticNetParam: 0.0,
	LogisticRegression_9cdfd2648582-family: auto,
	LogisticRegression_9cdfd2648582-featuresCol: features,
	LogisticRegression_9cdfd2648582-fitIntercept: true,
	LogisticRegression_9cdfd2648582-labelCol: label,
	LogisticRegression_9cdfd2648582-maxBlockSizeInMB: 0.0,
	LogisticRegression_9cdfd2648582-maxIter: 20,
	LogisticRegression_9cdfd2648582-predictionCol: prediction,
	LogisticRegression_9cdfd2648582-probabilityCol: probability,
	LogisticRegression_9cdfd2648582-rawPredictionCol: rawPrediction,
	LogisticRegression_9cdfd2648582-regParam: 0.01,
	LogisticRegression_9cdfd2648582-standardization: true,
	LogisticRegression_9cdfd2648582-threshold: 0.5,
	LogisticRegression_9cdfd2648582-tol: 1.0E-6
}
Test Accuracy: 0.9789473684210527


Clustering – Iris Dataset

In [None]:
from pyspark.ml.clustering import KMeans

# Iris measurements as a pandas frame, with the species index kept as "label"
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
df_pd = X.assign(label=iris.target)
df = spark.createDataFrame(df_pd)

# Build the feature vector from the measurement columns, then trim the frame
assembler = VectorAssembler(
    inputCols=iris.feature_names,
    outputCol="features",
)
assembled = assembler.transform(df)
df = assembled.select("features", "label")

# Hold out 20% of the rows (unusual for clustering, but gives a set to score on)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Base estimator; k is overridden by the grid below
kmeans = KMeans(featuresCol="features", k=3, seed=42)

# Candidate cluster counts to compare
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(kmeans.k, [2, 3, 4, 5])
paramGrid = grid_builder.build()

# Silhouette score is the selection metric
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

cv = CrossValidator(
    estimator=kmeans,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

# Select k on the training split, then cluster the held-out rows
cvModel = cv.fit(train)
predictions = cvModel.transform(test)

# Report the chosen k and the silhouette on the held-out rows
silhouette = evaluator.evaluate(predictions)
print("Best k:", cvModel.bestModel.summary.k)
print("Silhouette Score:", silhouette)


Best k: 2
Silhouette Score: 0.8885470258075133
