In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Linear-regression walkthrough: load a CSV, assemble a feature vector,
# fit on an 80/20 split, and report RMSE / MSE / R2 / MAE on the held-out part.

# Fixed seed so the train/test split (and therefore every metric below)
# is reproducible across kernel restarts.
SEED = 42

# Human-readable label for each metric name understood by RegressionEvaluator.
# Insertion order is the order in which the metrics are printed.
METRIC_LABELS = {
    "rmse": "Root Mean Squared Error (RMSE)",
    "mse": "Mean Squared Error (MSE)",
    "r2": "R-squared (R2)",
    "mae": "Mean Absolute Error (MAE)",
}

# Initialize Spark session (getOrCreate() reuses an already-running session)
spark = SparkSession.builder.appName('RegressionExample').getOrCreate()

# Load data (example CSV file)
raw_data = spark.read.csv('your_data.csv', header=True, inferSchema=True)

# Select features and target variable
features = ['feature1', 'feature2', 'feature3']  # Replace with your feature names
target = 'target_column'  # Replace with your target column name

# Assemble features into a vector column.
# The raw frame keeps its own name so this step can be re-run safely:
# transforming a frame that already has a 'features' column would fail.
# NOTE: with the default handleInvalid='error', VectorAssembler raises when a
# feature value is null; clean the data first or pass handleInvalid='skip'.
assembler = VectorAssembler(inputCols=features, outputCol='features')
data = assembler.transform(raw_data)

# Split the data into training and test sets (seeded for reproducibility)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=SEED)

# Initialize and train the linear regression model
lr = LinearRegression(featuresCol='features', labelCol=target)
lr_model = lr.fit(train_data)

# Make predictions on test data.
# Cached because the frame is consumed by several actions below (one per
# metric plus show()); without it each action recomputes the whole pipeline.
predictions = lr_model.transform(test_data).cache()

# A single evaluator is enough: the metric is overridden per call through the
# optional param map accepted by evaluate().
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=target, metricName="rmse")

metrics = {}
for metric_name, label in METRIC_LABELS.items():
    metrics[metric_name] = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f'{label}: {metrics[metric_name]}')

# Keep the individual metric values available under their usual names
rmse = metrics["rmse"]
mse = metrics["mse"]
r2 = metrics["r2"]
mae = metrics["mae"]

# Show predictions
predictions.select('features', target, 'prediction').show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

# K-means walkthrough: load a CSV, assemble a feature vector, fit k clusters,
# and score the resulting assignment with the silhouette measure.

# Fixed seed so centroid initialisation, and therefore the cluster
# assignments and the silhouette score, are reproducible between runs.
SEED = 42

# Initialize Spark session.
# getOrCreate() returns the already-running session if one exists, so the
# app name given here only takes effect when this cell starts a new session.
spark = SparkSession.builder.appName('ClusteringExample').getOrCreate()

# Load data (example CSV file)
raw_data = spark.read.csv('your_data.csv', header=True, inferSchema=True)

# Select features for clustering
features = ['feature1', 'feature2', 'feature3']  # Replace with your feature names

# Assemble features into a vector column.
# The raw frame keeps its own name so this step can be re-run safely:
# transforming a frame that already has a 'features' column would fail.
assembler = VectorAssembler(inputCols=features, outputCol='features')
data = assembler.transform(raw_data)

# Initialize KMeans and set parameters
kmeans = KMeans(
    k=3,  # Set k as the number of clusters
    featuresCol='features',
    predictionCol='prediction',
    seed=SEED,
)

# Fit the model
kmeans_model = kmeans.fit(data)

# Make predictions (cluster index per row, on the same data used for fitting)
predictions = kmeans_model.transform(data)

# Initialize ClusteringEvaluator
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='features', metricName='silhouette')

# Calculate Silhouette Score
silhouette_score = evaluator.evaluate(predictions)
print(f'Silhouette Score: {silhouette_score}')

# Spark's ClusteringEvaluator only supports the silhouette metric; other
# measures such as the Davies-Bouldin Index (DBI) must be computed separately.
