In [2]:
import os

# Tell PySpark where the local Spark and Java installations live.
# The paths below are machine-specific fallbacks (they look like Homebrew
# locations on macOS -- adjust them for your machine). `setdefault` is used so
# that a SPARK_HOME / JAVA_HOME already exported in the environment is kept
# instead of being silently overwritten.
os.environ.setdefault('SPARK_HOME', '/usr/local/Cellar/apache-spark/3.5.1/libexec')
os.environ.setdefault('JAVA_HOME', '/usr/local/opt/openjdk/libexec/openjdk.jdk/Contents/Home')

## Basics of Spark

Apache Spark is an open-source unified analytics engine for large-scale data processing. It provides an interface for programming entire clusters with implicit data parallelism and fault tolerance.

### Key Features of Spark:
- **Speed**: Spark processes data in memory, which makes it faster than traditional disk-based processing engines.
- **Ease of Use**: High-level APIs in Java, Scala, Python, and R, and a rich set of libraries including SQL, MLlib (for machine learning), GraphX (for graph processing), and Spark Streaming.
- **Generalized**: Combine SQL, streaming, and complex analytics.
- **Fault Tolerance**: Spark recovers from worker failures automatically by recomputing lost data from its lineage.

## Spark Architecture

The architecture of Spark comprises the following components:

- **Driver**: The process that runs the `main()` function of the application and creates the `SparkContext`.
- **Cluster Manager**: The external service for acquiring resources on the cluster (e.g., YARN, Mesos, Kubernetes, Standalone).
- **Workers**: The nodes that execute the tasks.
- **Executors**: Run on worker nodes, executing the tasks and keeping data in memory.
- **Tasks**: Units of work sent to executors by the driver.

## Spark Execution Flow:

1. **Job Submission**: User submits a job.
2. **Task Scheduling**: The driver program splits the job into stages and tasks.
3. **Task Execution**: Tasks are sent to executors for execution.
4. **Result Collection**: Results are collected and returned to the driver.


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build the SparkSession (or reuse one that is already running)
spark = (
    SparkSession.builder
    .appName("Sample Spark Program")
    .getOrCreate()
)

# Sample rows, given here as a list of (name, age) tuples.
# The same data could also be supplied as:
#   - a list of dictionaries, e.g. [{"Name": "Alice", "Age": 22}]
#   - a list of Row objects,  e.g. [Row(Name="Bob", Age=29)]
#   - a list of namedtuples,  e.g. [Person(Name="Bob", Age=29)]
#   - a list of objects,      e.g. [Person("Bob", 29)]
data = [("John", 28), ("Doe", 35), ("Alice", 22), ("Bob", 29)]

# Column names, matched to the tuple fields by position
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

# Transformation: derive a new column from the existing "Age" column
df_transformed = df.withColumn(
    "AgeAfter5Years",
    col("Age") + 5,
)

print("Transformed DataFrame:")
df_transformed.show()

# Action: count the rows of the transformed DataFrame
count = df_transformed.count()
print(f"Number of rows in the DataFrame: {count}")

# The session is deliberately left running because the cells below keep
# using `spark`; call spark.stop() only once the whole notebook is finished.


24/07/31 11:47:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Original DataFrame:


                                                                                

+-----+---+
| Name|Age|
+-----+---+
| John| 28|
|  Doe| 35|
|Alice| 22|
|  Bob| 29|
+-----+---+

Transformed DataFrame:
+-----+---+--------------+
| Name|Age|AgeAfter5Years|
+-----+---+--------------+
| John| 28|            33|
|  Doe| 35|            40|
|Alice| 22|            27|
|  Bob| 29|            34|
+-----+---+--------------+

Number of rows in the DataFrame: 4


In [8]:
# Register the transformed DataFrame as a temporary SQL view named "people".
# createOrReplaceTempView replaces an existing view of the same name, so this
# cell can be re-run safely.
df_transformed.createOrReplaceTempView("people")

# List the tables/views visible to this session. This runs AFTER the
# registration so that a fresh "Restart & Run All" lists the "people" view
# instead of printing an empty table.
spark.sql("SHOW TABLES").show()

# Define the SQL query; its rows are displayed in the next cell
result = spark.sql("SELECT * FROM people WHERE Age > 33")

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   people|       true|
+---------+---------+-----------+



In [9]:
# Display the rows of `result`, the SQL query over the "people" view (Age > 33)
result.show()

+----+---+--------------+
|Name|Age|AgeAfter5Years|
+----+---+--------------+
| Doe| 35|            40|
+----+---+--------------+



## Machine Learning

### Using MLlib for Machine Learning Tasks

In [21]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Obtain the Spark session (getOrCreate hands back the running one if it exists)
spark = (
    SparkSession.builder
    .appName("LogisticRegressionExample")
    .getOrCreate()
)

# Toy data set: one numeric feature and one label per row
columns = ["feature1", "label"]
data = [
    (1.0, 1.0),
    (2.0, 0.0),
    (3.0, 1.0),
    (4.0, 0.0),
]

# VectorAssembler packs the columns listed in `inputCols` into a single vector
# column whose name is given by `outputCol`. Only "feature1" is used here; with
# two feature columns (e.g. "feature1" and "feature2") both would be listed in
# `inputCols` to combine them into one feature vector.
assembler = VectorAssembler(inputCols=["feature1"], outputCol="features")
df_ml = assembler.transform(spark.createDataFrame(data, schema=columns))

# Fit a logistic regression model on the assembled features
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(df_ml)

# Predict on the same rows that were used for fitting, then display the result
predictions = lr_model.transform(df_ml)
predictions.select("feature1", "label", "prediction").show()

# Score the predictions against the labels using the accuracy metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")



24/07/31 11:56:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+--------+-----+----------+
|feature1|label|prediction|
+--------+-----+----------+
|     1.0|  1.0|       1.0|
|     2.0|  0.0|       1.0|
|     3.0|  1.0|       0.0|
|     4.0|  0.0|       0.0|
+--------+-----+----------+

Accuracy: 0.5000


In [22]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# Sample data: one categorical feature and one label per row
data = [("a", 1.0), ("b", 0.0)]
columns = ["category", "label"]

df_ml = spark.createDataFrame(data, columns)

# Stage 1: StringIndexer turns the string column "category" into the numeric
# column "categoryIndex"
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Stage 2: VectorAssembler wraps "categoryIndex" into the "features" vector column
assembler = VectorAssembler(inputCols=["categoryIndex"], outputCol="features")

# Stage 3: RandomForestClassifier.
# A fixed seed is set because random forests are stochastic; without it the
# rawPrediction / probability columns can differ from one run to the next.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

# Pipeline: chains the three stages so they are fitted and applied together
pipeline = Pipeline(stages=[indexer, assembler, rf])

# Train model
model = pipeline.fit(df_ml)

# Make predictions on the same rows that were used for training
predictions = model.transform(df_ml)
predictions.show()

# Evaluate the model with the accuracy metric.
# NOTE: MulticlassClassificationEvaluator is imported by the logistic
# regression cell above, so that cell must have been run first.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")



24/07/31 11:56:38 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 2 (= number of training instances)


+--------+-----+-------------+--------+-------------+--------------------+----------+
|category|label|categoryIndex|features|rawPrediction|         probability|prediction|
+--------+-----+-------------+--------+-------------+--------------------+----------+
|       a|  1.0|          0.0|   [0.0]|   [6.0,11.0]|[0.35294117647058...|       1.0|
|       b|  0.0|          1.0|   [1.0]|   [14.0,3.0]|[0.82352941176470...|       0.0|
+--------+-----+-------------+--------+-------------+--------------------+----------+

Accuracy: 1.0000
