# Set up spark environment and spark session

In [1]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet
# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("RDD-Statistics")
      .getOrCreate()
)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-21-jre-headless:amd64.
(Reading database ... 126109 files and directories currently installed.)
Preparing to unpack .../openjdk-21-jre-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-21-jdk-headless:amd64.
Preparing to unpack .../openjdk-21-jdk-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jdk-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Setting up openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/jpackage to

In [2]:
from pyspark.sql.types import StructType, StructField,StringType,IntegerType
from pyspark.sql import SparkSession
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)



In [3]:
# Create a list of tuple
data = [('ann', 'spring', 'math', 98),
        ('ann', 'fall', 'bio', 50),
        ('bob', 'spring', 'stats', 100),
        ('bob', 'fall', 'stats', 92),
        ('bob', 'summer', 'stats', 100),
        ('charles', 'spring', 'stats', 88),
        ('charles', 'fall', 'bio', 100)]
# Create a RDD from the list
rdd = sc.parallelize(data)
# Create a PySpark DataFrame from RDD
df = spark.createDataFrame(rdd, ['name', 'semester', 'subject', 'score'])
df.show()
df.describe()
df.select("score").describe().show()
# Filtering rows
df.filter(df['score'] > 90).show()
# Mutating values
df.select(df['name'], df['semester'], df['subject'], df['score'],
(df['score'] - 10).alias('adj_score')).show()
df.withColumn('sqrt_socre', df['score']/2).show()
# Sorting
df.sort(df['score']).show()
df.sort(df['score'].desc()).show()
# Join
data = [('ann', 'female', 23),
        ('bob', 'male', 19),
        ('charles', 'male', 22),
        ('david', 'male', 23)]
# Define PySpark DataFrame schema: the entire schema is stored as a StructType and individual columns are stored as StructFields
schema = StructType([StructField('name', StringType(), True),
                    StructField('sex', StringType(), True),
                    StructField('age', IntegerType(), True)])
df_meta = spark.createDataFrame(data, schema)
df_meta.printSchema()
df.join(df_meta, on='name', how='inner').show()


+-------+--------+-------+-----+
|   name|semester|subject|score|
+-------+--------+-------+-----+
|    ann|  spring|   math|   98|
|    ann|    fall|    bio|   50|
|    bob|  spring|  stats|  100|
|    bob|    fall|  stats|   92|
|    bob|  summer|  stats|  100|
|charles|  spring|  stats|   88|
|charles|    fall|    bio|  100|
+-------+--------+-------+-----+

+-------+------------------+
|summary|             score|
+-------+------------------+
|  count|                 7|
|   mean| 89.71428571428571|
| stddev|18.126539343499314|
|    min|                50|
|    max|               100|
+-------+------------------+

+-------+--------+-------+-----+
|   name|semester|subject|score|
+-------+--------+-------+-----+
|    ann|  spring|   math|   98|
|    bob|  spring|  stats|  100|
|    bob|    fall|  stats|   92|
|    bob|  summer|  stats|  100|
|charles|    fall|    bio|  100|
+-------+--------+-------+-----+

+-------+--------+-------+-----+---------+
|   name|semester|subject|score|a

In [4]:
# Stop the Spark session
spark.stop()