In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("StudentDatasetAnalysis")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Run the whole analysis under try/finally so the session is stopped even if
# loading the CSV or one of the aggregations raises.
try:
    # Load the dataset (first row is the header; column types are inferred)
    csv_path = "students.csv"  # Update the path if needed
    df_spark = spark.read.csv(csv_path, header=True, inferSchema=True)

    # Show dataset structure
    df_spark.printSchema()

    # Basic summary statistics. describe() is called without a column list, so
    # string columns are included too; their mean/stddev are reported as NULL.
    df_spark.describe().show()

    # Average GPA per department
    df_spark.groupBy("Department").agg(avg("GPA").alias("AvgGPA")).show()

    # Count students per graduation year, ordered by year
    (
        df_spark.groupBy("GraduationYear")
        .agg(count("*").alias("StudentCount"))
        .orderBy("GraduationYear")
        .show()
    )
finally:
    # Stop Spark session
    spark.stop()

root
 |-- StudentID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Email: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- GPA: double (nullable = true)
 |-- GraduationYear: integer (nullable = true)

+-------+------------------+--------------+------------------+-----------------+----------+------------------+------------------+
|summary|         StudentID|          Name|               Age|            Email|Department|               GPA|    GraduationYear|
+-------+------------------+--------------+------------------+-----------------+----------+------------------+------------------+
|  count|               200|           200|               200|              200|       200|               200|               200|
|   mean|           5726.46|          NULL|             21.58|             NULL|      NULL| 3.027849999999999|           2027.18|
| stddev|2507.4076220903103|          NULL|2.2670016502887815|       