# 测试PySpark和py4j

此笔记本用于验证PySpark和py4j是否在我们的自定义镜像中正常工作。

In [1]:
# Test 1: verify that py4j can be imported
try:
    import py4j
except ImportError as exc:
    # Import failed: report the reason instead of raising.
    print(f"❌ 无法导入py4j: {exc}")
else:
    # Reached only when the import succeeded.
    print(f"✅ py4j版本: {py4j.__version__}")

✅ py4j版本: 0.10.9.7


In [2]:
# Test 2: verify that PySpark can be imported
try:
    import pyspark
except ImportError as exc:
    # Import failed: report the reason instead of raising.
    print(f"❌ 无法导入PySpark: {exc}")
else:
    # Reached only when the import succeeded.
    print(f"✅ PySpark版本: {pyspark.__version__}")

✅ PySpark版本: 3.5.0


In [None]:
# 测试3: 创建SparkSession并运行简单任务
import threading
import time
from pyspark.sql import SparkSession

# Worker that builds the SparkSession and records the outcome.
def create_spark_session_with_timeout(result_dict):
    """Create a SparkSession and report the outcome through ``result_dict``.

    Despite its name, this function does not enforce a timeout itself; any
    time limit has to be applied by the caller (for example by running it in
    a thread and bounding how long to wait for that thread).

    Args:
        result_dict: Mutable mapping that is filled in place. On success it
            receives ``'status' = 'success'`` and ``'spark'`` (the session).
            On failure it receives ``'status' = 'error'`` and ``'error'``
            (the exception converted to a string).

    Returns:
        None. Exceptions are caught and reported via ``result_dict`` rather
        than propagated.
    """
    try:
        print("正在创建SparkSession...")
        # Each configuration key is set exactly once.
        spark = (
            SparkSession.builder
            .appName("PySparkPy4jTest")
            .master("spark://spark-master:7077")
            .config("spark.sql.adaptive.enabled", "true")
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
            .config("spark.network.timeout", "120s")
            .getOrCreate()
        )

        result_dict['status'] = 'success'
        result_dict['spark'] = spark
        print("✅ SparkSession创建成功")
        print(f"Spark版本: {spark.version}")
        print(f"应用ID: {spark.sparkContext.applicationId}")

    except Exception as e:
        # Record the failure so the caller can inspect it after the worker ends.
        result_dict['status'] = 'error'
        result_dict['error'] = str(e)
        print(f"❌ 创建SparkSession时出错: {e}")

# Shared dictionary the worker thread uses to report its outcome.
result = {}

# Build the SparkSession on a separate thread so the wait can be bounded.
print("启动SparkSession创建线程...")
thread = threading.Thread(target=create_spark_session_with_timeout, args=(result,))
thread.start()

# Wait at most 60 seconds, reporting progress roughly every 5 seconds.
timeout = 60
# A monotonic clock keeps the elapsed-time check immune to wall-clock changes.
start_time = time.monotonic()

while thread.is_alive() and (time.monotonic() - start_time) < timeout:
    elapsed = int(time.monotonic() - start_time)
    print(f"等待SparkSession创建... ({elapsed}s)")
    # join() returns as soon as the worker finishes, so no time is wasted
    # sleeping after the session is already available.
    thread.join(timeout=5)

if thread.is_alive():
    # The worker is not cancelled here; it keeps running in the background.
    print("❌ 创建SparkSession超时")
elif result.get('status') == 'success':
    # Session is ready: run a small job against it.
    spark = result['spark']
    try:
        # Build a small DataFrame.
        print("\n创建测试DataFrame...")
        data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
        columns = ["Name", "Age"]
        df = spark.createDataFrame(data, columns)

        print("DataFrame内容:")
        df.show()

        # Run a simple aggregation.
        print("\n计算平均年龄...")
        avg_age = df.agg({"Age": "avg"}).collect()[0][0]
        print(f"平均年龄: {avg_age}")
    finally:
        # Always stop the session, even if the job above raised, so the
        # application does not stay registered.
        print("\n正在停止SparkSession...")
        spark.stop()
        print("✅ SparkSession已停止")
elif result.get('status') == 'error':
    print(f"❌ 创建SparkSession时出错: {result['error']}")

如果以上所有测试都成功执行并显示正确的输出，说明我们的自定义镜像已经正确配置，包含了所需的py4j模块，并且PySpark也能正常工作。