# 调试PySpark连接问题

此笔记本用于调试PySpark连接问题，逐步检查连接过程中的各个环节。

In [None]:
# Test 1: check whether PySpark can be imported.
# The success message lives in the `else` branch so that the `try` body
# contains only the import being probed.
try:
    import pyspark
except ImportError as import_error:
    print(f"❌ 无法导入PySpark: {import_error}")
else:
    print(f"✅ PySpark版本: {pyspark.__version__}")

In [None]:
# Test 2: check whether py4j can be imported and report its version.
# The version string is read from the `py4j.version` module instead of
# `py4j.__version__`: the package root is not guaranteed to re-export the
# attribute, and a missing attribute would raise AttributeError, which the
# ImportError handler below would not catch. Importing it this way turns any
# such failure into an ImportError that is reported like a missing package.
try:
    import py4j
    from py4j.version import __version__ as py4j_version
except ImportError as e:
    print(f"❌ 无法导入py4j: {e}")
else:
    print(f"✅ py4j版本: {py4j_version}")

In [None]:
# Test 3: create a basic SparkContext against the standalone master,
# print its version, and shut it down again.
try:
    from pyspark import SparkContext, SparkConf

    print("正在创建SparkConf...")
    conf = SparkConf().setAppName("DebugConnection").setMaster("spark://spark-master:7077")
    print("SparkConf创建成功")

    print("正在创建SparkContext...")
    sc = SparkContext(conf=conf)
    try:
        print("✅ SparkContext创建成功")
        print(f"Spark版本: {sc.version}")
    finally:
        # Stop the context even if using it failed; otherwise it stays
        # alive after the cell finishes and is never released.
        print("正在停止SparkContext...")
        sc.stop()
        print("✅ SparkContext已停止")

# Deliberately broad: this is a diagnostic cell whose job is to report
# whatever went wrong (import, configuration, connection, or shutdown).
except Exception as e:
    print(f"❌ 创建或使用SparkContext时出错: {e}")

In [None]:
# 测试4: 创建SparkSession（带超时）
import threading
import time
from pyspark.sql import SparkSession

# Worker that builds the SparkSession; run on a separate thread by the
# driver code so that the wait can be bounded.
def create_spark_session(result_dict):
    """Create a SparkSession and report the outcome through ``result_dict``.

    Nothing is returned and no exception escapes; the caller inspects the
    dictionary once the call (or the thread running it) has finished.

    Args:
        result_dict: Mutable dict filled in place. On success ``'status'`` is
            set to ``'success'`` and ``'spark'`` to the created session. On
            any exception ``'status'`` is set to ``'error'`` and ``'error'``
            to the caught exception object.
    """
    try:
        print("正在创建SparkSession...")
        spark = SparkSession.builder \
            .appName("DebugConnectionWithTimeout") \
            .master("spark://spark-master:7077") \
            .config("spark.sql.adaptive.enabled", "true") \
            .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
            .config("spark.network.timeout", "120s") \
            .getOrCreate()

        result_dict['status'] = 'success'
        result_dict['spark'] = spark
        print("✅ SparkSession创建成功")
        print(f"Spark版本: {spark.version}")
        print(f"应用ID: {spark.sparkContext.applicationId}")

    # Deliberately broad: every failure is handed back to the caller via
    # result_dict instead of being lost inside the worker thread.
    except Exception as e:
        result_dict['status'] = 'error'
        result_dict['error'] = e
        print(f"❌ 创建SparkSession时出错: {e}")

# Dict through which the worker thread reports its outcome
result = {}

# Create the SparkSession on a separate thread so the wait can be bounded.
# daemon=True keeps a hung connection attempt from blocking interpreter exit
# after the timeout has already been reported.
thread = threading.Thread(target=create_spark_session, args=(result,), daemon=True)
thread.start()

# Wait at most 60 seconds
timeout = 60
start_time = time.time()

while thread.is_alive():
    elapsed = time.time() - start_time
    if elapsed >= timeout:
        break
    print(f"等待SparkSession创建... ({int(elapsed)}s)")
    # join() returns as soon as the worker finishes (a fixed sleep would keep
    # waiting), and capping it at the remaining time avoids overshooting
    # the timeout.
    thread.join(min(5, timeout - elapsed))

if thread.is_alive():
    print("❌ 创建SparkSession超时")
elif result.get('status') == 'success':
    spark = result['spark']
    try:
        # Build and display a small DataFrame to exercise the session
        print("\n创建测试DataFrame...")
        data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
        columns = ["Name", "Age"]
        df = spark.createDataFrame(data, columns)

        print("DataFrame内容:")
        df.show()
    finally:
        # Stop the session even if the DataFrame step failed, so it is
        # not left running after the cell finishes.
        print("正在停止SparkSession...")
        spark.stop()
        print("✅ SparkSession已停止")
elif result.get('status') == 'error':
    print(f"❌ 创建SparkSession时出错: {result['error']}")