# HDFS and Hive Integration Test

This notebook tests the integration between Hadoop (HDFS), Spark, and Hive in our data processing ecosystem. The Hive setup was recently fixed to use Derby as the metastore database.

# Spark-Hive Integration Test

This notebook tests the integration between Apache Spark and Apache Hive in our data processing ecosystem.

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled session: the warehouse directory is given as
# an HDFS path and the metastore is addressed through its Thrift URI.
spark = (
    SparkSession.builder
    .appName("HiveTest")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:8020/user/hive/warehouse")
    .config("hive.metastore.uris", "thrift://hive-server:9083")
    .enableHiveSupport()
    .getOrCreate()
)

# Report the Spark version of the session
print(f"Spark version: {spark.version}")

In [None]:
# List the databases visible to this session
print("Databases in Hive:")
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

In [None]:
# SQL for the scratch table used by this notebook
create_table_sql = "CREATE TABLE IF NOT EXISTS spark_jupyter_test (id INT, name STRING)"
insert_row_sql = "INSERT INTO spark_jupyter_test VALUES (1, 'test from jupyter')"

# Make sure the table exists, then append one sample row
# (each run of this cell adds another row)
spark.sql(create_table_sql)
spark.sql(insert_row_sql)

In [None]:
# Read back everything stored in the scratch table
print("Data in spark_jupyter_test:")
jupyter_rows_df = spark.sql("SELECT * FROM spark_jupyter_test")
jupyter_rows_df.show()

In [None]:
# test_table is expected to have been created earlier with beeline; if the
# query fails, the error is printed instead of aborting the notebook.
print("Data in test_table:")
try:
    beeline_rows_df = spark.sql("SELECT * FROM test_table")
    beeline_rows_df.show()
except Exception as e:
    print(f"Error: {e}")

In [None]:
# List the tables reported by SHOW TABLES
print("All tables:")
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

In [None]:
# Clean up: stop the active Spark session
spark.stop()

In [None]:
# Sanity-check plain Spark before testing HDFS and Hive
import pyspark

print(f"PySpark version: {pyspark.__version__}")

# Start with a basic session; Hive support is not requested here
spark = (
    pyspark.sql.SparkSession.builder
    .appName("TestSpark")
    .getOrCreate()
)

# Report the Spark version of the session
print(f"Spark version: {spark.version}")

In [None]:
# Check that Spark can read from HDFS
# The test file is expected to already exist at this HDFS location
hdfs_test_path = "hdfs://namenode:8020/hdfs_test.txt"

try:
    hdfs_file = spark.read.text(hdfs_test_path)
    print("HDFS test file content:")
    hdfs_file.show()
    print("✅ HDFS integration is working correctly!")
except Exception as e:
    # Report the failure without aborting the rest of the notebook
    print(f"❌ Error reading HDFS file: {e}")

In [None]:
# Replace the basic session with one that has Hive support enabled
spark.stop()

hive_warehouse_dir = "hdfs://namenode:8020/user/hive/warehouse"
spark = (
    pyspark.sql.SparkSession.builder
    .appName("HiveTest")
    .config("spark.sql.warehouse.dir", hive_warehouse_dir)
    .enableHiveSupport()
    .getOrCreate()
)

# Exercise the catalog: list databases, then tables. Any failure is reported
# and the notebook carries on.
try:
    print("Available databases:")
    available_databases_df = spark.sql("SHOW DATABASES")
    available_databases_df.show()

    print("\nAvailable tables:")
    available_tables_df = spark.sql("SHOW TABLES")
    available_tables_df.show()

    print("\n✅ Hive integration is working correctly!")
except Exception as e:
    print(f"❌ Error with Hive integration: {e}")

In [None]:
# Round-trip test: create a Hive table, append a row, read it back
create_jupyter_test_sql = "CREATE TABLE IF NOT EXISTS jupyter_test (id INT, name STRING)"
insert_jupyter_test_sql = "INSERT INTO jupyter_test VALUES (1, 'jupyter test')"
select_jupyter_test_sql = "SELECT * FROM jupyter_test"

try:
    # Ensure the table exists and add one sample row
    # (each run of this cell adds another row)
    spark.sql(create_jupyter_test_sql)
    spark.sql(insert_jupyter_test_sql)

    # Display the table contents
    print("Data in jupyter_test:")
    jupyter_test_df = spark.sql(select_jupyter_test_sql)
    jupyter_test_df.show()

    print("\n✅ Hive table creation and querying is working correctly!")
except Exception as e:
    # Report the failure without aborting the rest of the notebook
    print(f"❌ Error with Hive table operations: {e}")

In [None]:
# Clean up: stop the active Spark session, then signal that the run reached the end
spark.stop()
print("Test completed!")