In [9]:
# Notebook cell: register_hive_table (paste & run)
from pyspark.sql import SparkSession
import traceback

# Stop any Spark session left over from an earlier run of this cell, to avoid
# "stopped SparkContext" issues when the session is rebuilt below.
try:
    spark.stop()
except NameError:
    # First run in this kernel: `spark` has not been defined yet, nothing to stop.
    pass
except Exception as stop_err:
    # Best-effort cleanup: report the problem instead of hiding it, then carry on
    # so the cell can still try to build a new session.
    print(f"Warning: could not stop the existing Spark session: {stop_err}")

# Build the SparkSession used by the rest of this cell. The memory/core figures
# are a modest bump over the defaults — tune as needed.
spark = (
    SparkSession.builder
    .appName("register_flights_parquet")
    .master("spark://spark-master:7077")
    # Driver and per-executor resources.
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "2")
    # Hive catalog support, plus where the warehouse and the metastore live.
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "hdfs://namenode:8020/user/hive/warehouse")
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083")
    .getOrCreate()
)

# Source Parquet directory and the name of the table to register over it.
parquet_path = "hdfs://namenode:8020/data/parquet/flights_2006"
table_name = "flights_2006_staged"

# 1) Smoke test: read at most 1000 rows to confirm the path is readable before
#    doing anything against the metastore.
try:
    print("Reading small sample from parquet (fast):")
    source_df = spark.read.parquet(parquet_path)
    sample = source_df.limit(1000)
    sample.show(n=5)
    sample_rows = sample.count()
    print("Sample count:", sample_rows)
except Exception:
    # Print a hint plus the full traceback, then abort the cell: nothing below
    # can work if the data cannot be read.
    print("Failed to read parquet sample — check HDFS path/permissions and that Parquet files exist.")
    traceback.print_exc()
    raise

# 2) Infer the schema from the Parquet files and build the CREATE EXTERNAL TABLE DDL
df = spark.read.parquet(parquet_path)
print("Inferred schema:")
df.printSchema()

# Upper-case spellings for the simple types. Parameterised or nested types
# (e.g. "decimal(10,2)", "array<int>") never match a key here and take the
# fallback in the loop below.
type_map = {
    "int": "INT",
    "bigint": "BIGINT",
    "string": "STRING",
    "double": "DOUBLE",
    "float": "FLOAT",
    "boolean": "BOOLEAN",
    "tinyint": "TINYINT",
    "smallint": "SMALLINT",
    "decimal": "DECIMAL"
}

# NOTE(review): the sample output lists Year and Month as the last columns, which
# is consistent with them being partition directories rather than columns stored
# in the files. If so they belong in a PARTITIONED BY clause instead of the
# column list below — TODO confirm against the on-disk layout.
cols = []
for name, dtype in df.dtypes:
    # Fall back to the type string Spark itself reports rather than forcing STRING:
    # declaring a timestamp/date/decimal/binary column as STRING makes the table
    # schema disagree with the Parquet files it points at.
    hive_type = type_map.get(dtype.lower(), dtype)
    # A backtick inside a quoted identifier is escaped by doubling it.
    escaped_name = name.replace("`", "``")
    cols.append(f"`{escaped_name}` {hive_type}")
cols_ddl = ",\n  ".join(cols)

# The statement only registers metadata over the existing files at parquet_path.
create_stmt = f"""
CREATE EXTERNAL TABLE {table_name} (
  {cols_ddl}
)
STORED AS PARQUET
LOCATION '{parquet_path}'
"""

# 3) Run DROP and CREATE one statement at a time (Spark expects single statements)
try:
    # Each entry pairs the progress message with the statement it announces.
    ddl_steps = (
        (f"Dropping table if exists: {table_name}", f"DROP TABLE IF EXISTS {table_name}"),
        ("Creating EXTERNAL table...", create_stmt),
    )
    for banner, statement in ddl_steps:
        print(banner)
        spark.sql(statement)
    print(f"Table {table_name} created (metadata only).")
except Exception:
    # Show the full traceback, then abort: later steps need the table to exist.
    print("CREATE TABLE failed — traceback:")
    traceback.print_exc()
    raise

# 4) (Optional) Register on-disk partitions (e.g. Year/Month directories) with the metastore.
# This step is best-effort: any failure is reported and the cell carries on.
# NOTE(review): MSCK REPAIR TABLE is expected to be rejected for a table created
# without PARTITIONED BY, in which case this step is effectively a no-op — confirm
# against the DDL used above.
try:
    print("Repairing partitions (if any)...")
    spark.sql(f"MSCK REPAIR TABLE {table_name}")
except Exception as repair_err:
    print("MSCK REPAIR TABLE failed or not needed; continuing.")
    # Surface the reason so a genuine metastore/permission problem is not
    # mistaken for the harmless "table is not partitioned" case.
    print(f"  reason: {repair_err}")
    # continue without raising
# 5) Verification: a per-Year breakdown first (faster), then the full row COUNT (may take time)
try:
    # (heading, query) pairs, executed in order; each result is shown as it completes.
    checks = (
        (
            "Quick verification: counts by Year (fast):",
            f"SELECT Year, COUNT(*) AS cnt FROM {table_name} GROUP BY Year ORDER BY Year",
        ),
        (
            "Now running full COUNT (may take time)...",
            f"SELECT COUNT(*) AS cnt FROM {table_name}",
        ),
    )
    for heading, query in checks:
        print(heading)
        spark.sql(query).show()
except Exception:
    # Report the failure but do not re-raise: the table has already been created.
    print("Verification failed — full traceback:")
    traceback.print_exc()
    # If the COUNT is stuck, cancel from another cell:
    # spark.sparkContext.cancelAllJobs()

Reading small sample from parquet (fast):
+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+----+-----+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|Year|Month|
+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+----+-----+
|        13|        7|   1748|      1800|   1947|   

In [8]:
# Escape hatch for a stuck query: cancel the session's jobs, then stop the session.
# After this cell, the session has to be rebuilt before running anything else.
active_context = spark.sparkContext
active_context.cancelAllJobs()
spark.stop()