In [None]:
# Kernel-restart helper cell: builds a Javascript display object whose script calls
# `Jupyter.notebook.kernel.restart()`.
# NOTE(review): the `Jupyter` JS global is presumably only defined by the classic
# Notebook front end (not JupyterLab) — confirm before relying on this cell.
# NOTE(review): if this cell is executed as part of "Run All", the restart it requests
# would presumably interrupt the remaining cells — confirm it is meant to be run manually.
from IPython.display import Javascript
# Left as the bare last expression so the object is rendered as the cell's output.
Javascript("Jupyter.notebook.kernel.restart()")

<IPython.core.display.Javascript object>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col, mean, stddev, corr, isnan
import traceback
from py4j.protocol import Py4JJavaError
import pandas as pd
import pyspark.sql.types as T

from scripts.load_and_clean import load_and_clean
from scripts.select_features import select_features


## 1. Load `clean_df` via scripts

In [2]:
# Obtain the SparkSession and the cleaned DataFrame from the project loader script.
# NOTE(review): presumably `load_and_clean` reuses the kernel's existing SparkSession — confirm.
spark, clean_df = load_and_clean()

# Sanity checks: row count, schema, and an untruncated peek at the first five rows.
loaded_row_count = clean_df.count()
print("Loaded clean_df rows:", loaded_row_count)
clean_df.printSchema()
clean_df.show(n=5, truncate=False)

Parquet path not found: hdfs://namenode:8020/data/parquet/flights_2006_cleaned
Parquet path not found: hdfs://namenode:8020/data/parquet/flights_2006
Parquet path not found: hdfs://namenode:8020/data/parquet/flights_2006_features
Loading raw CSV fallback: hdfs://namenode:8020/data/flights/2006.csv
   Year  Month  DayofMonth  DayOfWeek DepTime  CRSDepTime ArrTime  CRSArrTime  \
0  2006      1          11          3     743         745    1024        1018   
1  2006      1          11          3    1053        1053    1313        1318   
2  2006      1          11          3    1915        1915    2110        2133   
3  2006      1          11          3    1753        1755    1925        1933   
4  2006      1          11          3     824         832    1015        1015   

  UniqueCarrier  FlightNum  ... TaxiIn TaxiOut Cancelled CancellationCode  \
0            US        343  ...     45      13         0             None   
1            US        613  ...     27      19         0    

In [3]:
# Diagnostics: confirm that `clean_df` is a Spark DataFrame exposing a callable
# `count` attribute, then report its row count. Nothing is cleaned or modified here.
print(type(clean_df))
print("has count attr:", hasattr(clean_df, 'count'))
print("count callable:", callable(getattr(clean_df, 'count', None)))
print("rows:", clean_df.count())

<class 'pyspark.sql.dataframe.DataFrame'>
has count attr: True
count callable: True
<class 'pyspark.sql.dataframe.DataFrame'>
rows: 7003802


In [4]:
# Re-check the DataFrame's type, size and schema before feature selection.
print("type:", type(clean_df))
print("rows:", clean_df.count())
clean_df.printSchema()

# Quantile summary (min, quartiles, max) of the ArrDelay column.
arr_delay_stats = clean_df.select('ArrDelay').summary('min', '25%', '50%', '75%', 'max')
arr_delay_stats.show()

# Untruncated look at the first five rows.
clean_df.show(n=5, truncate=False)

type: <class 'pyspark.sql.dataframe.DataFrame'>
rows: 7003802
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nu

## 2. Feature Selection: select relevant features.

In [5]:
# Columns we want in the feature set, named exactly as they appear in the schema.
keep_cols = [
    'Year', 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
    'DepDelay', 'Distance', 'Cancelled', 'TaxiIn', 'TaxiOut',
    'ArrDelay'
]

# Split the wish-list into columns that are present and columns that are not,
# so a missing column produces a warning instead of a failed select.
available_cols = set(clean_df.columns)
existing = [name for name in keep_cols if name in available_cols]
missing = [name for name in keep_cols if name not in available_cols]
if missing:
    print("Warning — these requested columns are missing and will be skipped:", missing)

# Target Spark type per column. Calendar fields, the Cancelled flag and the hhmm
# time fields become integers (a string that cannot be parsed casts to null);
# delays, taxi times and distance become doubles.
int_type = T.IntegerType()
double_type = T.DoubleType()
target_types = {
    **dict.fromkeys(('Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Cancelled'), int_type),
    **dict.fromkeys(('DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime'), int_type),
    **dict.fromkeys(('DepDelay', 'ArrDelay', 'TaxiIn', 'TaxiOut', 'Distance'), double_type),
}

# One select expression per existing column; columns without a target type pass through as-is.
exprs = [
    F.col(name).cast(target_types[name]).alias(name) if name in target_types else F.col(name)
    for name in existing
]

# Project the typed columns and mark the result for caching.
features_df = clean_df.select(*exprs).persist()

# Materialize the cache and show what we ended up with.
print("features_df type:", type(features_df))
print("rows:", features_df.count())
features_df.printSchema()
features_df.show(n=5, truncate=False)

# Quick missingness report.
# A value counts as missing when it is NULL, NaN (float/double columns only) or an
# empty string (string columns only). The check is chosen from each column's declared
# type rather than by trying an expression and catching the failure.
# All per-column counts are computed in ONE aggregation over features_df instead of
# one full scan per column.
# The loop variable is deliberately not called `col`: that name is the function
# imported from pyspark.sql.functions and must stay usable in later cells.
total = features_df.count()

missing_count_exprs = []
for field in features_df.schema.fields:
    field_col = F.col(field.name)
    is_missing = field_col.isNull()
    if isinstance(field.dataType, (T.FloatType, T.DoubleType)):
        is_missing = is_missing | F.isnan(field_col)
    elif isinstance(field.dataType, T.StringType):
        is_missing = is_missing | (field_col == '')
    # count() only counts non-null values, so this yields the number of rows where
    # the condition holds (and 0, not NULL, when there are no such rows).
    missing_count_exprs.append(F.count(F.when(is_missing, F.lit(1))).alias(field.name))

# A global aggregation always returns exactly one row; skip it if there are no columns.
counts_row = features_df.agg(*missing_count_exprs).first() if missing_count_exprs else None

miss = []
for field_name in features_df.columns:
    nnull = int(counts_row[field_name])
    # Guard against an empty DataFrame so the percentage does not divide by zero.
    pct_missing = round(nnull / total * 100, 3) if total else 0.0
    miss.append((field_name, nnull, pct_missing))

miss_df = pd.DataFrame(miss, columns=['column', 'n_missing', 'pct_missing']).sort_values('pct_missing', ascending=False)
print("\nMissing values report:")
print(miss_df)

features_df type: <class 'pyspark.sql.dataframe.DataFrame'>
rows: 7003802
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: integer (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: integer (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- TaxiIn: double (nullable = true)
 |-- TaxiOut: double (nullable = true)
 |-- ArrDelay: double (nullable = true)

+----+-----+----------+---------+-------+----------+-------+----------+--------+--------+---------+------+-------+--------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|DepDelay|Distance|Cancelled|TaxiIn|TaxiOut|ArrDelay|
+----+-----+----------+---------+-------+----------+-------+----------+--------+--------+---------+------+-

## 3. Write `features_df` to HDFS as partitioned Parquet

In [8]:
# HDFS destination for the features Parquet dataset.
# NOTE(review): the load cell's output shows this same path being probed as an input
# candidate ("Parquet path not found: .../flights_2006_features"), so on a later run
# `clean_df` may presumably be read from the very path overwritten here — confirm
# against scripts/load_and_clean.
out_path = 'hdfs://namenode:8020/data/parquet/flights_2006_features'

try:
    # Overwrite any previous output; one directory per Year/Month; snappy-compressed files.
    features_writer = (
        features_df.write
        .mode('overwrite')
        .partitionBy('Year', 'Month')
        .option('compression', 'snappy')
    )
    features_writer.parquet(out_path)
    print('Wrote features_df to', out_path)
except Exception as write_error:
    print('Failed to write features to HDFS:', write_error)
    # Propagate the error so the notebook still shows the full traceback.
    raise

Wrote features_df to hdfs://namenode:8020/data/parquet/flights_2006_features


## 4. Register a Hive external table that points to the Parquet files

In [11]:
# Register a Hive/Metastore table over the Parquet data already written to `out_path`.
# 1) Preferred, non-destructive route: create the table on top of the existing files
#    (no data rewrite), then run MSCK REPAIR TABLE so the partition directories
#    written by partitionBy('Year','Month') are recorded in the metastore.
try:
    spark.sql(
        f"CREATE TABLE IF NOT EXISTS default.flights_2006_features USING PARQUET OPTIONS (path '{out_path}')"
    )
    spark.sql('MSCK REPAIR TABLE default.flights_2006_features')
    print('Registered table default.flights_2006_features using PARQUET OPTIONS (no data rewrite)')
except Exception as e1:
    print('Primary registration (USING PARQUET OPTIONS) failed:', e1)
    print('Falling back to saveAsTable from features_df, which rewrites the files at out_path and registers the table.')
    try:
        # 2) Fallback: write from `features_df` rather than from spark.read.parquet(out_path).
        #    A DataFrame read lazily from out_path must not be used as the source of an
        #    'overwrite' to that same path: the job would be replacing the files it is
        #    still supposed to read.
        #    NOTE(review): this assumes features_df's own lineage does not read from
        #    out_path — confirm against scripts/load_and_clean.
        # The explicit 'path' option makes the metastore entry point at out_path.
        features_df.write.mode('overwrite').option('path', out_path).partitionBy('Year','Month').format('parquet').saveAsTable('default.flights_2006_features')
        print('Wrote and registered default.flights_2006_features via saveAsTable')
    except Exception as e2:
        # Deliberately best-effort: the Parquet files remain readable without a table.
        print('Fallback registration also failed:', e2)
        print('You can still read the Parquet directly in Notebook 05 with spark.read.parquet(out_path)')

Registered table default.flights_2006_features using PARQUET OPTIONS (no data rewrite)
