In [3]:
# --- Install Java and Apache Spark ---
# Refresh the apt package index (-qq keeps the output minimal).
!apt-get update -qq
# Install OpenJDK 11; stdout is discarded, stderr is still shown.
!apt-get install -y openjdk-11-jdk > /dev/null
# Fetch the Spark 3.3.4 / Hadoop 3 tarball from the Apache archive (-q: quiet).
!wget -q https://archive.apache.org/dist/spark/spark-3.3.4/spark-3.3.4-bin-hadoop3.tgz -O spark-3.3.4-bin-hadoop3.tgz
# Unpack the tarball into the current working directory.
!tar xf spark-3.3.4-bin-hadoop3.tgz
# findspark is imported in a later cell before pyspark is imported.
!pip install -q findspark

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [4]:
# Remove old Spark & Java
!rm -rf spark-3.3.2-bin-hadoop3
!rm -rf spark.tgz

# Install Java (REQUIRED)
!apt-get update -qq
!apt-get install -y openjdk-11-jdk > /dev/null

# Download Spark from official Apache mirror (ALWAYS WORKING)
!wget https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz -O spark.tgz

# Verify downloaded Spark tarball
!echo "Checking spark.tgz size and integrity..."
!ls -lh spark.tgz
!gzip -t spark.tgz # This will test the integrity of the gzipped file

# Extract Spark
!tar -xzf spark.tgz

# Verify Spark directory
!ls -l /content/spark-3.3.2-bin-hadoop3

# Install findspark
!pip install -q findspark

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
--2025-12-06 18:29:34--  https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-12-06 18:29:34 ERROR 404: Not Found.

Checking spark.tgz size and integrity...
-rw-r--r-- 1 root root 0 Dec  6 18:29 spark.tgz

gzip: spark.tgz: unexpected end of file

gzip: stdin: unexpected end of file
tar: Child returned status 1
tar: Error is not recoverable: exiting now
ls: cannot access '/content/spark-3.3.2-bin-hadoop3': No such file or directory


In [6]:
import os
import findspark

# Tell findspark / pyspark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.4-bin-hadoop3",
})
findspark.init()

# Imported after findspark.init() on purpose: presumably pyspark only becomes
# importable once init() has run (verify if pyspark is also pip-installed).
from pyspark.sql import SparkSession

# Build (or reuse) a local-mode session with 8 shuffle partitions.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Flight_Delay_Analysis")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

spark


In [7]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# List the contents of the project's Data folder on the mounted Drive.
!ls /content/drive/MyDrive/flight_delay_analysis_project/Data


Airline_Delay_Cause.csv  Download_Column_Definitions.xlsx  flights.csv


In [9]:
# Location of the project data on the mounted Drive
base_path = "/content/drive/MyDrive/flight_delay_analysis_project/Data"

# Full paths of the two CSV inputs
flights_path = f"{base_path}/flights.csv"
delay_path   = f"{base_path}/Airline_Delay_Cause.csv"

# Load both files with a header row and inferred column types
flights_df = spark.read.options(header=True, inferSchema=True).csv(flights_path)
delay_df   = spark.read.options(header=True, inferSchema=True).csv(delay_path)

# Preview the first five rows of each dataset
for title, frame in (
    ("Flights Dataset", flights_df),
    ("Airline Delay Cause Dataset", delay_df),
):
    print(f"=== {title} Sample ===")
    frame.show(5)


=== Flights Dataset Sample ===
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+-------

In [13]:
# Column names and inferred types of each dataset
print("Flights Dataset Schema:")
flights_df.printSchema()
print("\nDelay Cause Dataset Schema:")
delay_df.printSchema()

# Row counts; each count() is a Spark action over the corresponding DataFrame
n_flights = flights_df.count()
n_delay = delay_df.count()
print("\nFlights Record Count:", n_flights)
print("Delay Cause Record Count:", n_delay)


Flights Dataset Schema:
root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRI

In [11]:
import pandas as pd
from IPython.display import display

# One (name, type, nullable) tuple per field of the flights schema
schema_data = [
    (field.name, field.dataType.simpleString(), field.nullable)
    for field in flights_df.schema.fields
]

# Tabulate the tuples in pandas so the notebook renders them as a table
schema_df = pd.DataFrame(
    schema_data,
    columns=["Column Name", "Data Type", "Nullable"],
)

# Show the first 15 rows, i.e. the first 15 columns of flights_df
display(schema_df.head(15))


Unnamed: 0,Column Name,Data Type,Nullable
0,YEAR,int,True
1,MONTH,int,True
2,DAY,int,True
3,DAY_OF_WEEK,int,True
4,AIRLINE,string,True
5,FLIGHT_NUMBER,int,True
6,TAIL_NUMBER,string,True
7,ORIGIN_AIRPORT,string,True
8,DESTINATION_AIRPORT,string,True
9,SCHEDULED_DEPARTURE,int,True


In [12]:
# Write the schema summary to a CSV file; index=False leaves out the pandas row index.
schema_df.to_csv("/content/flights_schema.csv", index=False)
