In [1]:
!pip install pyspark findspark scikit-learn joblib --quiet


In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Model Deployment Prep").getOrCreate()

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Model Deployment Prep").getOrCreate()

df = spark.read.parquet("/content/drive/MyDrive/flight_delay_analysis_project/Outputs/preview_sample.parquet")

df.show(5)
df.printSchema()


+----+-----+-------+---+-----------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+-------------+-----------+------------+--------+---------------+-------+--------------------+-----------+---------+----------+----------+------+-----------+----------------+-------------+------------+-------------+-----------------+---------+
|YEAR|MONTH|AIRLINE|DAY|DAY_OF_WEEK|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|AIRLINE_DELAY|TOTAL_DELAY|ON_TIME_FLAG|DEP_HOUR|   carrier_name|AIRPORT|        airport_nam

In [3]:
# Convert Spark → Pandas (small sample only)
pdf = df.limit(300000).toPandas()   # 3 lakh rows → safe for RAM
pdf.to_csv("training_sample.csv", index=False)

print("Sample created:", pdf.shape)


Sample created: (300000, 46)


In [4]:
import pandas as pd

pdf = pd.read_csv("training_sample.csv")

# Create binary classification label
pdf["LABEL"] = (pdf["ARRIVAL_DELAY"] > 15).astype(int)

# Handle missing values
pdf = pdf.dropna(subset=["ARRIVAL_DELAY", "MONTH", "DAY", "DISTANCE", "AIRLINE"])


  pdf = pd.read_csv("training_sample.csv")


In [5]:
from sklearn.preprocessing import LabelEncoder

le_airline = LabelEncoder()
pdf["AIRLINE_ENC"] = le_airline.fit_transform(pdf["AIRLINE"])


In [6]:
X = pdf[["AIRLINE_ENC", "MONTH", "DAY", "DISTANCE"]]
y = pdf["LABEL"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
model.fit(X_train, y_train)


In [8]:
from sklearn.metrics import accuracy_score, classification_report

pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


Accuracy: 0.75735
              precision    recall  f1-score   support

           0       0.79      0.87      0.83     39759
           1       0.68      0.53      0.60     20241

    accuracy                           0.76     60000
   macro avg       0.73      0.70      0.71     60000
weighted avg       0.75      0.76      0.75     60000



In [10]:
import joblib
joblib.dump(model, "flight_delay_model.pkl")
print("Model saved as flight_delay_model.pkl")


Model saved as flight_delay_model.pkl


In [11]:
joblib.dump(le_airline, "airline_encoder.pkl")
print("Encoder saved as airline_encoder.pkl")


Encoder saved as airline_encoder.pkl


In [12]:
from google.colab import files

files.download("flight_delay_model.pkl")
files.download("airline_encoder.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>