# Weather Impact Prediction

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve


## Load Data

In [2]:
data = pd.read_csv('TAS_Airport_Data_2024_balanced.csv')
data

Unnamed: 0,FlightID,Airline,DepartureAirport,ArrivalAirport,OriginAirport,Gate,ScheduledTime,ScheduledArrivalTime,FlightDuration,BaggageHandlers,...,GateAvailability,GateType,AircraftTurnaroundTime,CheckInQueueTime,SecurityWaitTime,LoungeUsage,OnTimePerformance,BaggageClaimBelts,CustomsOfficers,ImmigrationOfficers
0,HY20240101_001D,HY,TAS,FRA,,C18,2024-01-01 22:11:00,,135,4,...,Available,International,57,7,27,No,Delayed,2,9,5
1,KE20240101_002D,KE,TAS,KUL,,C3,2024-01-01 08:02:00,,521,4,...,Available,International,57,7,27,No,Delayed,2,9,5
2,KE20240101_003D,KE,TAS,IST,,C3,2024-01-01 20:33:00,,291,4,...,Available,International,57,7,27,No,Delayed,2,9,5
3,HY20240101_004D,HY,TAS,SIN,,C3,2024-01-01 20:03:00,,462,2,...,Available,International,57,7,27,No,OnTime,2,9,5
4,HY20240101_005A,HY,,,FRA,C8,,2024-01-01 11:50:00,192,4,...,Available,International,57,7,27,No,OnTime,2,9,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32935,TK20241231_086A,TK,,,DXB,B1,,2024-12-31 16:50:00,119,6,...,Available,International,57,7,27,No,OnTime,2,9,5
32936,HY20241231_087A,HY,,,DXB,B16,,2024-12-31 22:10:00,591,2,...,Available,International,57,7,27,No,Delayed,2,9,5
32937,HY20241231_088A,HY,,,LHR,C8,,2024-12-31 19:04:00,103,4,...,Available,International,57,7,27,No,Delayed,2,9,5
32938,TK20241231_089A,TK,,,DEL,B20,,2025-01-01 06:42:00,596,4,...,Available,International,57,7,27,No,OnTime,2,9,5


DelayDueToWeather
1.0    13176
Name: count, dtype: int64


In [3]:
bad_weather_conditions = ["Storm", "Heavy Rain", "Snow", "Fog", "Thunderstorm"]

## New binary target

In [4]:
data["DelayDueToWeather"] = np.where(
    (data["WeatherCondition"].isin(bad_weather_conditions)) & (data["DepartureDelay"] > 15),
    1, 0
)



## Drop iirelevant columns

In [5]:
data = data.drop(columns=["FlightID", "DepartureAirport", "ArrivalAirport", "ScheduledTime", "ScheduledArrivalTime", 
                      "ReasonForDelay", "OnTimePerformance", "DepartureDelay", "ArrivalDelay"])  

## Fill missing values and Encode categorical variables

In [6]:
data = data.fillna(method='ffill')


categorical_cols = data.select_dtypes(include="object").columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


## Feature matrix and target

In [7]:
X = data.drop(columns=["DelayDueToWeather"])
y = data["DelayDueToWeather"]

## Split data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y)

## Feature scaling

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model training

In [10]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

## Predictions

In [11]:
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

IndexError: index 1 is out of bounds for axis 1 with size 1

In [12]:
print(y.value_counts())

DelayDueToWeather
0    32940
Name: count, dtype: int64


In [13]:
if len(np.unique(y_test)) > 1:
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test, y_prob):.2f}")
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.savefig("weather_delay_roc_curve.png")
    plt.show()
else:
    print("Only one class present in y_test. ROC Curve cannot be drawn.")

Only one class present in y_test. ROC Curve cannot be drawn.


In [14]:
print("Train set class distribution:")
print(y_train.value_counts())

print("\nTest set class distribution:")
print(y_test.value_counts())

Train set class distribution:
DelayDueToWeather
0    26352
Name: count, dtype: int64

Test set class distribution:
DelayDueToWeather
0    6588
Name: count, dtype: int64


In [19]:
joblib.dump(model, "weather_estimation.joblib")

['weather_estimation.joblib']