Library

In [2]:
!pip install python-docx



Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


Importing Dataset

In [3]:
import zipfile
import os


zip_file = 'flights_data.zip'
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('flights_data')


os.listdir('flights_data')


['Flights Data']

Read Files

In [4]:
import json
import pandas as pd
from docx import Document


def read_docx_file(filepath):
    doc = Document(filepath)
    return '\n'.join([para.text for para in doc.paragraphs])


def parse_json_from_docx(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.docx'):
            filepath = os.path.join(directory, filename)
            doc_text = read_docx_file(filepath)
            try:

                json_data = json.loads(doc_text)
                data.extend(json_data)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {filename}: {e}")
    return data


data_directory = 'flights_data/Flights Data'
raw_data = parse_json_from_docx(data_directory)


df = pd.json_normalize(raw_data)


print(df.head())


        type  status departure.iataCode departure.icaoCode departure.terminal  \
0  departure  active                lhe               opla                  m   
1  departure  active                lhe               opla                  m   
2  departure  active                lhe               opla                  m   
3  departure  active                lhe               opla                  m   
4  departure  active                lhe               opla                NaN   

   departure.scheduledTime  departure.estimatedTime arrival.iataCode  \
0  2023-08-01t08:00:00.000  2023-08-01t08:00:00.000              jed   
1  2023-08-01t09:00:00.000  2023-08-01t09:00:00.000              khi   
2  2023-08-01t09:10:00.000                      NaN              jed   
3  2023-08-01t09:15:00.000  2023-08-01t09:21:00.000              uet   
4  2023-08-01t09:50:00.000  2023-08-01t09:50:00.000              urc   

  arrival.icaoCode arrival.terminal  ... codeshared.airline.iataCode  \
0       

Missing Values

In [5]:

df['departure.delay'] = df['departure.delay'].fillna(df['departure.delay'].median())


categorical_columns = ['departure.terminal', 'arrival.terminal', 'airline.name', 'flight.number',
                       'codeshared.airline.name', 'codeshared.flight.number']
for col in categorical_columns:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')


In [6]:

df['departure.scheduledTime'] = pd.to_datetime(df['departure.scheduledTime'], errors='coerce')
df['departure.estimatedTime'] = pd.to_datetime(df['departure.estimatedTime'], errors='coerce')


df['delay_minutes'] = (df['departure.estimatedTime'] - df['departure.scheduledTime']).dt.total_seconds() / 60


df['delay_minutes'] = df['delay_minutes'].fillna(0)


In [7]:

df['delay_bin'] = pd.qcut(df['delay_minutes'], 8, labels=False, duplicates='drop')


print(df[['delay_minutes', 'delay_bin']].head())


   delay_minutes  delay_bin
0            0.0          0
1            0.0          0
2            0.0          0
3            6.0          1
4            0.0          0


Logistic Regression

In [8]:
import numpy as np

class LogisticRegressionOVR:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.classes = None
        self.weights = {}
        self.bias = {}

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.classes = np.unique(y)
        for cls in self.classes:
            y_binary = np.where(y == cls, 1, 0)
            w = np.zeros(X.shape[1])
            b = 0
            for _ in range(self.epochs):
                linear_model = np.dot(X, w) + b
                y_pred = self.sigmoid(linear_model)
                dw = np.dot(X.T, (y_pred - y_binary)) / len(y_binary)
                db = np.sum(y_pred - y_binary) / len(y_binary)
                w -= self.learning_rate * dw
                b -= self.learning_rate * db
            self.weights[cls] = w
            self.bias[cls] = b

    def predict(self, X):
        probabilities = {}
        for cls in self.classes:
            linear_model = np.dot(X, self.weights[cls]) + self.bias[cls]
            probabilities[cls] = self.sigmoid(linear_model)
        predictions = np.argmax(list(probabilities.values()), axis=0)
        return predictions


Decision Tree

In [9]:
class DecisionTreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTree:
    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _gini(self, y):
        classes, counts = np.unique(y, return_counts=True)
        impurity = 1 - sum((count / len(y)) ** 2 for count in counts)
        return impurity

    def _best_split(self, X, y):
        best_gini = 1
        best_feature, best_threshold = None, None
        n_features = X.shape[1]

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_idxs = X[:, feature] < threshold
                right_idxs = X[:, feature] >= threshold
                if len(y[left_idxs]) == 0 or len(y[right_idxs]) == 0:
                    continue
                gini_left = self._gini(y[left_idxs])
                gini_right = self._gini(y[right_idxs])
                gini = (len(y[left_idxs]) / len(y)) * gini_left + (len(y[right_idxs]) / len(y)) * gini_right
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1):
            leaf_value = self._most_common_label(y)
            return DecisionTreeNode(value=leaf_value)

        feature, threshold = self._best_split(X, y)
        if feature is None:
            leaf_value = self._most_common_label(y)
            return DecisionTreeNode(value=leaf_value)

        left_idxs = X[:, feature] < threshold
        right_idxs = X[:, feature] >= threshold
        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)
        return DecisionTreeNode(feature, threshold, left, right)

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] < node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)


SVM

In [10]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000, kernel='linear'):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.kernel = kernel
        self.weights = None
        self.bias = None
        self.kernel_function = self._linear_kernel if kernel == 'linear' else None

    def _linear_kernel(self, x1, x2):
        return np.dot(x1, x2)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)

        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.lr * (2 * self.lambda_param * self.weights)
                else:
                    self.weights -= self.lr * (2 * self.lambda_param * self.weights - np.dot(x_i, y_[idx]))
                    self.bias -= self.lr * y_[idx]

    def predict(self, X):
        approx = np.dot(X, self.weights) - self.bias
        return np.sign(approx)


Ensemble

In [11]:
class StackingEnsemble:
    def __init__(self, base_models, meta_model):
        self.base_models = base_models
        self.meta_model = meta_model

    def fit(self, X, y):

        for model in self.base_models:
            model.fit(X, y)


        meta_features = np.column_stack([model.predict(X) for model in self.base_models])


        self.meta_model.fit(meta_features, y)

    def predict(self, X):
        meta_features = np.column_stack([model.predict(X) for model in self.base_models])
        return self.meta_model.predict(meta_features)



Dataset Creation

In [17]:

X = df.drop(columns=['delay_bin'])
y = df['delay_bin']
X = X.select_dtypes(include=[np.number]).fillna(0)


def train_test_split_manual(X, y, test_size=0.2):

    np.random.seed(42)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)


    split_idx = int(X.shape[0] * (1 - test_size))


    X_train, X_test = X.iloc[indices[:split_idx]], X.iloc[indices[split_idx:]]
    y_train, y_test = y.iloc[indices[:split_idx]], y.iloc[indices[split_idx:]]

    return X_train.values, X_test.values, y_train.values, y_test.values


X_train, X_test, y_train, y_test = train_test_split_manual(X, y, test_size=0.2)


Logistic Regression Training

In [19]:

model_lr = LogisticRegressionOVR(learning_rate=0.01, epochs=500)
model_lr.fit(X_train, y_train)


  return 1 / (1 + np.exp(-z))


Decision Tree Training

In [21]:

model_dt = DecisionTree(max_depth=5, min_samples_split=2)
model_dt.fit(X_train, y_train)


SVM Training

In [22]:

model_svm = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=200, kernel='linear')
model_svm.fit(X_train, y_train)


Ensemble Training

In [23]:

meta_model = LogisticRegressionOVR(learning_rate=0.01, epochs=500)


ensemble = StackingEnsemble(base_models=[model_lr, model_dt, model_svm], meta_model=meta_model)
ensemble.fit(X_train, y_train)


  return 1 / (1 + np.exp(-z))


Evaluation

In [24]:

y_pred = ensemble.predict(X_test)


def compute_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


def compute_f1_score(y_true, y_pred):
    classes = np.unique(y_true)
    f1_total = 0
    for cls in classes:
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0
        f1_total += f1
    return f1_total / len(classes)


accuracy = compute_accuracy(y_test, y_pred)
f1 = compute_f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")


Accuracy: 0.9007924319675655
F1-Score: 0.5745893976819519


  return 1 / (1 + np.exp(-z))


Bonus

Naive Bayes

In [25]:
import numpy as np

class NaiveBayes:
    def __init__(self):
        self.class_priors = {}
        self.feature_stats = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        for cls in self.classes:
            X_cls = X[y == cls]
            self.class_priors[cls] = len(X_cls) / len(X)
            self.feature_stats[cls] = {
                'mean': np.mean(X_cls, axis=0),
                'var': np.var(X_cls, axis=0)
            }

    def gaussian_pdf(self, x, mean, var):
        eps = 1e-6
        coef = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
        exponent = np.exp(-(x - mean) ** 2 / (2 * var + eps))
        return coef * exponent

    def predict(self, X):
        posteriors = []
        for x in X:
            class_posteriors = []
            for cls in self.classes:
                prior = np.log(self.class_priors[cls])
                likelihood = np.sum(np.log(self.gaussian_pdf(x, self.feature_stats[cls]['mean'], self.feature_stats[cls]['var'])))
                class_posteriors.append(prior + likelihood)
            posteriors.append(self.classes[np.argmax(class_posteriors)])
        return np.array(posteriors)



Naive Bayes Training

In [26]:

model_nb = NaiveBayes()
model_nb.fit(X_train, y_train)


New Ensemble Training

In [27]:

meta_model = LogisticRegressionOVR(learning_rate=0.01, epochs=500)


ensemble = StackingEnsemble(base_models=[model_lr, model_dt, model_svm, model_nb], meta_model=meta_model)
ensemble.fit(X_train, y_train)


  return 1 / (1 + np.exp(-z))
  likelihood = np.sum(np.log(self.gaussian_pdf(x, self.feature_stats[cls]['mean'], self.feature_stats[cls]['var'])))


New Ensemble Evaluation

In [28]:

y_pred = ensemble.predict(X_test)


accuracy = compute_accuracy(y_test, y_pred)
f1 = compute_f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")


  return 1 / (1 + np.exp(-z))
  likelihood = np.sum(np.log(self.gaussian_pdf(x, self.feature_stats[cls]['mean'], self.feature_stats[cls]['var'])))


Accuracy: 0.9007924319675655
F1-Score: 0.5745893976819519
