In [121]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from traffic.core import Traffic
from preprocessing import preprocess_traffic, generate_aux_columns, seconds_till_arrival
import h5py
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
import itertools
import copy
from tqdm.auto import tqdm
import os

In [106]:
# Monthly archives: all twelve months of 2022 for training, the first three
# months of 2023 for testing. Month numbers are zero-padded to two digits.
data_files_train = [
    os.sep.join(["..", "data", f"Frankfurt_LH_22{month_no:02d}.h5"])
    for month_no in range(1, 13)
]
data_files_test = [
    os.sep.join(["..", "data", f"Frankfurt_LH_23{month_no:02d}.h5"])
    for month_no in range(1, 4)
]
# Columns kept from the preprocessed flight data before sampling.
_FLIGHT_COLUMNS = [
    "distance",
    "altitude",
    "geoaltitude",
    "arrival_time",
    "timestamp",
    "vertical_rate",
    "groundspeed",
    "track",
    "latitude",
    "longitude",
]


def load_data_batch(file_batch, sample_fraction=1, quick_sample = True, random_state=None):
    """Load, preprocess and subsample flight data from a batch of HDF5 files.

    Every key of every file is read with ``Traffic.from_file`` and run through
    ``preprocess_traffic``. From the second key onwards the previous key's raw
    data is prepended and a one-day window starting at the earliest ``day``
    value of the new key is passed along (presumably so that flights spanning
    two keys are handled -- confirm against ``preprocess_traffic``).

    Parameters
    ----------
    file_batch : iterable of str
        Paths of the HDF5 files to read, in processing order.
    sample_fraction : float, default 1
        Fraction of the cleaned rows of each key to keep
        (forwarded to ``DataFrame.sample(frac=...)``).
    quick_sample : bool, default True
        If true, only the first key of each file is processed.
    random_state : int, optional
        Seed for the row sampling. ``None`` (default) keeps the sampling
        unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all processed keys, restricted to the columns in
        ``_FLIGHT_COLUMNS`` with rows containing missing values removed.
        Empty (but with those columns) when nothing was loaded.
    """
    # One shared generator so successive keys do not reuse identical draws.
    rng = None if random_state is None else np.random.RandomState(random_state)

    sampled_parts = []
    previous_flights = None
    for file in file_batch:
        # Collect the keys first so the h5py handle is not held open while
        # Traffic.from_file reads the same path.
        with h5py.File(file, 'r') as f:
            keys = list(f.keys())
        if quick_sample:
            keys = keys[:1]

        for key in tqdm(keys, desc=file):
            new_flights = Traffic.from_file(file, key=key,
                                            parse_dates=["day", "firstseen", "hour", "last_position",
                                                         "lastseen", "timestamp"]).data

            if previous_flights is None:
                day_flights = preprocess_traffic(new_flights)
            else:
                combined = pd.concat([previous_flights, new_flights])
                start = new_flights.day.min().replace(tzinfo=None)
                # pd.Timedelta avoids depending on the global name `datetime`,
                # which another notebook cell rebinds to the datetime class.
                end = start + pd.Timedelta(days=1)
                day_flights = preprocess_traffic(combined, [str(start), str(end)])
                del combined  # release the doubled frame as early as possible

            day_flights = day_flights[_FLIGHT_COLUMNS].dropna()
            sampled_parts.append(day_flights.sample(frac=sample_fraction, random_state=rng))
            previous_flights = new_flights

    if not sampled_parts:
        return pd.DataFrame(columns=_FLIGHT_COLUMNS)
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(sampled_parts)

In [107]:
# Load a 0.1 % row sample of the training files and pass it through
# generate_aux_columns before previewing the first 100 rows.
df_train = generate_aux_columns(
    load_data_batch(data_files_train, sample_fraction=0.001)
)
df_train.head(100)

..\data\Frankfurt_LH_2201.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2202.h5:   0%|          | 0/28 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2203.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2204.h5:   0%|          | 0/30 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2205.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2206.h5:   0%|          | 0/30 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2207.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2208.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2209.h5:   0%|          | 0/30 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2210.h5:   0%|          | 0/31 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2211.h5:   0%|          | 0/30 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2212.h5:   0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,distance,altitude,geoaltitude,arrival_time,timestamp,vertical_rate,groundspeed,track,latitude,longitude,...,bearing_cos,track_sin,track_cos,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,797.719481,25950.0,27150.0,2022-01-01 15:10:07+00:00,2022-01-01 13:50:08+00:00,1536.0,433.0,48.179830,44.170898,2.474096,...,-0.793129,0.745241,0.666795,0,0,0,0,0,1,0
1,1360.263227,38025.0,37425.0,2022-01-01 16:56:31+00:00,2022-01-01 15:05:16+00:00,0.0,449.0,188.833415,61.895949,14.066569,...,0.977051,-0.153562,-0.988139,0,0,0,0,0,1,0
2,2752.981764,34025.0,34050.0,2022-01-01 05:24:53+00:00,2022-01-01 01:27:08+00:00,0.0,448.0,304.541459,32.880112,32.797705,...,-0.568646,-0.823716,0.567002,0,0,0,0,0,1,0
3,8556.482725,32975.0,34700.0,2022-01-01 13:44:59+00:00,2022-01-01 04:45:03+00:00,0.0,542.0,48.138402,27.006024,-93.853666,...,0.450048,0.744759,0.667334,0,0,0,0,0,1,0
4,8.769132,1400.0,1875.0,2022-01-01 13:57:54+00:00,2022-01-01 13:55:49+00:00,-768.0,141.0,252.208797,50.062835,8.681649,...,0.320260,-0.952176,-0.305549,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25.037531,6525.0,7050.0,2022-01-01 11:08:41+00:00,2022-01-01 10:59:26+00:00,-2240.0,260.0,69.960352,50.028992,8.915702,...,-0.035957,0.939456,0.342670,0,0,0,0,0,1,0
96,6143.441303,18000.0,17900.0,2022-01-01 06:59:27+00:00,2022-01-01 00:19:49+00:00,2176.0,453.0,49.837955,41.078579,-73.615577,...,0.417814,0.764223,0.644952,0,0,0,0,0,1,0
97,365.343990,38000.0,39075.0,2022-01-01 19:11:51+00:00,2022-01-01 18:31:54+00:00,0.0,449.0,48.969998,46.824051,7.518400,...,-0.975956,0.754366,0.656454,0,0,0,0,0,1,0
98,1666.351930,34000.0,32300.0,2022-01-01 04:14:10+00:00,2022-01-01 02:11:58+00:00,0.0,468.0,226.903389,61.243845,26.483256,...,0.820228,-0.730203,-0.683231,0,0,0,0,0,1,0


In [176]:
class LinearModel:
    """Linear regression with optional scaling and polynomial feature expansion.

    Parameters
    ----------
    cols : list of str
        Names of the columns used as model inputs.
    pol_degree : int, default 1
        Highest power / polynomial degree of the expanded features.
        Values <= 1 leave the selected columns unchanged.
    scaler : object, optional
        Scaler exposing ``transform``. When omitted, a ``StandardScaler`` is
        created and fitted on the first frame passed to ``preprocess``.
    pol_only : bool, default True
        If true, expand with element-wise powers of each column only; otherwise
        use ``PolynomialFeatures(interaction_only=True, include_bias=False)``.
    """

    def __init__(self, cols, pol_degree = 1, scaler = None, pol_only = True):
        self.feature_columns = cols
        self.pol_degree = pol_degree
        self.scaler = scaler
        self.model = LinearRegression()
        self.pol_only = pol_only

    def preprocess(self, features, features_to_scale):
        """Scale the requested columns and build the model input matrix.

        The scaler is fitted only if the instance does not have one yet, so the
        first call should receive the training data; later calls reuse it.
        The input frame is not modified.

        Parameters
        ----------
        features : pandas.DataFrame
            Frame containing at least ``feature_columns`` and ``features_to_scale``.
        features_to_scale : list of str
            Columns transformed by the scaler.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            The selected columns as a DataFrame when ``pol_degree <= 1``;
            otherwise the expanded features (``np.hstack`` output, or whatever
            ``PolynomialFeatures.fit_transform`` returns).
        """
        if self.scaler is None:
            self.scaler = StandardScaler()
            self.scaler.fit(features[features_to_scale])

        X = features.copy()
        X[features_to_scale] = self.scaler.transform(X[features_to_scale])
        selected = X[self.feature_columns]

        if self.pol_degree <= 1:
            return selected

        if self.pol_only:
            # Blocks of columns: x, x**2, ..., x**pol_degree, side by side.
            # NOTE(review): powers of 0/1 indicator columns repeat the column
            # itself, which yields duplicated features -- consider excluding them.
            return np.hstack([selected**power for power in range(1, self.pol_degree + 1)])

        # Only built when actually needed.
        poly = PolynomialFeatures(self.pol_degree, interaction_only=True, include_bias=False)
        return poly.fit_transform(selected)

    def fit(self, X_train, y_train):
        """Fit the underlying linear regression on preprocessed features and targets."""
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``(mean absolute error, R^2)`` against ``y_test``."""
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        return mae, r2

In [203]:
# Model inputs: kinematic quantities, the holiday flag, cyclic sin/cos encodings
# and the weekday indicator columns 1-6 (weekday_0 is not included).
features = [
    'distance',
    'altitude',
    'vertical_rate',
    'groundspeed',
    'holiday',
    'sec_sin',
    'sec_cos',
    'day_sin',
    'day_cos',
    'bearing_sin',
    'bearing_cos',
    'track_sin',
    'track_cos',
    'weekday_1',
    'weekday_2',
    'weekday_3',
    'weekday_4',
    'weekday_5',
    'weekday_6',
]
model = LinearModel(cols=features, pol_degree=5)

In [204]:
# Columns handed to the model's scaler; the remaining feature columns are used unscaled.
cols_to_scale = ["distance", "altitude", "vertical_rate","groundspeed"]
# First preprocess call: the model has no scaler yet, so it is fitted on df_train here.
X = model.preprocess(df_train, cols_to_scale)
# Regression target computed by seconds_till_arrival from the training frame.
y =  seconds_till_arrival(df_train)

In [205]:
# Fit the wrapped LinearRegression on the preprocessed training features and targets.
model.fit(X,y)

In [206]:
# Inspect the coefficients of the fitted LinearRegression (one per expanded feature column).
model.model.coef_

array([ 6.04622426e+03,  1.02129727e+02,  8.33519795e+01, -2.98741285e+02,
       -2.20938978e+01, -1.74859656e+01, -1.88564530e+01,  4.31442695e+01,
        4.09303512e+01,  4.65723783e+01, -5.12380737e+02, -1.23764879e+02,
       -2.74271510e+02, -7.25939816e+00,  1.52360806e+01,  1.13810109e+01,
        3.56096944e+00,  1.03287553e+01,  1.88299472e+01, -1.77139596e+02,
       -1.15249458e+02,  4.77098494e-01, -1.75982819e+02, -2.20938978e+01,
       -3.21546185e+00,  3.21546185e+00,  9.78043698e+00, -9.78043698e+00,
       -7.29707929e+01,  7.29707929e+01,  3.88413577e+01, -3.88413577e+01,
       -7.25939811e+00,  1.52360805e+01,  1.13810109e+01,  3.56096948e+00,
        1.03287553e+01,  1.88299472e+01, -1.28400807e+02, -1.94908765e+01,
       -1.78418883e-01,  2.42846442e+01, -2.20938978e+01,  6.62555560e+01,
        9.06572665e+00, -1.69482097e+01,  5.26022431e+01,  4.86194136e+02,
        6.97309866e+02,  7.76836961e+02,  3.36347504e+02, -7.25939811e+00,
        1.52360805e+01,  

In [159]:
# Load a 0.1 % row sample of the test files and pass it through
# generate_aux_columns before previewing the first 100 rows.
df_test = generate_aux_columns(
    load_data_batch(data_files_test, sample_fraction=0.001)
)
df_test.head(100)

..\data\Frankfurt_LH_2301.h5:   0%|          | 0/30 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2302.h5:   0%|          | 0/17 [00:00<?, ?it/s]

..\data\Frankfurt_LH_2303.h5:   0%|          | 0/31 [00:00<?, ?it/s]

Unnamed: 0,distance,altitude,geoaltitude,arrival_time,timestamp,vertical_rate,groundspeed,track,latitude,longitude,...,bearing_cos,track_sin,track_cos,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,501.237127,27800.0,27800.0,2023-01-01 15:01:11+00:00,2023-01-01 14:15:26+00:00,896.0,529.0,96.735164,51.200060,1.693115,...,0.301837,0.993099,-0.117280,0,0,0,0,0,0,1
1,101.039174,14050.0,14475.0,2023-01-01 18:12:08+00:00,2023-01-01 17:54:37+00:00,-2432.0,377.0,186.860895,50.765030,9.421073,...,0.803441,-0.119459,-0.992839,0,0,0,0,0,0,1
2,6.295997,1050.0,1375.0,2023-01-01 14:36:32+00:00,2023-01-01 14:35:03+00:00,-960.0,134.0,249.943905,50.059344,8.646686,...,0.383913,-0.939357,-0.342940,0,0,0,0,0,0,1
3,40.403698,5050.0,5425.0,2023-01-01 07:08:10+00:00,2023-01-01 06:59:29+00:00,-1024.0,231.0,232.931332,50.174835,9.090175,...,0.380639,-0.797914,-0.602772,0,0,0,0,0,0,1
4,592.785626,34000.0,35000.0,2023-01-01 18:46:25+00:00,2023-01-01 17:47:55+00:00,0.0,404.0,318.413598,46.087509,13.935554,...,-0.715883,-0.663749,0.747956,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,167.597679,22700.0,23450.0,2023-01-01 04:15:49+00:00,2023-01-01 03:50:09+00:00,-896.0,364.0,322.373766,49.050696,10.322956,...,-0.645440,-0.610508,0.792010,0,0,0,0,0,0,1
96,888.085238,36050.0,35550.0,2023-01-01 15:28:03+00:00,2023-01-01 13:57:11+00:00,-768.0,302.0,226.743829,55.949112,17.530934,...,0.778502,-0.728297,-0.685261,0,0,0,0,0,0,1
97,645.606000,36025.0,36300.0,2023-01-01 14:49:37+00:00,2023-01-01 13:56:59+00:00,-64.0,543.0,63.199165,47.844528,0.368112,...,-0.326123,0.892579,0.450891,0,0,0,0,0,0,1
98,849.069544,36025.0,37000.0,2023-01-01 19:21:42+00:00,2023-01-01 18:10:45+00:00,0.0,483.0,26.406044,42.721594,5.361520,...,-0.951130,0.444730,0.895665,0,0,0,0,0,0,1


In [207]:
cols_to_scale = ["distance", "altitude", "vertical_rate","groundspeed"]
# The targets must be computed here: the test evaluation reads y_test, and with
# this line commented out it only worked through leftover kernel state.
# df_test is left unmodified so the cell can be re-run safely.
y_test = seconds_till_arrival(df_test)
# Reuses the scaler already fitted on the training data.
X_test = model.preprocess(df_test, cols_to_scale)


In [209]:
# Test-set score; evaluate returns (mean absolute error, R^2).
model.evaluate(X_test,y_test)

(338.4511250700758, 0.9864284560450514)

In [211]:
# Training-set score (mean absolute error, R^2), for comparison with the test-set result.
model.evaluate(X,y)

(254.55744330048609, 0.9930823268576983)

In [197]:
# Correlation between the altitude and geoaltitude columns of the training sample.
df_train["altitude"].corr(df_train["geoaltitude"])

0.9937562604094307

In [218]:
# Build a timestamp string from the current local date and time.
# Uses the `datetime` module imported at the top of the notebook rather than
# `from datetime import datetime`: that import rebinds the name `datetime` to
# the class and breaks later `datetime.timedelta(...)` calls (load_data_batch).
now = datetime.datetime.now()

print("now =", now)

# yymmdd_HH_MM_SS
dt_string = now.strftime("%y%m%d_%H_%M_%S")
print("date and time =", dt_string)

now = 2023-05-26 19:22:31.257260
date and time = 230526_19_22_31


In [215]:
# Check that the formatted timestamp is a plain str.
type(dt_string)

str