# Read in data

In [1]:
from datetime import timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

airports = [
    "KATL",
    "KCLT",
    "KDEN",
    "KDFW",
    "KJFK",
    "KMEM",
    "KMIA",
    "KORD",
    "KPHX",
    "KSEA",
]

In [2]:
DATA_DIRECTORY = Path("code execution development data/old")

## LAMP

In [3]:
def read_lamp(airport):
    lamp = pd.read_csv(
        DATA_DIRECTORY / airport / f"{airport}_lamp.csv.bz2",
        parse_dates=["timestamp", "forecast_timestamp"],
        dtype={"temperature": "int16", "wind_direction":"int16", "wind_gust":"int16", "cloud_ceiling":"float16", "visibility":"int16"}
    )
    return lamp

## TBFM

In [4]:
def read_tbfm(airport):
    tbfm = pd.read_csv(
        DATA_DIRECTORY / airport / f"{airport}_tbfm.csv.bz2",
        parse_dates=["timestamp", "scheduled_runway_estimated_time"]
    )
    return tbfm

## ETD

In [5]:
def read_etd(airport):
    etd = pd.read_csv(
        DATA_DIRECTORY / airport / f"{airport}_etd.csv.bz2",
        parse_dates=["departure_runway_estimated_time", "timestamp"]
    )
    return etd

## MFS

In [6]:
def read_mfs(airport):
    etd = pd.read_csv(
        DATA_DIRECTORY / airport / f"{airport}_mfs.csv.bz2"
    )
    return etd

# Initial Model, Linear Regression

In [7]:
submission_format = pd.read_csv(
    "code execution development data/test_labels.csv", parse_dates=["timestamp"]
)

## Insert Data

In [23]:
def filter_lamp(current_time):
    # weather data
    valid_lamp = lamp.loc[(lamp.timestamp <= current_time) 
                    & (lamp.timestamp > valid_time) 
                    & (lamp.forecast_timestamp <= current_time) 
                    & (lamp.forecast_timestamp > valid_time)]
    return valid_lamp.iloc[-1]
    
def filter_tbfm(current_time):
    # traffic data
    valid_tbfm = tbfm.loc[(tbfm.timestamp <= current_time) 
                    & (tbfm.timestamp > valid_time)]
    return valid_tbfm['gufi'].nunique()
    
def filter_etd(current_time):    
    #etd data
    valid_etd = etd.loc[(etd.timestamp <= current_time) 
                    & (etd.timestamp > valid_time) 
                    & (etd.gufi == df.loc[i].gufi)]
    try:
        return (valid_etd.iloc[-1].departure_runway_estimated_time - current_time).total_seconds()
    except:
        return 3600

In [22]:
full_frame = pd.DataFrame()
for a in airports:
    airport = a
    df = submission_format[submission_format.airport == a]
    times = df.timestamp.unique()

    etd = read_etd(airport)
    tbfm = read_tbfm(airport)
    lamp = read_lamp(airport)

    for t in times:
        indices = df[df.timestamp == t].index
        current_time = pd.to_datetime(t)
        valid_time = current_time - pd.Timedelta(30, unit='hours')
        
        # insert etd data
        for i in indices:
            df.loc[i, 'etd'] = filter_etd(current_time)
            
        # insert traffic data
        df.loc[indices, 'traffic'] = filter_tbfm(current_time)

        # insert weather data
        current_forecast = filter_lamp(current_time)
        df.loc[indices, 'precip'] = current_forecast.precip
        df.loc[indices, 'lightning_prob'] = current_forecast.lightning_prob
        df.loc[indices, 'cloud'] = filter_lamp(current_time).cloud
        df.loc[indices, 'visibility'] = current_forecast.visibility
        df.loc[indices, 'cloud_ceiling'] = current_forecast.cloud_ceiling
        df.loc[indices, 'wind_gust'] = current_forecast.wind_gust
        df.loc[indices, 'wind_speed'] = current_forecast.wind_speed
        df.loc[indices, 'wind_direction'] = current_forecast.wind_direction
        df.loc[indices, 'temperature'] = current_forecast.temperature
        
    full_frame = pd.concat([full_frame, df])
        
# insert metadata
metadata = pd.concat([read_mfs('KATL'),
                      read_mfs('KCLT'),
                      read_mfs('KDEN'),
                      read_mfs('KDFW'),
                      read_mfs('KJFK'),
                      read_mfs('KMEM'),
                      read_mfs('KMIA'),
                      read_mfs('KORD'),
                      read_mfs('KPHX'),
                      read_mfs('KSEA')])

full_frame = full_frame.merge(metadata, how='left', on='gufi')

full_frame.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               157513
timestamp                          157513
scheduled_runway_estimated_time    157513
dtype: int64 833
gufi                               163299
timestamp                          163299
scheduled_runway_estimated_time    163299
dtype: int64 877
gufi                               156727
timestamp                          156727
scheduled_runway_estimated_time    156727
dtype: int64 808
gufi                               156657
timestamp                          156657
scheduled_runway_estimated_time    156657
dtype: int64 814
gufi                               215591
timestamp                          215591
scheduled_runway_estimated_time    215591
dtype: int64 1253
gufi                               216809
timestamp                          216809
scheduled_runway_estimated_time    216809
dtype: int64 1241
gufi                               215440
timestamp                          215440
scheduled_runway_estimated_time    215440
dtype: int64 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)


gufi                               169984
timestamp                          169984
scheduled_runway_estimated_time    169984
dtype: int64 882
gufi                               164995
timestamp                          164995
scheduled_runway_estimated_time    164995
dtype: int64 860


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'lightning_prob'] = current_forecast.lightning_prob
A value is trying to be set on

gufi                               173540
timestamp                          173540
scheduled_runway_estimated_time    173540
dtype: int64 922
gufi                               119097
timestamp                          119097
scheduled_runway_estimated_time    119097
dtype: int64 602
gufi                               126279
timestamp                          126279
scheduled_runway_estimated_time    126279
dtype: int64 643
gufi                               118390
timestamp                          118390
scheduled_runway_estimated_time    118390
dtype: int64 589
gufi                               159209
timestamp                          159209
scheduled_runway_estimated_time    159209
dtype: int64 827
gufi                               152563
timestamp                          152563
scheduled_runway_estimated_time    152563
dtype: int64 796
gufi                               146057
timestamp                          146057
scheduled_runway_estimated_time    146057
dtype: int64 758

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               276137
timestamp                          276137
scheduled_runway_estimated_time    276137
dtype: int64 559
gufi                               277252
timestamp                          277252
scheduled_runway_estimated_time    277252
dtype: int64 562
gufi                               425833
timestamp                          425833
scheduled_runway_estimated_time    425833
dtype: int64 968
gufi                               429352
timestamp                          429352
scheduled_runway_estimated_time    429352
dtype: int64 981
gufi                               320433
timestamp                          320433
scheduled_runway_estimated_time    320433
dtype: int64 726
gufi                               309962
timestamp                          309962
scheduled_runway_estimated_time    309962
dtype: int64 666
gufi                               336025
timestamp                          336025
scheduled_runway_estimated_time    336025
dtype: int64 746

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)


gufi                               545064
timestamp                          545064
scheduled_runway_estimated_time    545064
dtype: int64 1239


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'lightning_prob'] = current_forecast.lightning_prob
A value is trying to be set on

gufi                               536551
timestamp                          536551
scheduled_runway_estimated_time    536551
dtype: int64 1240
gufi                               518525
timestamp                          518525
scheduled_runway_estimated_time    518525
dtype: int64 1186
gufi                               504523
timestamp                          504523
scheduled_runway_estimated_time    504523
dtype: int64 1155
gufi                               489734
timestamp                          489734
scheduled_runway_estimated_time    489734
dtype: int64 1135
gufi                               377163
timestamp                          377163
scheduled_runway_estimated_time    377163
dtype: int64 799
gufi                               374273
timestamp                          374273
scheduled_runway_estimated_time    374273
dtype: int64 795
gufi                               374599
timestamp                          374599
scheduled_runway_estimated_time    374599
dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               182842
timestamp                          182842
scheduled_runway_estimated_time    182842
dtype: int64 394
gufi                               181586
timestamp                          181586
scheduled_runway_estimated_time    181586
dtype: int64 384
gufi                               139652
timestamp                          139652
scheduled_runway_estimated_time    139652
dtype: int64 282
gufi                               138283
timestamp                          138283
scheduled_runway_estimated_time    138283
dtype: int64 281
gufi                               137979
timestamp                          137979
scheduled_runway_estimated_time    137979
dtype: int64 285
gufi                               137813
timestamp                          137813
scheduled_runway_estimated_time    137813
dtype: int64 281
gufi                               178643
timestamp                          178643
scheduled_runway_estimated_time    178643
dtype: int64 376

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               78377
timestamp                          78377
scheduled_runway_estimated_time    78377
dtype: int64 229
gufi                               82476
timestamp                          82476
scheduled_runway_estimated_time    82476
dtype: int64 258
gufi                               121021
timestamp                          121021
scheduled_runway_estimated_time    121021
dtype: int64 356
gufi                               121986
timestamp                          121986
scheduled_runway_estimated_time    121986
dtype: int64 338
gufi                               121473
timestamp                          121473
scheduled_runway_estimated_time    121473
dtype: int64 325
gufi                               119313
timestamp                          119313
scheduled_runway_estimated_time    119313
dtype: int64 321
gufi                               111456
timestamp                          111456
scheduled_runway_estimated_time    111456
dtype: int64 317
gufi 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               100431
timestamp                          100431
scheduled_runway_estimated_time    100431
dtype: int64 554
gufi                               98040
timestamp                          98040
scheduled_runway_estimated_time    98040
dtype: int64 554
gufi                               100650
timestamp                          100650
scheduled_runway_estimated_time    100650
dtype: int64 560
gufi                               101043
timestamp                          101043
scheduled_runway_estimated_time    101043
dtype: int64 564
gufi                               73844
timestamp                          73844
scheduled_runway_estimated_time    73844
dtype: int64 421
gufi                               75125
timestamp                          75125
scheduled_runway_estimated_time    75125
dtype: int64 424
gufi                               93988
timestamp                          93988
scheduled_runway_estimated_time    93988
dtype: int64 544
gufi       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)


gufi                               294221
timestamp                          294221
scheduled_runway_estimated_time    294221
dtype: int64 1003
gufi                               286747
timestamp                          286747
scheduled_runway_estimated_time    286747
dtype: int64 932


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'lightning_prob'] = current_forecast.lightning_prob
A value is trying to be set on

gufi                               285429
timestamp                          285429
scheduled_runway_estimated_time    285429
dtype: int64 917
gufi                               204357
timestamp                          204357
scheduled_runway_estimated_time    204357
dtype: int64 677
gufi                               198013
timestamp                          198013
scheduled_runway_estimated_time    198013
dtype: int64 638
gufi                               196756
timestamp                          196756
scheduled_runway_estimated_time    196756
dtype: int64 626
gufi                               196977
timestamp                          196977
scheduled_runway_estimated_time    196977
dtype: int64 631
gufi                               282339
timestamp                          282339
scheduled_runway_estimated_time    282339
dtype: int64 914
gufi                               262247
timestamp                          262247
scheduled_runway_estimated_time    262247
dtype: int64 892

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               255325
timestamp                          255325
scheduled_runway_estimated_time    255325
dtype: int64 639
gufi                               253798
timestamp                          253798
scheduled_runway_estimated_time    253798
dtype: int64 629
gufi                               252384
timestamp                          252384
scheduled_runway_estimated_time    252384
dtype: int64 620
gufi                               201219
timestamp                          201219
scheduled_runway_estimated_time    201219
dtype: int64 496
gufi                               227027
timestamp                          227027
scheduled_runway_estimated_time    227027
dtype: int64 576
gufi                               218379
timestamp                          218379
scheduled_runway_estimated_time    218379
dtype: int64 553
gufi                               179744
timestamp                          179744
scheduled_runway_estimated_time    179744
dtype: int64 404

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i, 'etd'] = filter_etd(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'traffic'] = filter_tbfm(current_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[indices, 'precip'] = current_forecast.precip
A value is trying to be set on a copy of a slice from 

gufi                               241723
timestamp                          241723
scheduled_runway_estimated_time    241723
dtype: int64 592
gufi                               232722
timestamp                          232722
scheduled_runway_estimated_time    232722
dtype: int64 567
gufi                               223216
timestamp                          223216
scheduled_runway_estimated_time    223216
dtype: int64 562
gufi                               252774
timestamp                          252774
scheduled_runway_estimated_time    252774
dtype: int64 644
gufi                               257336
timestamp                          257336
scheduled_runway_estimated_time    257336
dtype: int64 650
gufi                               252869
timestamp                          252869
scheduled_runway_estimated_time    252869
dtype: int64 640
gufi                               251525
timestamp                          251525
scheduled_runway_estimated_time    251525
dtype: int64 626

  etd = pd.read_csv(
  etd = pd.read_csv(


Unnamed: 0,gufi,timestamp,airport,minutes_until_pushback,etd,traffic,precip,lightning_prob,cloud,visibility,cloud_ceiling,wind_gust,wind_speed,wind_direction,temperature,aircraft_engine_class,aircraft_type,major_carrier,flight_type,isdeparture
0,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 11:15:00,KATL,86,6300.0,833.0,False,N,FW,7.0,8.0,0.0,2.0,13.0,54.0,JET,A319,AAL,SCHEDULED_AIR_TRANSPORT,True
1,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 11:15:00,KATL,86,6300.0,833.0,False,N,FW,7.0,8.0,0.0,2.0,13.0,54.0,JET,A319,AAL,SCHEDULED_AIR_TRANSPORT,False
2,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 12:00:00,KATL,41,3600.0,877.0,False,N,SC,7.0,8.0,0.0,5.0,14.0,53.0,JET,A319,AAL,SCHEDULED_AIR_TRANSPORT,True
3,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 12:00:00,KATL,41,3600.0,877.0,False,N,SC,7.0,8.0,0.0,5.0,14.0,53.0,JET,A319,AAL,SCHEDULED_AIR_TRANSPORT,False
4,AAL153.ATL.CLT.201114.1137.0016.TFM,2020-11-15 10:30:00,KATL,62,4980.0,808.0,False,N,SC,7.0,8.0,0.0,5.0,11.0,53.0,JET,A320,AAL,SCHEDULED_AIR_TRANSPORT,True


## Feature Engineering

In [34]:
full_frame.nunique()

gufi                                   1228
timestamp                                17
minutes_until_pushback                  133
etd                                     142
traffic                                 114
                                       ... 
aircraft_type_MD11                        2
aircraft_type_PC12                        2
major_carrier_DAL                         2
major_carrier_UAL                         2
flight_type_SCHEDULED_AIR_TRANSPORT       2
Length: 68, dtype: int64

In [32]:
# binary encoding
full_frame.replace(False, 0, inplace=True)
full_frame.replace(True, 0, inplace=True)

In [33]:
# nominal encoding
full_frame = pd.get_dummies(full_frame, columns=['airport',
                                 'cloud', 
                                 'lightning_prob',
                                 'aircraft_engine_class',
                                 'aircraft_type',
                                 'major_carrier',
                                 'flight_type'], 
                    drop_first=True)
full_frame.head(5)

KeyError: "None of [Index(['airport', 'cloud', 'lightning_prob', 'aircraft_engine_class',\n       'aircraft_type', 'major_carrier', 'flight_type'],\n      dtype='object')] are in the [columns]"

# Linear Regression

## Model 1

In [35]:
import statsmodels.api as sm

Y = full_frame['minutes_until_pushback']
X = full_frame.drop(columns=['gufi', 'timestamp', 'minutes_until_pushback'])
X = sm.add_constant(X)

model = sm.OLS(Y,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,minutes_until_pushback,R-squared:,0.581
Model:,OLS,Adj. R-squared:,0.57
Method:,Least Squares,F-statistic:,50.53
Date:,"Sat, 01 Apr 2023",Prob (F-statistic):,0.0
Time:,17:47:45,Log-Likelihood:,-10227.0
No. Observations:,2356,AIC:,20580.0
Df Residuals:,2292,BIC:,20950.0
Df Model:,63,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,25.1745,13.242,1.901,0.057,-0.793,51.141
etd,0.0131,0.000,52.505,0.000,0.013,0.014
traffic,0.0041,0.006,0.647,0.517,-0.008,0.016
precip,-1.474e-12,8.83e-12,-0.167,0.867,-1.88e-11,1.58e-11
visibility,2.5664,0.885,2.900,0.004,0.831,4.302
cloud_ceiling,-6.4138,1.307,-4.907,0.000,-8.977,-3.850
wind_gust,0.1354,0.163,0.830,0.406,-0.184,0.455
wind_speed,-0.4282,0.329,-1.300,0.194,-1.074,0.218
wind_direction,-0.1162,0.126,-0.923,0.356,-0.363,0.131

0,1,2,3
Omnibus:,2161.112,Durbin-Watson:,0.89
Prob(Omnibus):,0.0,Jarque-Bera (JB):,102163.596
Skew:,4.251,Prob(JB):,0.0
Kurtosis:,34.12,Cond. No.,1.1e+16


In [36]:
# calculate MAE
from sklearn.metrics import mean_absolute_error as mae
ypred = model.predict()
MAE = mae(Y, ypred)
MAE

10.231197185342388

In [37]:
# check for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# create VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
  
vif_data

  return 1 - self.ssr/self.centered_tss


Unnamed: 0,feature,VIF
0,const,1164.245307
1,etd,1.108416
2,traffic,21.694123
3,precip,
4,visibility,3.114533
...,...,...
61,aircraft_type_MD11,3.218903
62,aircraft_type_PC12,1.937598
63,major_carrier_DAL,2.733234
64,major_carrier_UAL,2.937714


## Model 2

In [38]:
# reduce features
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# re-define independent variables
X = full_frame.drop(columns=['gufi', 'timestamp', 'minutes_until_pushback'])

# linear regression estimator to be used in feature selection
reg = LinearRegression().fit(X, Y)

In [39]:
# reduce features using SequentialFeatureSelector
sfs = SequentialFeatureSelector(reg, n_features_to_select=20)
sfs.fit(X, Y)

In [40]:
# return selected features
sfs.get_feature_names_out()

array(['etd', 'wind_gust', 'wind_direction', 'airport_KCLT',
       'airport_KJFK', 'airport_KORD', 'cloud_OV', 'aircraft_type_A21N',
       'aircraft_type_A320', 'aircraft_type_A333', 'aircraft_type_B752',
       'aircraft_type_B764', 'aircraft_type_B772', 'aircraft_type_CRJ7',
       'aircraft_type_CRJ9', 'aircraft_type_E145', 'aircraft_type_E170',
       'aircraft_type_E75L', 'aircraft_type_PC12',
       'flight_type_SCHEDULED_AIR_TRANSPORT'], dtype=object)

In [42]:
# re-fit model
X = full_frame[['etd', 'wind_gust', 'wind_direction', 'airport_KCLT',
       'airport_KJFK', 'airport_KORD', 'cloud_OV', 'aircraft_type_A21N',
       'aircraft_type_A320', 'aircraft_type_A333', 'aircraft_type_B752',
       'aircraft_type_B764', 'aircraft_type_B772', 'aircraft_type_CRJ7',
       'aircraft_type_CRJ9', 'aircraft_type_E145', 'aircraft_type_E170',
       'aircraft_type_E75L', 'aircraft_type_PC12',
       'flight_type_SCHEDULED_AIR_TRANSPORT']]
X = sm.add_constant(X)

model2 = sm.OLS(Y,X).fit()
model2.summary()

0,1,2,3
Dep. Variable:,minutes_until_pushback,R-squared:,0.555
Model:,OLS,Adj. R-squared:,0.551
Method:,Least Squares,F-statistic:,145.7
Date:,"Sat, 01 Apr 2023",Prob (F-statistic):,0.0
Time:,17:49:20,Log-Likelihood:,-10299.0
No. Observations:,2356,AIC:,20640.0
Df Residuals:,2335,BIC:,20760.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.8173,2.144,6.911,0.000,10.613,19.022
etd,0.0129,0.000,52.852,0.000,0.012,0.013
wind_gust,0.0919,0.054,1.701,0.089,-0.014,0.198
wind_direction,-0.2298,0.073,-3.152,0.002,-0.373,-0.087
airport_KCLT,-6.9866,1.507,-4.637,0.000,-9.942,-4.032
airport_KJFK,-4.0936,2.267,-1.806,0.071,-8.538,0.351
airport_KORD,-13.2500,1.989,-6.662,0.000,-17.150,-9.350
cloud_OV,1.9402,1.241,1.563,0.118,-0.494,4.374
aircraft_type_A21N,-12.3662,11.309,-1.094,0.274,-34.543,9.810

0,1,2,3
Omnibus:,2249.693,Durbin-Watson:,0.884
Prob(Omnibus):,0.0,Jarque-Bera (JB):,120549.989
Skew:,4.493,Prob(JB):,0.0
Kurtosis:,36.871,Cond. No.,109000.0


In [43]:
# re-calculate MAE
from sklearn.metrics import mean_absolute_error as mae
ypred = model.predict()
MAE = mae(Y, ypred)
MAE

10.231197185342388

In [44]:
# re-check for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# create VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
  
vif_data

Unnamed: 0,feature,VIF
0,const,29.256322
1,etd,1.01575
2,wind_gust,2.412839
3,wind_direction,2.019255
4,airport_KCLT,1.112467
5,airport_KJFK,1.098348
6,airport_KORD,2.00876
7,cloud_OV,1.761591
8,aircraft_type_A21N,1.035147
9,aircraft_type_A320,1.060267
