In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
# Show every column when a wide DataFrame is rendered in the notebook.
pd.set_option("display.max_columns", None)

# Seed passed to train_test_split for the main train/test split.
random_state = 101
# Directory (relative to this notebook) that holds the input CSV and receives all exports.
path_import_and_export = "../../../Thesis_data/processed_data/"

In [2]:
# Load the cleaned on-time reporting export and list each column's dtype
# as a quick schema check before splitting.
clean_export_file = path_import_and_export + "ontime_reporting_clean_export.csv"
ontime_reporting = pd.read_csv(clean_export_file)
print(ontime_reporting.dtypes)

MONTH                   int64
DAY_OF_MONTH            int64
DAY_OF_WEEK             int64
OP_UNIQUE_CARRIER      object
CRS_DEP_TIME            int64
DEP_DEL15               int64
CRS_ARR_TIME            int64
DISTANCE_GROUP          int64
NUMBER_OF_SEATS         int64
AWND                  float64
PRCP                  float64
SNOW                  float64
SNWD                  float64
TMAX                  float64
MEDIAN_AGE            float64
TOT_POP                 int64
AVG_HOUSEHOLD_SIZE    float64
ORIGIN_LAT            float64
ORIGIN_LONG           float64
DEST_LAT              float64
DEST_LONG             float64
PREV_FLIGHT_DELAY       int64
PLANE_AGE               int64
dtype: object


In [3]:
# Separate the target column (DEP_DEL15) from the predictor columns.
target_column = "DEP_DEL15"
y = ontime_reporting[target_column]
X = ontime_reporting.drop(columns=target_column)

# Shuffled 80/20 split, stratified on the target so both splits keep the
# same class proportions; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
    shuffle=True,
    stratify=y,
)

# Report the shapes in the order: X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)
X_train.head(10)

(4452592, 22)
(4452592,)
(1113149, 22)
(1113149,)


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE_GROUP,NUMBER_OF_SEATS,AWND,PRCP,SNOW,SNWD,TMAX,MEDIAN_AGE,TOT_POP,AVG_HOUSEHOLD_SIZE,ORIGIN_LAT,ORIGIN_LONG,DEST_LAT,DEST_LONG,PREV_FLIGHT_DELAY,PLANE_AGE
5001121,11,24,7,AA,359,524,4,172,11.86,0.42,0.0,0.0,51.0,33.8,672228,2.24,38.8522,-77.0378,25.7932,-80.2906,0,9
3966644,9,19,4,B6,843,1019,5,150,3.36,0.0,0.0,0.0,71.0,31.0,124014,2.59,41.9392,-72.6833,27.9755,-82.5332,0,13
4797632,11,11,1,YV,765,844,2,76,6.26,0.0,0.0,0.0,69.0,33.8,672228,2.24,38.9445,-77.4558,42.9405,-78.7322,0,2
514892,2,7,4,AS,525,655,3,181,5.14,0.0,0.0,0.0,57.0,36.5,1026919,3.13,37.3626,-121.929,47.449,-122.309,0,2
4889376,11,17,7,AA,960,1084,8,128,2.46,0.0,0.0,0.0,49.0,33.7,200579,2.62,39.998,-82.8919,33.9425,-118.408,0,19
2868605,7,11,4,AA,454,665,4,160,11.41,0.4,0.0,0.0,88.0,34.1,654596,2.39,36.1245,-86.6782,25.7932,-80.2906,0,7
1446797,4,10,3,F9,1225,1335,2,230,6.26,0.0,0.0,0.0,82.0,33.1,270917,2.42,28.4294,-81.309,35.214,-80.9431,0,2
4131986,9,30,1,DL,510,612,3,191,3.8,0.0,0.0,0.0,96.0,33.8,463875,2.15,33.6367,-84.4281,38.8522,-77.0378,0,2
1452123,4,10,3,WN,1270,1350,2,143,7.38,0.0,0.0,0.0,75.0,30.8,335423,4.58,33.6757,-117.868,37.7213,-122.221,0,11
5305918,12,14,6,DL,1225,1321,2,110,4.7,0.0,0.0,0.0,73.0,32.4,410935,2.26,44.882,-93.2218,41.9786,-87.9047,0,19


In [4]:
# Dummy-encode the OP_UNIQUE_CARRIER feature (16 carrier codes in this data set).
print(X_train.shape)
X_train = pd.get_dummies(X_train, columns=["OP_UNIQUE_CARRIER"])
# The test split is encoded independently, so it is not guaranteed to produce
# the same dummy columns as the training split (a carrier could be absent from
# one of them). Reindexing onto the training columns guarantees an identical
# column set and order: carriers missing from the test split become all-zero
# columns, and carriers never seen in training are dropped. When both splits
# already agree this is a no-op.
X_test = pd.get_dummies(X_test, columns=["OP_UNIQUE_CARRIER"]).reindex(
    columns=X_train.columns, fill_value=0
)
print(X_train.shape)
X_train.head(10)

(4452592, 22)
(4452592, 37)


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE_GROUP,NUMBER_OF_SEATS,AWND,PRCP,SNOW,SNWD,TMAX,MEDIAN_AGE,TOT_POP,AVG_HOUSEHOLD_SIZE,ORIGIN_LAT,ORIGIN_LONG,DEST_LAT,DEST_LONG,PREV_FLIGHT_DELAY,PLANE_AGE,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_AA,OP_UNIQUE_CARRIER_AS,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_DL,OP_UNIQUE_CARRIER_EV,OP_UNIQUE_CARRIER_F9,OP_UNIQUE_CARRIER_HA,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_NK,OP_UNIQUE_CARRIER_OH,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YV,OP_UNIQUE_CARRIER_YX
5001121,11,24,7,359,524,4,172,11.86,0.42,0.0,0.0,51.0,33.8,672228,2.24,38.8522,-77.0378,25.7932,-80.2906,0,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3966644,9,19,4,843,1019,5,150,3.36,0.0,0.0,0.0,71.0,31.0,124014,2.59,41.9392,-72.6833,27.9755,-82.5332,0,13,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4797632,11,11,1,765,844,2,76,6.26,0.0,0.0,0.0,69.0,33.8,672228,2.24,38.9445,-77.4558,42.9405,-78.7322,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
514892,2,7,4,525,655,3,181,5.14,0.0,0.0,0.0,57.0,36.5,1026919,3.13,37.3626,-121.929,47.449,-122.309,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4889376,11,17,7,960,1084,8,128,2.46,0.0,0.0,0.0,49.0,33.7,200579,2.62,39.998,-82.8919,33.9425,-118.408,0,19,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2868605,7,11,4,454,665,4,160,11.41,0.4,0.0,0.0,88.0,34.1,654596,2.39,36.1245,-86.6782,25.7932,-80.2906,0,7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1446797,4,10,3,1225,1335,2,230,6.26,0.0,0.0,0.0,82.0,33.1,270917,2.42,28.4294,-81.309,35.214,-80.9431,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4131986,9,30,1,510,612,3,191,3.8,0.0,0.0,0.0,96.0,33.8,463875,2.15,33.6367,-84.4281,38.8522,-77.0378,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1452123,4,10,3,1270,1350,2,143,7.38,0.0,0.0,0.0,75.0,30.8,335423,4.58,33.6757,-117.868,37.7213,-122.221,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5305918,12,14,6,1225,1321,2,110,4.7,0.0,0.0,0.0,73.0,32.4,410935,2.26,44.882,-93.2218,41.9786,-87.9047,0,19,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the encoded but not yet scaled feature splits (test first, then train).
# index=False keeps the original row index out of the CSV files.
unscaled_exports = {
    "ontime_reporting_X_test_unscaled.csv": X_test,
    "ontime_reporting_X_train_unscaled.csv": X_train,
}
for file_name, frame in unscaled_exports.items():
    frame.to_csv(path_import_and_export + file_name, index=False)

In [5]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so no test-set statistics influence the scaling.
rbscaler = RobustScaler().fit(X_train)

# Wrap the scaled values back into DataFrames that carry the feature names;
# the rebuilt frames get a fresh 0..n-1 row index.
X_train_scaled = rbscaler.transform(X_train)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

X_test_scaled = rbscaler.transform(X_test)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_scaled_df.head(10)

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE_GROUP,NUMBER_OF_SEATS,AWND,PRCP,SNOW,SNWD,TMAX,MEDIAN_AGE,TOT_POP,AVG_HOUSEHOLD_SIZE,ORIGIN_LAT,ORIGIN_LONG,DEST_LAT,DEST_LONG,PREV_FLIGHT_DELAY,PLANE_AGE,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_AA,OP_UNIQUE_CARRIER_AS,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_DL,OP_UNIQUE_CARRIER_EV,OP_UNIQUE_CARRIER_F9,OP_UNIQUE_CARRIER_HA,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_NK,OP_UNIQUE_CARRIER_OH,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YV,OP_UNIQUE_CARRIER_YX
0,-1.0,-0.466667,-0.25,-0.856299,-0.77336,0.333333,0.414894,2.35123,0.5,0.0,0.0,-1.0,-0.055556,0.852142,0.333333,0.300712,0.408044,-1.249589,0.219872,0.0,-0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.833333,-0.466667,-0.75,-0.677165,-0.570577,0.333333,-0.159574,-0.051454,38.0,0.0,5.1,-0.777778,1.388889,-0.038114,-0.266667,0.82846,0.566955,-0.327028,0.23207,0.0,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.0,-0.466667,-0.25,-0.807087,-0.785288,2.0,0.340426,0.299776,0.0,0.0,0.0,-0.037037,-0.611111,-0.382002,-0.3,-1.299258,0.212525,-0.209257,-0.908307,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.166667,-0.866667,0.5,0.832677,0.988072,0.0,-0.159574,-0.400447,13.5,0.0,0.0,0.37037,-0.055556,-0.016784,-0.4,-0.223276,0.03952,0.306346,0.42215,0.0,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666667,0.666667,-0.5,-0.462598,-0.506958,-0.333333,-0.712766,-0.152125,5.5,0.0,0.0,-0.222222,1.333333,-0.451424,-0.5,-0.418348,-0.139185,-1.038142,-0.247907,0.0,-0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-1.0,0.066667,0.0,0.403543,0.479125,-0.333333,-0.712766,0.798658,28.5,0.0,0.0,-0.481481,0.444444,3.140908,1.166667,-0.528378,-0.982871,-0.56903,-0.803633,1.0,-0.583333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.333333,0.066667,-0.5,0.234252,0.218688,-0.333333,-0.712766,-0.60179,0.0,0.0,0.0,0.074074,0.333333,0.00466,0.3,0.627991,0.146654,-0.015474,0.352858,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,0.833333,0.4,0.75,0.049213,-0.057654,-0.666667,-0.712766,0.550336,0.0,0.0,0.0,-0.851852,0.333333,0.00466,0.3,0.627991,0.146654,0.515726,0.201844,0.0,-0.583333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.833333,-0.2,0.25,-0.062992,0.153082,0.666667,0.510638,-0.800895,0.0,0.0,0.0,0.0,-1.0,-0.248721,-0.833333,1.001258,-0.171326,0.645259,0.563356,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.666667,0.733333,-0.25,0.448819,0.610338,0.666667,0.074468,-0.400447,0.0,0.0,0.0,-1.0,-1.777778,-0.521837,0.266667,0.589776,0.490461,-1.570063,0.258418,0.0,0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Write the scaled feature frames and their matching label series to disk.
# Rows stay aligned by position because every file is written without an index.
scaled_exports = {
    "ontime_reporting_X_train.csv": X_train_scaled_df,
    "ontime_reporting_y_train.csv": y_train,
    "ontime_reporting_X_test.csv": X_test_scaled_df,
    "ontime_reporting_y_test.csv": y_test,
}
for file_name, table in scaled_exports.items():
    table.to_csv(path_import_and_export + file_name, index=False)

## Generate 4 additional test sets from different random splits

In [None]:
# Seeds for the additional splits; each one yields its own exported test set.
random_states = [102, 103, 104, 105]

# Reload the cleaned export so this section starts from the untouched data,
# then separate the target column (DEP_DEL15) from the predictors again.
ontime_reporting = pd.read_csv(path_import_and_export + "ontime_reporting_clean_export.csv")
y = ontime_reporting["DEP_DEL15"]
X = ontime_reporting.drop(columns="DEP_DEL15")

In [3]:
# For every extra seed: re-split the full data, encode and scale exactly like
# the main pipeline, and export only the resulting test features and labels.
#
# NOTE(review): each split is drawn from the full data set with a new seed and
# scaled with a scaler re-fitted on that split's own training part. These test
# sets therefore overlap the seed-101 training rows and are not scaled with the
# seed-101 scaler -- confirm this is intended before evaluating a model trained
# on the seed-101 training set against them.
for state in random_states:
    print("exporting X_test with state:", state)
    # Same shuffled, stratified 80/20 split as before; only the seed differs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=state, shuffle=True, stratify=y
    )

    # Dummy-encode both splits. The test split is encoded independently, so it
    # is reindexed onto the training columns to guarantee the same column set
    # and order (missing carriers become all-zero columns, carriers unseen in
    # training are dropped). This is a no-op when both splits already agree.
    X_train = pd.get_dummies(X_train, columns=["OP_UNIQUE_CARRIER"])
    X_test = pd.get_dummies(X_test, columns=["OP_UNIQUE_CARRIER"]).reindex(
        columns=X_train.columns, fill_value=0
    )

    # Fit the scaler on this split's training part, apply it to the test part.
    rbscaler = RobustScaler()
    rbscaler.fit(X_train)
    X_test_scaled = rbscaler.transform(X_test)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    # Export the scaled test features and the matching labels, tagged by seed.
    X_test_scaled_df.to_csv(path_import_and_export + f"ontime_reporting_X_test_{state}.csv", index=False)
    y_test.to_csv(path_import_and_export + f"ontime_reporting_y_test_{state}.csv", index=False)

exporting X_test with state: 102
exporting X_test with state: 103
exporting X_test with state: 104
exporting X_test with state: 105
