In [96]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import prose.datainsights as di
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy.stats.stats import pearsonr  
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import os

# Seed NumPy's global random generator.
np.random.seed(0)

# A single encoder instance is re-fitted for each categorical column below.
le = preprocessing.LabelEncoder()

# The raw file has no header row, so the 14 column names are supplied here.
columns = ["Year","Month","Day","DayOfWeek","CRSDepartureTime","CRSArrivalTime","UniqueCarrier","FlightNumber","ActualElapsedTime","Origin","Destination","Distance","Diverted","ArrivalDelay"]
data_path = os.path.join("Datasets", "uncompressed", "2008_14col.data")
df = pd.read_csv(data_path, header=None, names=columns)

# Keep (and reorder) the columns used downstream; ArrivalDelay stays last.
kept_columns = ["Origin", "Destination","Month","Day","CRSDepartureTime","CRSArrivalTime","UniqueCarrier","FlightNumber","ActualElapsedTime","Distance","Diverted","ArrivalDelay"]
df = df[kept_columns]

# Replace each categorical column by integer codes from the label encoder.
encoded_columns = ["UniqueCarrier",  "FlightNumber", "Origin", "Destination"]
for col in encoded_columns:
    df[col] = le.fit_transform(df[col])

In [97]:
def _features_and_target(flights):
    """Return (feature matrix, ArrivalDelay vector) of `flights` as NumPy arrays."""
    target = np.array(flights.ArrivalDelay)
    features = np.array(flights.drop(columns=["ArrivalDelay"]))
    return features, target


# Scheduled arrival later than scheduled departure: arrival on the same day.
sameDay = df[df.CRSArrivalTime > df.CRSDepartureTime]
# Scheduled arrival not later than scheduled departure: arrival on the next day.
diffDay = df[df.CRSArrivalTime <= df.CRSDepartureTime]

# Same-day flights are split 95% / 5% into train and regular test sets.
x, y = _features_and_target(sameDay)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.95, test_size=0.05, random_state=0)

# All next-day flights form the drifted test set.
x_test_drifted, y_test_drifted = _features_and_target(diffDay)

# Combined test set: regular test rows followed by drifted rows.
x_test_all = np.concatenate((x_test, x_test_drifted), axis=0)
y_test_all = np.concatenate((y_test, y_test_drifted), axis=0)

# Labelled DataFrame views of each feature matrix, all sharing one column list.
feature_columns = ["Origin", "Destination","Month","Day","CRSDepartureTime","CRSArrivalTime","UniqueCarrier","FlightNumber","ActualElapsedTime","Distance","Diverted"]
x_train_df, x_test_df, x_test_drifted_df, x_test_all_df = (
    pd.DataFrame(matrix, columns=feature_columns)
    for matrix in (x_train, x_test, x_test_drifted, x_test_all)
)

x_train_df.head(5)

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,35,96,7,11,935,1255,18,2568,124,748,0
1,232,61,4,8,1410,1602,18,3070,117,540,0
2,18,35,9,19,2012,2020,6,1524,72,214,0
3,286,175,10,29,1715,1810,17,2316,57,223,0
4,210,68,10,11,2050,2222,14,6554,165,911,0


In [98]:
#x_train_df['ArrivalDelay']=y_train
#x_train_df.to_csv('train.csv',index=False)
#x_test_df['ArrivalDelay']=y_test
#x_test_df.to_csv('pass.csv',index=False)
#x_test_drifted_df['ArrivalDelay']=y_test_drifted
#x_test_drifted_df.to_csv('fail.csv',index=False)

In [99]:
# Display the same-day subset (features plus ArrivalDelay); the rendered table is truncated.
sameDay

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted,ArrivalDelay
0,157,87,1,1,10,737,12,334,274,1979,0,-3
1,268,150,1,1,15,823,4,170,299,2521,0,-19
2,268,81,1,1,25,535,1,620,182,1431,0,12
3,220,150,1,1,25,709,6,428,266,2153,0,-24
4,16,256,1,1,30,444,3,196,222,1449,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...
5810413,164,129,10,31,2254,2324,10,322,36,102,0,50
5810414,136,70,10,31,2255,2352,18,2296,46,201,0,-21
5810415,136,200,10,31,2255,2357,18,3118,55,305,0,-12
5810421,257,111,10,31,2300,2359,3,695,44,224,0,-23


In [100]:
#for l in x_train:
#    print(l)

In [101]:
# Display the training features; the scheduled times are still HHMM-encoded at this point.
x_train_df

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,35,96,7,11,935,1255,18,2568,124,748,0
1,232,61,4,8,1410,1602,18,3070,117,540,0
2,18,35,9,19,2012,2020,6,1524,72,214,0
3,286,175,10,29,1715,1810,17,2316,57,223,0
4,210,68,10,11,2050,2222,14,6554,165,911,0
...,...,...,...,...,...,...,...,...,...,...,...
5393149,243,18,4,30,800,931,7,4824,92,357,0
5393150,138,199,10,2,800,1135,0,2820,151,939,0
5393151,155,136,4,28,1135,1643,5,494,177,1222,0
5393152,18,278,3,20,2029,2251,7,4730,159,784,0


In [102]:
def hhmm_to_minutes(hhmm):
    """Convert HHMM-encoded clock times to minutes after midnight.

    The hundreds part is treated as hours and the remainder as minutes,
    e.g. 1410 -> 14 * 60 + 10 = 850.

    Parameters
    ----------
    hhmm : pandas.Series (or any array-like / scalar supporting // and %)
        Non-negative HHMM-encoded times.

    Returns
    -------
    Same kind of object as `hhmm`, holding minutes after midnight.
    """
    # Floor division equals the former (x / 100).astype(int) for non-negative
    # values and avoids a round-trip through floating point.
    return (hhmm // 100) * 60 + hhmm % 100


# NOTE: this cell rewrites the time columns in place, so running it a second
# time would convert the already-converted minutes again. Re-create the
# frames before re-running it.
for frame in (x_train_df, x_test_df, x_test_drifted_df, x_test_all_df):
    for time_col in ("CRSDepartureTime", "CRSArrivalTime"):
        frame[time_col] = hhmm_to_minutes(frame[time_col])

x_train_df.head(5)

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,35,96,7,11,575,775,18,2568,124,748,0
1,232,61,4,8,850,962,18,3070,117,540,0
2,18,35,9,19,1212,1220,6,1524,72,214,0
3,286,175,10,29,1035,1090,17,2316,57,223,0
4,210,68,10,11,1250,1342,14,6554,165,911,0


In [103]:
# Display the training features after the scheduled times were converted to minutes.
x_train_df

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,35,96,7,11,575,775,18,2568,124,748,0
1,232,61,4,8,850,962,18,3070,117,540,0
2,18,35,9,19,1212,1220,6,1524,72,214,0
3,286,175,10,29,1035,1090,17,2316,57,223,0
4,210,68,10,11,1250,1342,14,6554,165,911,0
...,...,...,...,...,...,...,...,...,...,...,...
5393149,243,18,4,30,480,571,7,4824,92,357,0
5393150,138,199,10,2,480,695,0,2820,151,939,0
5393151,155,136,4,28,695,1003,5,494,177,1222,0
5393152,18,278,3,20,1229,1371,7,4730,159,784,0


In [104]:
# Report the (features, target) shapes of every split.
for label, features, targets in (
    ("Train: ", x_train, y_train),
    ("Regular Test: ", x_test, y_test),
    ("Drifted Test: ", x_test_drifted, y_test_drifted),
    ("All Test: ", x_test_all, y_test_all),
):
    print(label, features.shape, targets.shape)

Train:  (5393154, 11) (5393154,)
Regular Test:  (283851, 11) (283851,)
Drifted Test:  (133457, 11) (133457,)
All Test:  (417308, 11) (417308,)


In [105]:
# Learn data assertions from the regular (same-day) test frame and display them.
# NOTE(review): the profile is learned on x_test_df, not x_train_df, while the
# next cell reports "Violation on train" as one of the evaluated sets — confirm
# that learning on the test split is intended.
# NOTE(review): the meaning of max_self_violation=1 is not visible here;
# check the prose.datainsights documentation.
assertions = di.learn_assertions(x_test_df, max_self_violation=1)
assertions

Mixed Assertion:
		Constraint: None --> Number of assertions: 12, Detailed assertions: Eigen invariant: 1.0*Diverted, mean: -1.7e-16, stddev: 1.1e-15, min: -4.4e-15, max: 1.2e-14 && Eigen invariant: -0.0005*Origin + -0.0005*Destination + -0.0187*Month + -0.0052*Day + 0.0005*CRSDepartureTime + -0.0008*CRSArrivalTime + -0.0088*UniqueCarrier + -0.0*FlightNumber + -0.0041*ActualElapsedTime + 0.0005*Distance + 0.9998*_one, mean: 0.023, stddev: 0.15, min: -1.6, max: 0.63 && Eigen invariant: -0.0029*Origin + -0.003*Destination + 0.9961*Month + -0.044*Day + 0.0025*CRSDepartureTime + -0.0044*CRSArrivalTime + -0.073*UniqueCarrier + -0.0001*FlightNumber + -0.0131*ActualElapsedTime + 0.0014*Distance + 0.0177*_one, mean: 0.17, stddev: 3, min: -8.2, max: 8 && Eigen invariant: -0.0156*Origin + -0.0152*Destination + 0.067*Month + -0.1172*Day + 0.0001*CRSDepartureTime + -0.0044*CRSArrivalTime + 0.9905*UniqueCarrier + -0.0005*FlightNumber + -0.0077*ActualElapsedTime + 0.002*Distance + 0.0093*_one, mean:

In [None]:
# Average violation of the learned assertions on each feature frame.
violation_targets = (
    ("Violation on train", x_train_df),
    ("Violation on regular test", x_test_df),
    ("Violation on drifted test", x_test_drifted_df),
    ("Violation on all test", x_test_all_df),
)
for caption, frame in violation_targets:
    print(caption, assertions.evaluate(frame).avg_violation)

In [None]:
def transformation(df):
    """Push scheduled arrivals that precede their departure onto the next day.

    For every row whose ``CRSDepartureTime`` is strictly greater than its
    ``CRSArrivalTime``, ``24 * 60`` is added to ``CRSArrivalTime``. Rows where
    the two times are equal, or where the arrival is later, are left as is.
    The offset assumes both columns are expressed in minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CRSDepartureTime`` and ``CRSArrivalTime`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``CRSArrivalTime`` updated.
    """
    minutes_per_day = 24 * 60
    departure = df['CRSDepartureTime']
    arrival = df['CRSArrivalTime']
    # Vectorised equivalent of a per-row loop: replace the arrival only where
    # the departure is later; comparisons involving NaN are False, so such
    # rows keep their original value.
    df['CRSArrivalTime'] = arrival.mask(departure > arrival, arrival + minutes_per_day)
    return df
# Apply the next-day arrival correction to the drifted test frame only.
# transformation() updates its argument in place and also returns it, so the
# name stays bound to the same, now updated, frame.
x_test_drifted_df=transformation(x_test_drifted_df)

In [95]:
# Average violation of the learned assertions on the drifted test frame.
# NOTE(review): this cell's execution count (In [95]) is lower than that of the
# cells above it, so the saved output may come from an earlier run — re-run the
# notebook top to bottom to confirm the value.
print("Violation on drifted test", assertions.evaluate(x_test_drifted_df).avg_violation)


Violation on drifted test 0.03270331729689643


In [54]:
# Display the drifted test features to inspect the scheduled time columns.
x_test_drifted_df

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,119,210,1,1,360,1793,14,5886,79,137,0
1,23,210,1,1,370,370,11,4260,79,122,0
2,195,117,1,1,395,1820,11,4396,51,134,0
3,301,142,1,1,435,1847,14,5772,30,58,0
4,109,210,1,1,435,435,11,4266,249,157,0
...,...,...,...,...,...,...,...,...,...,...,...
133452,220,150,10,31,1435,1890,4,178,279,2153,0
133453,16,98,10,31,1438,1494,3,193,55,261,0
133454,150,40,10,31,1439,1667,4,724,208,1576,0
133455,150,228,10,31,1439,1676,4,736,226,1617,0


In [55]:
# Display the training features for comparison with the drifted frame above.
x_train_df

Unnamed: 0,Origin,Destination,Month,Day,CRSDepartureTime,CRSArrivalTime,UniqueCarrier,FlightNumber,ActualElapsedTime,Distance,Diverted
0,35,96,7,11,575,775,18,2568,124,748,0
1,232,61,4,8,850,962,18,3070,117,540,0
2,18,35,9,19,1212,1220,6,1524,72,214,0
3,286,175,10,29,1035,1090,17,2316,57,223,0
4,210,68,10,11,1250,1342,14,6554,165,911,0
...,...,...,...,...,...,...,...,...,...,...,...
5393149,243,18,4,30,480,571,7,4824,92,357,0
5393150,138,199,10,2,480,695,0,2820,151,939,0
5393151,155,136,4,28,695,1003,5,494,177,1222,0
5393152,18,278,3,20,1229,1371,7,4730,159,784,0


In [82]:
# Fit a linear regression on the training arrays (the NumPy matrices, not the
# *_df frames) and report the mean absolute error on every split.
reg = LinearRegression()
reg.fit(x_train, y_train)

evaluation_splits = (
    ("MAE on train", x_train, y_train),
    ("MAE on regular test", x_test, y_test),
    ("MAE on drifted test", x_test_drifted, y_test_drifted),
    ("MAE on all test", x_test_all, y_test_all),
)
for caption, features, targets in evaluation_splits:
    print(caption, mean_absolute_error(targets, reg.predict(features)))

MAE on train 18.952971110120313
MAE on regular test 18.89662295287625
MAE on drifted test 81.05010924938729
MAE on all test 38.77359109313111
