Import libraries

In [2]:
# Data manipulation and file handling
import pandas as pd  # for data manipulation
import pyarrow.feather as feather  # for reading/writing Feather files

# Modeling and machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # example model
from sklearn.metrics import accuracy_score, classification_report  # evaluation metrics

# Ignore warnings
# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the kernel session, including deprecation and
# dtype/convergence warnings raised by pandas and scikit-learn.
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the processed training split from its Feather file into `df`.
train_path = '../data/processed/train_data.feather'
df = pd.read_feather(train_path)

In [11]:
# Load the processed test split from its Feather file into `tdf`.
test_path = '../data/processed/test_data.feather'
tdf = pd.read_feather(test_path)

In [4]:
# Preview the first rows of the training frame (columns and sample values).
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,isNonStop,totalFare,totalTravelDistance,segmentsArrivalAirportCode,DepartureTimeHour,CabinCode,AirlineNameScore,date_diff_days,weekday
3767896,2022-04-18,2022-05-06,LGA,DFW,True,93.599998,1380.0,DFW,8,1.0,2,18,4
929418,2022-05-11,2022-05-18,BOS,IAD,False,207.600006,406.0,JFK||IAD,11,1.0,4,7,2
5456598,2022-05-08,2022-06-21,ORD,EWR,False,358.600006,725.0,DTW||EWR,14,1.0,4,44,1
4876708,2022-05-08,2022-05-18,OAK,BOS,False,728.599976,2688.0,SLC||BOS,17,1.0,4,10,2
2004101,2022-05-12,2022-07-01,DFW,ATL,True,228.600006,725.0,ATL,19,1.0,4,50,4


In [6]:
# Summarise the training frame: index range, column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5106602 entries, 3767896 to 1692743
Data columns (total 13 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   searchDate                  datetime64[ns]
 1   flightDate                  datetime64[ns]
 2   startingAirport             object        
 3   destinationAirport          object        
 4   isNonStop                   bool          
 5   totalFare                   float32       
 6   totalTravelDistance         float32       
 7   segmentsArrivalAirportCode  object        
 8   DepartureTimeHour           uint8         
 9   CabinCode                   float32       
 10  AirlineNameScore            uint8         
 11  date_diff_days              uint16        
 12  weekday                     uint8         
dtypes: bool(1), datetime64[ns](2), float32(3), object(3), uint16(1), uint8(3)
memory usage: 321.4+ MB


In [7]:
# (rows, columns) of the training frame.
df.shape

(5106602, 13)

In [8]:

# Build a per-route lookup of the mean fare: one row per
# (startingAirport, destinationAirport) pair present in the training frame.
route_keys = ['startingAirport', 'destinationAirport']
route_avg_price = df.groupby(route_keys, as_index=False)['totalFare'].mean()
route_avg_price = route_avg_price.rename(columns={'totalFare': 'average_price'})

# Left-join the lookup onto every training row so each row carries its route's
# average fare. The merge returns a new frame with a fresh 0..n-1 index; the
# original index labels of `df` are not carried over.
# NOTE(review): each row's own totalFare contributes to its route mean, so if
# totalFare is the prediction target this feature partly encodes it — confirm
# this is intended.
data = pd.merge(df, route_avg_price, how='left', on=route_keys)

In [9]:
# Preview the merged frame; `average_price` is the new last column.
data.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,isNonStop,totalFare,totalTravelDistance,segmentsArrivalAirportCode,DepartureTimeHour,CabinCode,AirlineNameScore,date_diff_days,weekday,average_price
0,2022-04-18,2022-05-06,LGA,DFW,True,93.599998,1380.0,DFW,8,1.0,2,18,4,218.513184
1,2022-05-11,2022-05-18,BOS,IAD,False,207.600006,406.0,JFK||IAD,11,1.0,4,7,2,233.873978
2,2022-05-08,2022-06-21,ORD,EWR,False,358.600006,725.0,DTW||EWR,14,1.0,4,44,1,233.211212
3,2022-05-08,2022-05-18,OAK,BOS,False,728.599976,2688.0,SLC||BOS,17,1.0,4,10,2,597.794067
4,2022-05-12,2022-07-01,DFW,ATL,True,228.600006,725.0,ATL,19,1.0,4,50,4,260.961975


In [10]:
# Spot-check ten random rows, showing each fare next to its route-level average.
# A fixed random_state keeps the sampled rows identical across Restart & Run All,
# and the bare last expression lets Jupyter render the frame with its rich
# display instead of print()'s plain text.
data[['startingAirport', 'destinationAirport', 'totalFare', 'average_price']].sample(10, random_state=42)


        startingAirport destinationAirport   totalFare  average_price
3898012             DFW                ORD  162.600006     299.667816
2237872             EWR                IAD  139.210007     291.370392
4372253             BOS                DTW  210.100006     234.250565
919424              OAK                LAX  470.209991     297.248566
973749              DFW                MIA  227.600006     271.625336
4743737             ATL                SFO  450.200012     550.965881
1196766             JFK                LAX  497.600006     414.615814
3271967             OAK                LAX  389.209991     297.248566
3809757             EWR                DFW  323.609985     280.545776
1382625             DFW                CLT  187.600006     296.208527


In [12]:
# Attach the training-derived route averages to the test set.
# The lookup was computed from the training frame only, so no test fares feed
# into the feature.
test_data = tdf.merge(
    route_avg_price,
    on=['startingAirport', 'destinationAirport'],
    how='left',
    # Each route must appear exactly once in the lookup; otherwise the join
    # would silently duplicate test rows.
    validate='many_to_one',
)

# A route that never occurs in the training data has no entry in the lookup and
# comes out of the left join as NaN. Fall back to the overall mean training
# fare so downstream estimators do not receive missing values.
test_data['average_price'] = test_data['average_price'].fillna(df['totalFare'].mean())