## Problem statement: Predict the total fare for each flight, for all carriers, on a given day.

In [205]:
import pandas as pd
import re
import warnings

# Silences every Python warning for the rest of the session (no category or
# module filter is given), so library warnings raised by later cells are hidden.
warnings.filterwarnings('ignore')

In [206]:
# Loading the datasets
# Relative paths use forward slashes so the notebook runs on Windows, macOS and
# Linux alike; backslash separators are only understood on Windows.
# index_col=0 makes the first CSV column the row index of each frame.

#Train Dataframe
service_index_df=pd.read_csv("training/training/service_index.csv",index_col=0)
train_fares_df=pd.read_csv("training/training/train_fares.csv",index_col=0)
train_schedules_df=pd.read_csv("training/training/train_schedules.csv",index_col=0)

#Test Dataframe
test_fares_df=pd.read_csv("test/test/test_fares_data.csv",index_col=0)
test_schedules_df=pd.read_csv("test/test/test_schedules.csv",index_col=0)

In [207]:
# Quick sanity check of the text columns, to catch join problems early
def data_check(df):
    """Print the distinct values and the null count of every non-numeric column.

    For each column of ``df`` whose dtype is not numeric, three things are
    printed: a banner line containing the column name, the array of unique
    values, and the number of missing entries. Nothing is returned and ``df``
    is not modified.
    """
    non_numeric_cols = df.select_dtypes(exclude='number').columns
    for name in non_numeric_cols:
        values = df[name]
        print(f'''\n================================{[ name ]}====================================''')
        print(values.unique())
        print(values.isnull().sum())
# Print unique values and null counts for the non-numeric test-schedule columns
data_check(test_schedules_df)


['L1' 'OTH' 'U3' 'L2' 'U1' 'L3']
0

['Airport26' 'Airport17' 'Airport4' 'Airport20' 'Airport30' 'Airport31'
 'Airport60' 'Airport43']
0

['Airport30' 'Airport4' 'Airport17' 'Airport43' 'Airport31' 'Airport60'
 'Airport26' 'Airport20']
0

['2019-01-01' '2019-01-02' '2019-01-03' '2019-01-04' '2019-01-05'
 '2019-01-06' '2019-01-07']
0

['2019-01-01 18:28:00.0' '2019-01-01 07:50:00.0' '2019-01-01 11:55:00.0'
 ... '2019-01-07 19:49:00.0' '2019-01-07 07:48:00.0'
 '2019-01-07 09:56:00.0']
0

['2019-01-01 19:49:00.0' '2019-01-01 09:12:00.0' '2019-01-01 13:19:00.0'
 ... '2019-01-07 22:47:00.0' '2019-01-07 12:47:00.0'
 '2019-01-07 18:51:00.0']
0

['2019-01-02 00:28:00.0' '2019-01-01 13:50:00.0' '2019-01-01 17:55:00.0'
 ... '2019-01-08 01:49:00.0' '2019-01-07 13:48:00.0'
 '2019-01-07 15:56:00.0']
0

['2019-01-02 03:49:00.0' '2019-01-01 17:12:00.0' '2019-01-01 21:19:00.0'
 ... '2019-01-08 04:47:00.0' '2019-01-07 18:47:00.0'
 '2019-01-08 00:51:00.0']
0


### Observations
1. There are no nulls in the categorical (non-numeric) columns.
2. Apart from the date columns, all column names are consistent.


In [208]:
#Make date consistent in all
def make_date_consistent(df,sample_size=500):
    """Rewrite every date-like text column of ``df`` in one common string format.

    Each object-dtype column is probed by parsing up to ``sample_size`` of its
    leading non-null values with ``pd.to_datetime``. If every sampled value
    parses, the whole column is parsed (entries that cannot be parsed become
    missing) and stored back as ``'%Y-%m-%d %H-%M-%S'`` strings.

    Missing values are excluded from the probe, so a date column that has gaps
    among its first rows is still recognised instead of being silently skipped.

    When ``df`` has no ``yr`` column, ``yr`` and ``mo`` are created from the
    first column that gets converted; an existing ``yr`` column is left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.
    sample_size : int, default 500
        Maximum number of non-null values used to decide whether a column
        holds dates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # select_dtypes returns a new frame, so adding 'yr'/'mo' to df inside the
    # loop does not change the set of columns being iterated.
    for col in df.select_dtypes(include='object').columns:
        sample = df[col].dropna().head(sample_size)
        if sample.empty:
            continue

        # Only the sample is checked: later rows that fail to parse are
        # coerced to missing values rather than blocking the conversion.
        if pd.to_datetime(sample, errors='coerce').isna().any():
            continue

        parsed = pd.to_datetime(df[col], errors='coerce')
        if 'yr' not in df.columns:
            df['yr'] = parsed.dt.year
            df['mo'] = parsed.dt.month
        # Time parts are joined with '-' (not ':'); kept as-is because the
        # formatted strings are used as merge keys elsewhere in the notebook.
        df[col] = parsed.dt.strftime('%Y-%m-%d %H-%M-%S')
    return df


# make_date_consistent edits each frame in place and returns that same frame,
# so calling it is enough; the five module-level frames are updated directly.
for df in (
    service_index_df,
    train_fares_df,
    train_schedules_df,
    test_fares_df,
    test_schedules_df,
):
    make_date_consistent(df)

In [209]:
# Show the last two rows of each training table after date normalisation
display(
    train_fares_df.tail(2),
    train_schedules_df.tail(2),
    service_index_df.tail(2),
)

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,yr,mo
2160015,Airport20,Airport4,L1,7434,2018-10-21 00-00-00,2018-09-19 00-00-00,1428,City19,City4,2018,10
2160016,Airport20,Airport4,L1,5927,2018-10-21 00-00-00,2018-09-19 00-00-00,916,City19,City4,2018,10


Unnamed: 0,carrier,flt_num,origin,destination,flt_departure_dt,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,yr,mo
63583,OTH,7010,Airport30,Airport60,2018-10-31 00-00-00,2018-10-31 08-45-00,2018-10-31 10-20-00,2018-10-31 15-45-00,2018-10-31 17-20-00,2018,10
63584,L1,1292,Airport30,Airport60,2018-10-31 00-00-00,2018-10-31 16-53-00,2018-10-31 18-32-00,2018-10-31 23-53-00,2018-11-01 01-32-00,2018,10


Unnamed: 0,yr,mo,origin,destination,carrier,scaled_demand,scaled_share
18656,2018,12,Airport66,Airport43,L3,3374,168
18657,2018,12,Airport8,Airport20,U2,2600,279


In [210]:
# Joining Dataframes
# Step 1: attach schedule columns to each fare row. The inner join keeps only
# fare rows whose flight keys also appear in the schedules.
train_df = train_fares_df.merge(
    train_schedules_df,
    how='inner',
    on=['origin','destination','flt_num','carrier','flt_departure_dt','yr','mo'],
)
# Step 2: attach the service-index columns for the matching
# origin/destination/carrier/year/month combination.
train_df = train_df.merge(
    service_index_df,
    how='inner',
    on=['origin','destination','carrier','yr','mo'],
)
train_df

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,yr,mo,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,scaled_demand,scaled_share
0,Airport4,Airport43,L1,5911,2018-01-03 00-00-00,2017-11-29 00-00-00,538,City4,City39,2018,1,2018-01-03 17-00-00,2018-01-03 19-37-00,2018-01-03 23-00-00,2018-01-04 01-37-00,1378,305
1,Airport4,Airport43,L1,6589,2018-01-03 00-00-00,2017-11-29 00-00-00,538,City4,City39,2018,1,2018-01-03 08-55-00,2018-01-03 11-34-00,2018-01-03 14-55-00,2018-01-03 17-34-00,1378,305
2,Airport4,Airport43,L1,2689,2018-01-03 00-00-00,2017-11-29 00-00-00,538,City4,City39,2018,1,2018-01-03 06-45-00,2018-01-03 09-25-00,2018-01-03 12-45-00,2018-01-03 15-25-00,1378,305
3,Airport4,Airport43,L2,8244,2018-01-03 00-00-00,2017-11-29 00-00-00,557,City4,City39,2018,1,2018-01-03 08-10-00,2018-01-03 10-50-00,2018-01-03 14-10-00,2018-01-03 16-50-00,1378,320
4,Airport4,Airport43,L2,8523,2018-01-03 00-00-00,2017-11-29 00-00-00,557,City4,City39,2018,1,2018-01-03 18-27-00,2018-01-03 21-08-00,2018-01-04 00-27-00,2018-01-04 03-08-00,1378,320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1987058,Airport20,Airport4,L1,7431,2018-10-20 00-00-00,2018-09-19 00-00-00,737,City19,City4,2018,10,2018-10-20 07-30-00,2018-10-20 10-25-00,2018-10-20 11-30-00,2018-10-20 15-25-00,2134,340
1987059,Airport20,Airport4,L1,5927,2018-10-20 00-00-00,2018-09-19 00-00-00,498,City19,City4,2018,10,2018-10-20 20-17-00,2018-10-20 23-05-00,2018-10-21 00-17-00,2018-10-21 04-05-00,2134,340
1987060,Airport20,Airport4,L1,7431,2018-10-21 00-00-00,2018-09-19 00-00-00,599,City19,City4,2018,10,2018-10-21 07-30-00,2018-10-21 10-23-00,2018-10-21 11-30-00,2018-10-21 15-23-00,2134,340
1987061,Airport20,Airport4,L1,7434,2018-10-21 00-00-00,2018-09-19 00-00-00,1428,City19,City4,2018,10,2018-10-21 12-00-00,2018-10-21 14-48-00,2018-10-21 16-00-00,2018-10-21 19-48-00,2134,340


In [211]:
# First two fare rows, for comparison with the joined frame
train_fares_df.head(2)

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,yr,mo
1,Airport4,Airport43,L1,5911,2018-01-03 00-00-00,2017-11-29 00-00-00,538,City4,City39,2018,1
2,Airport4,Airport43,L1,6589,2018-01-03 00-00-00,2017-11-29 00-00-00,538,City4,City39,2018,1


In [212]:
# First two schedule rows, for comparison with the joined frame
train_schedules_df.head(2)

Unnamed: 0,carrier,flt_num,origin,destination,flt_departure_dt,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,yr,mo
1,L1,6800,Airport26,Airport30,2018-02-01 00-00-00,2018-02-01 18-43-00,2018-02-01 20-09-00,2018-02-02 00-43-00,2018-02-02 04-09-00,2018,2
2,OTH,783,Airport26,Airport30,2018-02-01 00-00-00,2018-02-01 19-45-00,2018-02-01 21-05-00,2018-02-02 01-45-00,2018-02-02 05-05-00,2018,2
