In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import pytz
from datetime import datetime

In [2]:
def get_yes_no_answer():
    '''
    
    Prompt the user with 'Combine?: (y/n): ' until a yes/no reply is given.
    
    Accepted replies are 'y', 'yes', 'n' and 'no' in any letter case; surrounding whitespace is ignored.
    Anything else prints a reminder and asks again.
    
    Returns
    -------
    str
        'y' for an affirmative reply and 'n' for a negative one, so callers can simply test `reply == 'y'`.
    
    '''
    
    # Every accepted spelling maps onto the single-letter form the callers compare against.
    accepted = {'y': 'y', 'yes': 'y', 'n': 'n', 'no': 'n'}
    
    while True:
        reply = str(input('Combine?: (y/n): ')).lower().strip()
        
        if reply in accepted:
            return accepted[reply]
        
        print("Please select 'yes' or 'no'")

In [3]:
def drop_dummy_scheds(df):
    '''
    
    Remove dummy schedules and renumber the remaining rows from 0.
    
    Dummy schedules are identified when their Effective From Date (<eff_from>) is the same as their 
    Effective To Date (<eff_to>).
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with <eff_from> and <eff_to> columns. The frame itself is not modified.
    
    Returns
    -------
    pandas.DataFrame
        The non-dummy records with a fresh 0..n-1 index.
    
    '''
    
    # A column-wise comparison works for any index, unlike looking rows up by a positional counter.
    is_dummy = df['eff_from'] == df['eff_to']
    
    return df.loc[~is_dummy].reset_index(drop = True)

In [4]:
def drop_noncurrent_scheds(df, tz):
    '''
    
    Remove schedules that are, as of today, yet to commence or are no longer in operations, and renumber the
    remaining rows from 0.
    
    A schedule is kept only when <eff_from> is strictly before today and <eff_to> is strictly after today;
    a schedule starting or ending exactly today is dropped.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records whose <eff_from> and <eff_to> values are datetime-like. The frame is not modified.
    tz : datetime.tzinfo
        Time zone used to decide what "today" is.
    
    Returns
    -------
    pandas.DataFrame
        The currently operating records with a fresh 0..n-1 index.
    
    '''
    
    today = datetime.now(tz).date()
    
    # Only the calendar date of each bound matters, so the time-of-day part is discarded before comparing.
    is_current = [
        datetime.date(starts) < today < datetime.date(ends)
        for starts, ends in zip(df['eff_from'], df['eff_to'])
    ]
    keep = pd.Series(is_current, index = df.index, dtype = bool)
    
    return df.loc[keep].reset_index(drop = True)

In [5]:
def wrangle_frequency(df):
    '''
    
    Standardize <frequency> values for all schedules as a list of numbers that can take values from 1 to 7 for the
    weekdays on which a schedule is operational.
    
    Any string value is treated as "operates every day" and becomes [1, 2, 3, 4, 5, 6, 7]. Any other value is 
    converted to an integer and split into its digits, e.g. 135 becomes [1, 3, 5].
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with a <frequency> column. The column is replaced on this same frame.
    
    Returns
    -------
    pandas.DataFrame
        The same frame, with <frequency> holding one list of ints per record.
    
    '''

    def _to_weekday_list(value):
        # One raw frequency value -> list of weekday numbers.
        digits = '1234567' if isinstance(value, str) else str(int(value))
        return [int(digit) for digit in digits]

    # The column is built once and assigned as a whole; element-wise chained assignment
    # (df.frequency[i] = ...) is not guaranteed to write back into the frame.
    df['frequency'] = pd.Series(
        [_to_weekday_list(value) for value in df['frequency']],
        index = df.index,
        dtype = object,
    )
    
    return df

In [6]:
def make_pairs(df, human_intel = 'n'):
    '''
    
    In raw data, each schedule has two legs of information: at origin and at destination. These two legs are
    present as different records. Identify pairs and combine them into a single record.
    
    Allow for human intervention to identify possible pairs when poor/missing data does not allow for automated 
    identification.
    
    Automated mode (human_intel == 'n') combines two records of the same flight when their <frequency>, <from>
    and <to> are identical. Any other value of human_intel relaxes the identical-frequency constraint: records
    of the same flight and route where one has only a <from_time> and the other only a <to_time> are shown to
    the user, and are combined only on a 'y' reply.
    
    Embedded helper function combine_pairs_func() appropriately combines records [i] and [j] and marks record [j] 
    for deletion.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records. Rows are addressed by the labels 0..n-1 and the scan of candidates for a record stops
        at the first record with a different <flight>, so the frame is expected to be sorted on <flight> with a 
        default index. Combined time values are written into this frame.
    human_intel : str
        'n' for automated pairing, anything else for pairing with human confirmation.
    
    Returns
    -------
    pandas.DataFrame
        The records that were not absorbed into another record, with a fresh 0..n-1 index.
    
    '''
    
    def combine_pairs_func(i, j, drop_idx):
        # Record [j] is absorbed into record [i] and marked for deletion.
        drop_idx.add(j)
        
        # .at writes straight into the frame; chained assignment (df.to_time[i] = ...) is not guaranteed to.
        if df.at[i, 'to_time'] == '': 
            df.at[i, 'to_time'] = df.at[j, 'to_time']
        else: 
            df.at[i, 'from_time'] = df.at[j, 'from_time']
            
        return drop_idx
    
    def same_route(i, j):
        return df.at[i, 'from'] == df.at[j, 'from'] and df.at[i, 'to'] == df.at[j, 'to']
    
    def complementary_legs(i, j):
        # True when one record carries only a <from_time> and the other only a <to_time>.
        i_has_from = df.at[i, 'from_time'] != ''
        i_has_to = df.at[i, 'to_time'] != ''
        j_has_from = df.at[j, 'from_time'] != ''
        j_has_to = df.at[j, 'to_time'] != ''
        
        i_departs_j_arrives = i_has_from and not i_has_to and j_has_to and not j_has_from
        i_arrives_j_departs = i_has_to and not i_has_from and j_has_from and not j_has_to
        
        return i_departs_j_arrives or i_arrives_j_departs
    
    # A set keeps the "already paired?" checks cheap inside the nested loops.
    drop_idx = set()
    n_records = df.shape[0]
    
    # Compare a record with only subsequent records and as long as the subsequent record has not already been
    # paired with some other record.
    for i in range(n_records - 1):
        if i in drop_idx:
            continue
        
        for j in range(i + 1, n_records):
            if j in drop_idx:
                continue
            
            # Records of one flight are adjacent, so a different flight ends the search for [i].
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break
            
            # Automated pairing
            if human_intel == 'n':
                if df.at[i, 'frequency'] == df.at[j, 'frequency'] and same_route(i, j):
                    drop_idx = combine_pairs_func(i, j, drop_idx)
            
            # Pairing with human intervention by relaxing the identical frequency constraint
            elif same_route(i, j) and complementary_legs(i, j):
                display(df.iloc[i, :8], df.iloc[j, :8])
                display('Possibly Pairs?')
                reply = get_yes_no_answer()
                
                if reply == 'y':
                    drop_idx = combine_pairs_func(i, j, drop_idx)
    
    df = df.drop(sorted(drop_idx), axis = 0)
    df = df.reset_index(drop = True)
    
    return df

In [7]:
def merge_freq(df, human_intel = 'n'):
    
    '''
    
    Combine multiple schedule records where the schedules in the set differ only in their frequencies. This 
    requires no human intervention.

    Combine multiple schedules where the schedules in the set differ not only in their frequencies but vary 
    slightly in their <from_time> (or <to_time>) values. This requires human intervention.
    
    Automated mode (human_intel == 'n') merges two records of the same flight when <from>, <to>, <from_time> 
    and <to_time> are all identical. Any other value of human_intel only requires that either <to_time> or 
    <from_time> be identical; such records are shown to the user and merged only on a 'y' reply.
    
    Embedded helper function combine_freq_func() appropriately combines records [i] and [j] and marks record [j] 
    for deletion.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records whose <frequency> values are lists of weekday numbers. Rows are addressed by the labels
        0..n-1 and the scan of candidates for a record stops at the first record with a different <flight>, so
        the frame is expected to be sorted on <flight> with a default index. Merged frequencies are written into
        this frame.
    human_intel : str
        'n' for automated merging, anything else for merging with human confirmation.
    
    Returns
    -------
    pandas.DataFrame
        The records that were not absorbed into another record, with a fresh 0..n-1 index.
    
    '''
    
    def combine_freq_func(i, j, drop_idx):
        # Record [j] is absorbed into record [i] and marked for deletion.
        drop_idx.add(j)
        
        # The union is sorted so that equal sets of weekdays always produce equal lists; frequency lists are
        # compared with == elsewhere in the pipeline. .at writes straight into the frame, which chained 
        # assignment (df.frequency[i] = ...) is not guaranteed to do.
        weekdays = set(df.at[i, 'frequency']) | set(df.at[j, 'frequency'])
        df.at[i, 'frequency'] = sorted(weekdays)
        
        return drop_idx
    
    # A set keeps the "already merged?" checks cheap inside the nested loops.
    drop_idx = set()
    n_records = df.shape[0]
    
    for i in range(n_records - 1):
        if i in drop_idx:
            continue
        
        for j in range(i + 1, n_records):
            if j in drop_idx:
                continue
            
            # Records of one flight are adjacent, so a different flight ends the search for [i].
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break
            
            same_route = df.at[i, 'from'] == df.at[j, 'from'] and df.at[i, 'to'] == df.at[j, 'to']
            if not same_route:
                continue
            
            same_arrival = df.at[i, 'to_time'] == df.at[j, 'to_time']
            same_departure = df.at[i, 'from_time'] == df.at[j, 'from_time']
            
            # Automated Merging
            if human_intel == 'n':
                if same_arrival and same_departure:
                    drop_idx = combine_freq_func(i, j, drop_idx)
            
            # Merging with human intervention by requiring only either to_time or from_time be identical
            elif same_arrival or same_departure:
                display(df.iloc[i, :8], '\n', df.iloc[j, :8])
                display('Probably the same flight.')
                
                reply = get_yes_no_answer()
                
                if reply == 'y':
                    drop_idx = combine_freq_func(i, j, drop_idx)
    
    df = df.drop(sorted(drop_idx), axis = 0)
    df = df.reset_index(drop = True)
    
    return df

In [8]:
def separate_orphan_scheds(df, orphan_scheds):
    
    '''
    
    Split off the records that are still missing one leg after all the earlier processing steps.
    
    A record is an orphan when its <to_time> or its <from_time> is the empty string. This is usually due to 
    bad data and will require significant manual effort to enrich the records and make them available for 
    future analysis, so such records are moved out of the schedule data.
    
    Returns the remaining records (re-indexed from 0), <orphan_scheds> with the newly found orphans appended, 
    and the number of orphans found in <df>.
    
    '''
    
    # Rows are visited by position; each orphan's position is remembered.
    orphan_idx = []
    for pos in range(df.shape[0]):
        if df.to_time[pos] == '' or df.from_time[pos] == '':
            orphan_idx.append(pos)
    
    new_orphans = df.iloc[orphan_idx]
    orphan_scheds = pd.concat([orphan_scheds, new_orphans], ignore_index = True)
    
    remaining = df.drop(orphan_idx, axis = 0).reset_index(drop = True)
    
    return remaining, orphan_scheds, len(orphan_idx)

In [9]:
def to_base10_time(df):
    
    '''
    
    For easier analysis, we convert the <from_time> and <to_time> from the HH:MM format to HH.XX format, 
    where the hours HH are unchanged but the minutes are converted from 0-60 to 0-99 i.e. the minutes are
    represented on a decimal scale. For example '9:45' becomes 9.75.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records whose <from_time> and <to_time> are 'HH:MM' strings. Both columns are replaced on 
        this same frame.
    
    Returns
    -------
    pandas.DataFrame
        The same frame, with both time columns holding hours as numbers rounded to two decimals.
    
    '''
    
    def _to_decimal_hours(hh_mm):
        # 'HH:MM' -> hours plus the minutes expressed as a fraction of an hour.
        parts = hh_mm.split(':')
        return round((int(parts[0]) + int(parts[1])/60), 2)
    
    # Each column is rebuilt and assigned as a whole; element-wise chained assignment 
    # (df.from_time[i] = ...) is not guaranteed to write back into the frame.
    for column in ('from_time', 'to_time'):
        df[column] = [_to_decimal_hours(value) for value in df[column]]
        
    return df

In [14]:
'''

Read the data files processed in the earlier stage, one at a time, and run them through the various 
processing steps. Details and workings of individual processing steps are captured in the helper functions 
above.

Only stage-1 files that do not yet have a stage-2 file of the same name are processed. Some steps ask for 
confirmation on the console.

'''

# Set up defaults
stage_1_data_dir_path = './data/processed/stage-1'
stage_2_data_dir_path = './data/processed/stage-2'
orphan_data_dir_path = './data/processed/orphan_scheds'
tz = pytz.timezone('Asia/Calcutta')

# The output directories are created up front so that listing them, and later writing into them, 
# cannot fail on a first run.
os.makedirs(stage_2_data_dir_path, exist_ok = True)
os.makedirs(orphan_data_dir_path, exist_ok = True)

# Getting the list of pkl files in the stage-1 and stage-2 directories (files only, no sub-directories).
# Matching on the suffix avoids picking up names that merely contain '.pkl', e.g. 'x.pkl.bak'.
stage_1_file_names = sorted(
    entry.name for entry in Path(stage_1_data_dir_path).iterdir() 
    if entry.is_file() and entry.suffix == '.pkl'
)
stage_2_file_names = sorted(
    entry.name for entry in Path(stage_2_data_dir_path).iterdir() 
    if entry.is_file() and entry.suffix == '.pkl'
)

# Retaining only the stage-1 files that have not been processed into stage-2 yet
file_names = [value for value in stage_1_file_names if value not in stage_2_file_names]

# Override to process one specific file:
#file_names = ['TruJet.pkl']

# Processing individual files
for file_name in file_names:
    print('\nProcessing for %s' % Path(file_name).stem)
    readpath = stage_1_data_dir_path + '/' + file_name
    dat = pd.read_pickle(readpath)  
    dat = drop_dummy_scheds(dat)
    dat = drop_noncurrent_scheds(dat, tz)
    print('(Step VI) Read Stage-1 processed data')
    print('          Dropped dummy schedules')
    print('          Dropped non-current schedules: now has %i records' % dat.shape[0])
    
    #####################################################
    
    print('\nStandardizing frequency and finding <from> and <to> legs of schedules... (may take time)')
    dat = wrangle_frequency(dat)
    # Sorting the records on flight number. This makes the subsequent steps faster
    dat = dat.astype({'flight' : 'str'})
    dat = dat.sort_values(by = ['flight'], axis = 0, ignore_index = True) 
    dat = make_pairs(dat)
    print('(Step VII) Standardized <frequency> data')
    print('           Paired the <from> and <to> legs of schedules: now has %i records' % dat.shape[0])
    
    ######################################################
        
    print('\nFinding schedules split only by frequency... (may take time)')
    dat = merge_freq(dat)    
    dat = make_pairs(dat)
    print('(Step VIII) Merged schedules split only by frequency')
    print('            Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    ######################################################
    
    print('\nFinding schedules split by frequency for human confirmation\n')
    dat = merge_freq(dat, 'y')
    dat = make_pairs(dat)
    print('(Step IX) Merge schedules split by frequency but with human confirmation')
    print('          Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    ######################################################
    
    print('\nFinding possible pairs for human confirmation\n')
    dat = make_pairs(dat, 'y')
    print('(Step X) Pair <from> and <to> legs of a schedule but with human confirmantion')
    print('         : now has %i records' % dat.shape[0])
    
    ######################################################
    
    # Orphans are collected per file, starting from an empty frame with the same columns.
    orphan_scheds = pd.DataFrame(data = None, columns = dat.columns)
    dat, orphan_scheds, orphans = separate_orphan_scheds(dat, orphan_scheds)
    print('(Step XI) Separated % i orphan schedules: now has %i records' % (orphans, dat.shape[0]))
    
    ######################################################
    
    dat = to_base10_time(dat)
    dat.to_pickle(stage_2_data_dir_path + '/' + file_name)
    orphan_scheds.to_pickle(orphan_data_dir_path + '/orphan_' + file_name)
    print('(Step XII) Translated <from_time> and <to_time> to base-10 format')
    print('           and added %i records to final processed schedule file' % dat.shape[0])



Processing for TruJet
(Stage VI) Read Stage-1 processed data
           Dropped dummy schedules
           Dropped non-current schedules: now has 160 records

Standardizing frequency and finding <from> and <to> legs of schedules... (will take time)
(Stage VII) Standardized <frequency> data
            Paired the <from> and <to> legs of schedules: now has 104 records

Finding schedules split only by frequency... (will take time)
(Stage VIII) Merged schedules split only by frequency
             Paired resultant <from> and <to> legs of a schedule: now has 94 records

Finding schedules split by frequency for human confirmation



operator        TRJ
flight       2T 110
aircraft     ATR 72
frequency       [2]
from            BLR
from_time     11:35
to              HYD
to_time            
Name: 3, dtype: object

'\n'

operator                    TRJ
flight                   2T 110
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        BLR
from_time                 12:10
to                          HYD
to_time                        
Name: 5, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator                    TRJ
flight                   2T 121
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        HYD
from_time                 10:55
to                          GOI
to_time                        
Name: 6, dtype: object

'\n'

operator        TRJ
flight       2T 121
aircraft     ATR 72
frequency       [2]
from            HYD
from_time     11:00
to              GOI
to_time            
Name: 7, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator                    TRJ
flight                   2T 192
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        VTZ
from_time                      
to                          HYD
to_time                   20:05
Name: 31, dtype: object

'\n'

operator        TRJ
flight       2T 192
aircraft     ATR 72
frequency       [2]
from            VTZ
from_time          
to              HYD
to_time       21:10
Name: 32, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator        TRJ
flight       2T 511
aircraft     ATR 72
frequency       [2]
from            HYD
from_time     12:35
to              CDP
to_time            
Name: 39, dtype: object

'\n'

operator                    TRJ
flight                   2T 511
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        HYD
from_time                 13:00
to                          CDP
to_time                        
Name: 40, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator        TRJ
flight       2T 512
aircraft     ATR 72
frequency       [2]
from            CDP
from_time          
to              HYD
to_time       12:10
Name: 42, dtype: object

'\n'

operator                    TRJ
flight                   2T 512
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        CDP
from_time                      
to                          HYD
to_time                   12:35
Name: 44, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator                    TRJ
flight                   2T 548
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        TIR
from_time                      
to                          IXG
to_time                    9:25
Name: 66, dtype: object

'\n'

operator        TRJ
flight       2T 548
aircraft     ATR 72
frequency       [2]
from            TIR
from_time          
to              IXG
to_time        9:15
Name: 67, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator        TRJ
flight       2T 626
aircraft     ATR 72
frequency       [2]
from            BDR
from_time          
to              BLR
to_time       11:05
Name: 73, dtype: object

'\n'

operator                    TRJ
flight                   2T 626
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        BDR
from_time                      
to                          BLR
to_time                   11:40
Name: 74, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator        TRJ
flight       2T 711
aircraft     ATR 72
frequency       [7]
from            AMD
from_time     15:05
to              IXY
to_time            
Name: 83, dtype: object

'\n'

operator                    TRJ
flight                   2T 711
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 6]
from                        AMD
from_time                 15:05
to                          IXY
to_time                   16:00
Name: 84, dtype: object

'Probably the same flight.'

Combine?: (y/n): y
(Stage IX) Merge schedules split by frequency but with human confirmation
           Paired resultant <from> and <to> legs of a schedule: now has 86 records

Finding possible pairs for human confirmation



operator        TRJ
flight       2T 110
aircraft     ATR 72
frequency       [2]
from            BLR
from_time     11:35
to              HYD
to_time            
Name: 3, dtype: object

operator                       TRJ
flight                      2T 110
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           BLR
from_time                         
to                             HYD
to_time                      13:30
Name: 4, dtype: object

'Possibly Pairs?'

Combine?: (y/n): n


operator                       TRJ
flight                      2T 110
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           BLR
from_time                         
to                             HYD
to_time                      13:30
Name: 4, dtype: object

operator                    TRJ
flight                   2T 110
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        BLR
from_time                 12:10
to                          HYD
to_time                        
Name: 5, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                    TRJ
flight                   2T 192
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        VTZ
from_time                      
to                          HYD
to_time                   20:05
Name: 29, dtype: object

operator                       TRJ
flight                      2T 192
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           VTZ
from_time                    18:30
to                             HYD
to_time                           
Name: 31, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                       TRJ
flight                      2T 545
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           IXG
from_time                         
to                             CDP
to_time                      11:15
Name: 55, dtype: object

operator                    TRJ
flight                   2T 545
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        IXG
from_time                  9:45
to                          CDP
to_time                        
Name: 56, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                       TRJ
flight                      2T 707
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           AMD
from_time                    17:50
to                             ISK
to_time                           
Name: 72, dtype: object

operator                    TRJ
flight                   2T 707
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 6]
from                        AMD
from_time                      
to                          ISK
to_time                   19:00
Name: 73, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                    TRJ
flight                   2T 708
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 6]
from                        ISK
from_time                 19:20
to                          AMD
to_time                        
Name: 74, dtype: object

operator                       TRJ
flight                      2T 708
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           ISK
from_time                         
to                             AMD
to_time                      20:25
Name: 75, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                    TRJ
flight                   2T 712
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 6]
from                        IXY
from_time                 16:20
to                          AMD
to_time                        
Name: 77, dtype: object

operator                       TRJ
flight                      2T 712
aircraft                    ATR 72
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           IXY
from_time                         
to                             AMD
to_time                      17:20
Name: 78, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y
(Stage X) Pair <from> and <to> legs of a schedule but with human confirmantion
          : now has 80 records
(Stage XI) Separated  11 orphan schedules: now has 69 records
(Stage XII) Translated <from_time> and <to_time> to base-10 format
            and added 69 records to final processed schedule file


In [19]:
'''

Combine the individual files processed earlier into a single file - for schedules and for orphans - and store the 
final files in three different formats - pickle, excel and csv.

This cell reads everything it needs from disk, so it can be run on its own in a fresh kernel.

'''

# Set up defaults
stage_2_data_dir_path = './data/processed/stage-2'
orphan_data_dir_path = './data/processed/orphan_scheds'
final_data_dir_path = './data/processed/final'

# The output directory is created up front so that the writes below cannot fail on a first run.
os.makedirs(final_data_dir_path, exist_ok = True)

# Getting the list of pkl files in the processed data directory and orphan data directory (files only).
# Matching on the suffix avoids picking up names that merely contain '.pkl', e.g. 'x.pkl.bak'.
stage_2_file_names = sorted(
    entry.name for entry in Path(stage_2_data_dir_path).iterdir() 
    if entry.is_file() and entry.suffix == '.pkl'
)
orphan_file_names = sorted(
    entry.name for entry in Path(orphan_data_dir_path).iterdir() 
    if entry.is_file() and entry.suffix == '.pkl'
)

# Reading all the frames first and concatenating once; the column layout comes from the files themselves
# rather than from a variable left behind by an earlier cell.
stage_2_frames = [pd.read_pickle(stage_2_data_dir_path + '/' + file_name) for file_name in stage_2_file_names]
orphan_frames = [pd.read_pickle(orphan_data_dir_path + '/' + file_name) for file_name in orphan_file_names]

# pd.concat() rejects an empty list, so an empty frame stands in when a directory holds no pkl files.
final_data = pd.concat(stage_2_frames, ignore_index = True) if stage_2_frames else pd.DataFrame()
orphan_data = pd.concat(orphan_frames, ignore_index = True) if orphan_frames else pd.DataFrame()


final_data.to_pickle(final_data_dir_path + '/' + 'all-sched.pkl')
orphan_data.to_pickle(final_data_dir_path + '/' + 'all-orphan.pkl')

final_data.to_excel(final_data_dir_path + '/' + 'all-sched.xlsx', index = False)
orphan_data.to_excel(final_data_dir_path + '/' + 'all-orphan.xlsx', index = False)
                     
final_data.to_csv(final_data_dir_path + '/' + 'all-sched.csv', index = False)
orphan_data.to_csv(final_data_dir_path + '/' + 'all-orphan.csv', index = False)