In [36]:
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import pytz
from datetime import datetime

In [37]:
def get_yes_no_answer():
    '''

    Prompt the user until a yes/no answer is given.

    Accepts 'y', 'n', 'yes' or 'no' in any letter case and with surrounding whitespace.

    Returns
    -------
    str
        Always the normalised single letter 'y' or 'n', so callers can keep comparing the result
        against 'y'.

    '''

    # Map every accepted spelling onto the single-letter form that callers test for.
    accepted = {'y': 'y', 'yes': 'y', 'n': 'n', 'no': 'n'}

    while True:
        reply = str(input('Combine?: (y/n): ')).lower().strip()

        if reply in accepted:
            return accepted[reply]

        print("Please select 'yes' or 'no'")

In [38]:
def drop_dummy_scheds(df):
    '''

    Remove dummy schedules, i.e. records whose Effective From Date (<eff_from>) is the same as their
    Effective To Date (<eff_to>).

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with <eff_from> and <eff_to> columns. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The remaining records, re-indexed from 0.

    '''

    # Vectorised comparison: works on row position rather than index labels, so the input does not
    # need a default 0..n-1 index.
    is_dummy = df['eff_from'] == df['eff_to']

    return df.loc[~is_dummy].reset_index(drop = True)

In [39]:
def drop_noncurrent_scheds(df, tz):
    '''

    Remove schedules that are, as of today, yet to commence or are no longer in operation.

    A record is kept only when <eff_from> is strictly before today and <eff_to> is strictly after
    today, where "today" is the current date in the time zone <tz>.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with datetime-valued <eff_from> and <eff_to> columns. Any index is accepted.
    tz : datetime.tzinfo
        Time zone used to determine today's date.

    Returns
    -------
    pandas.DataFrame
        The currently operating records, re-indexed from 0.

    '''

    today = datetime.now(tz).date()

    # NOTE(review): a schedule that starts or ends exactly today is dropped as well. That matches the
    # original behaviour; confirm whether the effective dates are meant to be inclusive.
    is_noncurrent = [
        (today <= datetime.date(starts)) or (today >= datetime.date(ends))
        for starts, ends in zip(df['eff_from'], df['eff_to'])
    ]

    # Build the mask on the frame's own index so selection is by row position, not by label value.
    keep = ~pd.Series(is_noncurrent, index = df.index, dtype = bool)

    return df.loc[keep].reset_index(drop = True)

In [40]:
def wrangle_frequency(df):
    '''

    Standardize <frequency> values for all schedules as a list of numbers that can take values from 1 to 7
    for the weekdays on which a schedule is operational.

    Any string value is replaced by all seven days (presumably the raw data uses a textual marker for
    daily schedules - TODO confirm). Any other value is read as a number whose digits are the operating
    days, e.g. 135 -> [1, 3, 5].

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with a <frequency> column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with <frequency> holding a list of ints per record.

    '''

    def to_weekdays(value):
        # Convert frequency into a string of digits, then split it into a list of ints.
        digits = '1234567' if isinstance(value, str) else str(int(value))
        return [int(digit) for digit in digits]

    # Build the whole column once and assign it back. Element-wise chained assignment
    # (df.frequency[i] = ...) is not guaranteed to write through to the frame.
    weekdays = [to_weekdays(value) for value in df['frequency']]
    df['frequency'] = pd.Series(weekdays, index = df.index, dtype = object)

    return df

In [41]:
def make_pairs(df, human_intel = 'n'):
    '''

    In raw data, each schedule has two legs of information: at origin and at destination. These two legs are
    present as different records. Identify pairs and combine them into a single record.

    Allow for human intervention to identify possible pairs when poor/missing data does not allow for
    automated identification.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records sorted by <flight> and carrying a default 0..n-1 index (rows are addressed by
        that index). Needs the columns <flight>, <frequency>, <from>, <from_time>, <to> and <to_time>.
        Surviving records are updated in place.
    human_intel : str
        'n' (default) pairs records automatically when <frequency>, <from> and <to> are identical.
        Any other value relaxes the identical-frequency constraint and asks the user to confirm each
        candidate pair whose legs are complementary (one has only <from_time>, the other only <to_time>).

    Returns
    -------
    pandas.DataFrame
        The records with every absorbed leg removed, re-indexed from 0.

    Embedded helper function combine_pairs_func() appropriately combines records [i] and [j] and marks
    record [j] for deletion.

    '''

    dropped = set()

    def combine_pairs_func(i, j):
        dropped.add(j)

        # Record [i] takes whichever time it is missing from record [j]. Use .at so the write lands
        # in the frame itself (chained assignment such as df.to_time[i] = ... may not).
        if df.at[i, 'to_time'] == '':
            df.at[i, 'to_time'] = df.at[j, 'to_time']
        else:
            df.at[i, 'from_time'] = df.at[j, 'from_time']

    n_records = df.shape[0]

    # Compare a record with only subsequent records and as long as the subsequent record has not already
    # been paired with some other record.
    for i in range(n_records - 1):
        if i in dropped:
            continue

        for j in range(i + 1, n_records):
            if j in dropped:
                continue

            # Records are sorted by flight, so the first different flight ends the candidates for [i].
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break

            same_route = df.at[i, 'from'] == df.at[j, 'from'] and df.at[i, 'to'] == df.at[j, 'to']
            if not same_route:
                continue

            # Automated pairing
            if human_intel == 'n':
                if df.at[i, 'frequency'] == df.at[j, 'frequency']:
                    combine_pairs_func(i, j)
                continue

            # Pairing with human intervention by relaxing the identical frequency constraint
            i_from, i_to = df.at[i, 'from_time'], df.at[i, 'to_time']
            j_from, j_to = df.at[j, 'from_time'], df.at[j, 'to_time']

            i_departs_j_arrives = i_to == '' and j_from == '' and i_from != '' and j_to != ''
            i_arrives_j_departs = i_to != '' and j_from != '' and i_from == '' and j_to == ''

            if i_departs_j_arrives or i_arrives_j_departs:
                display(df.iloc[i, :8], df.iloc[j, :8])
                display('Possibly Pairs?')
                reply = get_yes_no_answer()

                if reply == 'y':
                    combine_pairs_func(i, j)

    df = df.drop(sorted(dropped), axis = 0)
    df = df.reset_index(drop = True)

    return(df)

In [42]:
def merge_freq(df, human_intel = 'n'):

    '''

    Combine multiple schedule records where the schedules in the set differ only in their frequencies. This
    requires no human intervention.

    Combine multiple schedules where the schedules in the set differ not only in their frequencies but vary
    slightly in their <from_time> (or <to_time>) values. This requires human intervention.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records sorted by <flight> and carrying a default 0..n-1 index (rows are addressed by
        that index). Needs the columns <flight>, <frequency> (list of ints), <from>, <from_time>, <to>
        and <to_time>. Surviving records are updated in place.
    human_intel : str
        'n' (default) merges records automatically when <from>, <to>, <from_time> and <to_time> are all
        identical. Any other value only requires either <to_time> or <from_time> to be identical and
        asks the user to confirm each candidate.

    Returns
    -------
    pandas.DataFrame
        The records with every absorbed record removed, re-indexed from 0.

    Embedded helper function combine_freq_func() appropriately combines records [i] and [j] and marks
    record [j] for deletion.

    '''

    dropped = set()

    def combine_freq_func(i, j):
        dropped.add(j)

        # Union of both operating-day lists. Sorted so that equal day sets always produce equal lists
        # (make_pairs compares <frequency> values with ==). Written with .at so the update lands in the
        # frame itself rather than in a temporary copy.
        merged_days = set(df.at[i, 'frequency']) | set(df.at[j, 'frequency'])
        df.at[i, 'frequency'] = sorted(merged_days)

    n_records = df.shape[0]

    for i in range(n_records - 1):
        if i in dropped:
            continue

        for j in range(i + 1, n_records):
            if j in dropped:
                continue

            # Records are sorted by flight, so the first different flight ends the candidates for [i].
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break

            same_route = df.at[i, 'from'] == df.at[j, 'from'] and df.at[i, 'to'] == df.at[j, 'to']
            if not same_route:
                continue

            same_arrival = df.at[i, 'to_time'] == df.at[j, 'to_time']
            same_departure = df.at[i, 'from_time'] == df.at[j, 'from_time']

            # Automated Merging
            if human_intel == 'n':
                if same_arrival and same_departure:
                    combine_freq_func(i, j)

            # Merging with human intervention by requiring only either to_time or from_time be identical
            elif same_arrival or same_departure:
                display(df.iloc[i, :8], '\n', df.iloc[j, :8])
                display('Probably the same flight.')

                reply = get_yes_no_answer()

                if reply == 'y':
                    combine_freq_func(i, j)

    df = df.drop(sorted(dropped), axis = 0)
    df = df.reset_index(drop = True)

    return(df)

In [43]:
def separate_orphan_scheds(df, orphan_scheds):

    '''

    After all the processing steps, we are still left with some records that do not have either a
    corresponding <from> or a <to> schedule. This is usually due to bad data and will require significant
    manual effort to enrich the records and make them available for future analysis.

    We separate such records (empty <from_time> or empty <to_time>) from the complete ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records with <from_time> and <to_time> columns. Any index is accepted.
    orphan_scheds : pandas.DataFrame
        Previously collected orphan records; the new orphans are appended to these.

    Returns
    -------
    tuple
        (complete records re-indexed from 0, accumulated orphan records re-indexed from 0,
        number of orphans found in <df>).

    '''

    # One boolean mask drives both the selection and the removal, so the two can never disagree
    # about which rows are orphans (and no index-label assumptions are needed).
    is_orphan = (df['to_time'] == '') | (df['from_time'] == '')

    orphan_scheds = pd.concat([orphan_scheds, df.loc[is_orphan]], ignore_index = True)
    df = df.loc[~is_orphan].reset_index(drop = True)

    return df, orphan_scheds, int(is_orphan.sum())

In [44]:
def to_base10_time(df):

    '''

    For easier analysis, we convert the <from_time> and <to_time> from the HH:MM format to decimal hours,
    i.e. the hours stay as they are and the minutes are expressed as a fraction of an hour, rounded to two
    decimals (e.g. '17:25' -> 17.42).

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule records whose <from_time> and <to_time> columns hold 'HH:MM' strings. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with both time columns converted to numbers.

    '''

    def to_decimal_hours(hhmm):
        parts = hhmm.split(':')
        return round((int(parts[0]) + int(parts[1])/60), 2)

    # Convert each column as a whole and assign it back. Element-wise chained assignment
    # (df.from_time[i] = ...) is not guaranteed to write through to the frame.
    for column in ('from_time', 'to_time'):
        df[column] = df[column].map(to_decimal_hours)

    return df

In [45]:
# Read the data files processed in the earlier stage, one at a time, and run them through the various
# processing steps. Details and workings of individual processing steps are captured in the helper
# functions above.
#
# Only stage-1 files that do not yet have a stage-2 file of the same name are processed. Steps IX and X
# are interactive: they prompt for confirmation through get_yes_no_answer().

# Set up defaults
stage_1_data_dir_path = './data/processed/stage-1'
stage_2_data_dir_path = './data/processed/stage-2'
orphan_data_dir_path = './data/processed/orphan_scheds'
tz = pytz.timezone('Asia/Calcutta')

# Fail with a clear message when there is no input directory; next(os.walk(...)) on a missing
# directory would otherwise raise a bare StopIteration.
if not os.path.isdir(stage_1_data_dir_path):
    raise FileNotFoundError('Stage-1 data directory not found: ' + stage_1_data_dir_path)

# The output directories are listed below and written to at the end of every iteration, so make sure
# they exist before any (interactive) work is done.
os.makedirs(stage_2_data_dir_path, exist_ok = True)
os.makedirs(orphan_data_dir_path, exist_ok = True)

# Getting list of files in the stage-1 directory and stage-2 file directories
_, _, stage_1_file_names = next(os.walk(stage_1_data_dir_path))
_, _, stage_2_file_names = next(os.walk(stage_2_data_dir_path))

# Retaining only pkl files from stage-1 directory and stage-2 file directories. Match on the file
# extension rather than on '.pkl' appearing anywhere in the name; sort for a reproducible order.
stage_1_file_names = sorted(value for value in stage_1_file_names if value.endswith('.pkl'))
stage_2_file_names = sorted(value for value in stage_2_file_names if value.endswith('.pkl'))

file_names = [value for value in stage_1_file_names if value not in stage_2_file_names]

# Processing individual files
for file_name in file_names:
    # splitext keeps dots that are part of the name itself (only the extension is removed).
    print('\nProcessing for %s' % os.path.splitext(file_name)[0])
    readpath = stage_1_data_dir_path + '/' + file_name
    dat = pd.read_pickle(readpath)
    dat = drop_dummy_scheds(dat)
    dat = drop_noncurrent_scheds(dat, tz)
    print('(Step VI) Read Stage-1 processed data')
    print('          Dropped dummy schedules')
    print('          Dropped non-current schedules: now has %i records' % dat.shape[0])

    #####################################################

    print('\nStandardizing frequency and finding <from> and <to> legs of schedules... (may take time)')
    dat = wrangle_frequency(dat)
    # Sorting the records on flight number. This makes the subsequent steps faster
    # (make_pairs and merge_freq stop scanning at the first record of a different flight).
    dat = dat.astype({'flight' : 'str'})
    dat = dat.sort_values(by = ['flight'], axis = 0, ignore_index = True)
    dat = make_pairs(dat)
    print('(Step VII) Standardized <frequency> data')
    print('           Paired the <from> and <to> legs of schedules: now has %i records' % dat.shape[0])

    ######################################################

    print('\nFinding schedules split only by frequency... (may take time)')
    dat = merge_freq(dat)
    # Merging frequencies can make two legs identical in frequency, so pair again.
    dat = make_pairs(dat)
    print('(Step VIII) Merged schedules split only by frequency')
    print('            Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])

    ######################################################

    print('\nFinding schedules split by frequency for human confirmation\n')
    dat = merge_freq(dat, 'y')
    dat = make_pairs(dat)
    print('(Step IX) Merge schedules split by frequency but with human confirmation')
    print('          Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])

    ######################################################

    print('\nFinding possible pairs for human confirmation\n')
    dat = make_pairs(dat, 'y')
    print('(Step X) Pair <from> and <to> legs of a schedule but with human confirmantion')
    print('         : now has %i records' % dat.shape[0])

    ######################################################

    orphan_scheds = pd.DataFrame(data = None, columns = dat.columns)
    dat, orphan_scheds, orphans = separate_orphan_scheds(dat, orphan_scheds)
    print('(Step XI) Separated % i orphan schedules: now has %i records' % (orphans, dat.shape[0]))

    ######################################################

    dat = to_base10_time(dat)
    dat.to_pickle(stage_2_data_dir_path + '/' + file_name)
    orphan_scheds.to_pickle(orphan_data_dir_path + '/orphan_' + file_name)
    print('(Step XII) Translated <from_time> and <to_time> to base-10 format')
    print('           and added %i records to final processed schedule file' % dat.shape[0])



Processing for Alliance_Air
(Step VI) Read Stage-1 processed data
          Dropped dummy schedules
          Dropped non-current schedules: now has 356 records

Standardizing frequency and finding <from> and <to> legs of schedules... (may take time)
(Step VII) Standardized <frequency> data
           Paired the <from> and <to> legs of schedules: now has 231 records

Finding schedules split only by frequency... (may take time)
(Step VIII) Merged schedules split only by frequency
            Paired resultant <from> and <to> legs of a schedule: now has 189 records

Finding schedules split by frequency for human confirmation



operator                    AAS
flight                   9I 538
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 7]
from                        HYD
from_time                 17:25
to                          VGA
to_time                        
Name: 20, dtype: object

'\n'

operator        AAS
flight       9I 538
aircraft     ATR 72
frequency       [6]
from            HYD
from_time     17:30
to              VGA
to_time            
Name: 21, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator        AAS
flight       9I 752
aircraft     ATR 72
frequency       [6]
from            IXI
from_time     15:20
to              CCU
to_time            
Name: 108, dtype: object

'\n'

operator                    AAS
flight                   9I 752
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 7]
from                        IXI
from_time                 16:35
to                          CCU
to_time                        
Name: 109, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator                 AAS
flight                9I 873
aircraft              ATR 72
frequency    [1, 2, 3, 4, 5]
from                     ISK
from_time               8:20
to                       AMD
to_time                     
Name: 147, dtype: object

'\n'

operator        AAS
flight       9I 873
aircraft     ATR 72
frequency       [6]
from            ISK
from_time     11:30
to              AMD
to_time            
Name: 148, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator        AAS
flight       9I 874
aircraft     ATR 72
frequency       [6]
from            AMD
from_time          
to              ISK
to_time       14:30
Name: 149, dtype: object

'\n'

operator                 AAS
flight                9I 874
aircraft              ATR 72
frequency    [1, 2, 3, 4, 5]
from                     AMD
from_time                   
to                       ISK
to_time                14:25
Name: 152, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator                    AAS
flight                   9I 893
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        MYQ
from_time                  8:30
to                          COK
to_time                        
Name: 177, dtype: object

'\n'

operator        AAS
flight       9I 893
aircraft     ATR 72
frequency       [2]
from            MYQ
from_time     10:25
to              COK
to_time            
Name: 178, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator                    AAS
flight                   9I 895
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        MYQ
from_time                 15:20
to                          GOI
to_time                        
Name: 181, dtype: object

'\n'

operator        AAS
flight       9I 895
aircraft     ATR 72
frequency       [2]
from            MYQ
from_time     17:05
to              GOI
to_time            
Name: 182, dtype: object

'Probably the same flight.'

Combine?: (y/n): n


operator        AAS
flight       9I 896
aircraft     ATR 72
frequency       [2]
from            GOI
from_time          
to              MYQ
to_time       20:30
Name: 183, dtype: object

'\n'

operator                    AAS
flight                   9I 896
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        GOI
from_time                      
to                          MYQ
to_time                   18:50
Name: 184, dtype: object

'Probably the same flight.'

Combine?: (y/n): n
(Step IX) Merge schedules split by frequency but with human confirmation
          Paired resultant <from> and <to> legs of a schedule: now has 185 records

Finding possible pairs for human confirmation



operator           AAS
flight          9I 731
aircraft        ATR 72
frequency    [4, 6, 7]
from               GAU
from_time             
to                 TEZ
to_time           8:40
Name: 92, dtype: object

operator           AAS
flight          9I 731
aircraft        ATR 72
frequency    [3, 6, 7]
from               GAU
from_time         8:00
to                 TEZ
to_time               
Name: 94, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator           AAS
flight          9I 732
aircraft        ATR 72
frequency    [4, 6, 7]
from               TEZ
from_time         9:00
to                 GAU
to_time               
Name: 96, dtype: object

operator           AAS
flight          9I 732
aircraft        ATR 72
frequency    [3, 6, 7]
from               TEZ
from_time             
to                 GAU
to_time          10:00
Name: 97, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator              AAS
flight             9I 733
aircraft           ATR 72
frequency    [1, 2, 4, 5]
from                  GAU
from_time            8:00
to                    IXT
to_time                  
Name: 99, dtype: object

operator              AAS
flight             9I 733
aircraft           ATR 72
frequency    [1, 2, 3, 5]
from                  GAU
from_time                
to                    IXT
to_time              9:25
Name: 101, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                    AAS
flight                   9I 873
aircraft                 ATR 72
frequency    [1, 2, 3, 4, 5, 6]
from                        ISK
from_time                      
to                          AMD
to_time                    9:35
Name: 144, dtype: object

operator                 AAS
flight                9I 873
aircraft              ATR 72
frequency    [1, 2, 3, 4, 5]
from                     ISK
from_time               8:20
to                       AMD
to_time                     
Name: 145, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y
(Step X) Pair <from> and <to> legs of a schedule but with human confirmantion
         : now has 181 records
(Step XI) Separated  30 orphan schedules: now has 151 records
(Step XII) Translated <from_time> and <to_time> to base-10 format
           and added 151 records to final processed schedule file


In [47]:
# Combine the individual files processed earlier into a single file - for schedules and for orphans -
# and store the final files in three different formats - pickle, excel and csv.
#
# This cell only depends on the files on disk: it does not use any variable left behind by the
# processing cell, so it also runs on a fresh kernel or when no new file had to be processed.

# Set up defaults
stage_2_data_dir_path = './data/processed/stage-2'
orphan_data_dir_path = './data/processed/orphan_scheds'
final_data_dir_path = './data/processed/final'

# The output directory must exist before anything is written to it.
os.makedirs(final_data_dir_path, exist_ok = True)

# Getting list of files in the processed data directory and orphan data directory. A missing
# directory is treated as "no files" instead of raising a bare StopIteration.
_, _, stage_2_file_names = next(os.walk(stage_2_data_dir_path), (None, None, []))
_, _, orphan_file_names = next(os.walk(orphan_data_dir_path), (None, None, []))

# Retaining only pkl files from processed data directory and orphan data directory. Match on the
# file extension rather than on '.pkl' appearing anywhere in the name; sort for a reproducible order.
stage_2_file_names = sorted(value for value in stage_2_file_names if value.endswith('.pkl'))
orphan_file_names = sorted(value for value in orphan_file_names if value.endswith('.pkl'))


def _read_and_stack(dir_path, names):
    # Read every pickle once and concatenate in a single call (no repeated copying in a loop).
    # The pickles are expected to be the ones written by the processing cell; do not point this at
    # untrusted files, since unpickling can execute arbitrary code.
    frames = [pd.read_pickle(dir_path + '/' + name) for name in names]

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index = True)


final_data = _read_and_stack(stage_2_data_dir_path, stage_2_file_names)
orphan_data = _read_and_stack(orphan_data_dir_path, orphan_file_names)


final_data.to_pickle(final_data_dir_path + '/' + 'all-sched.pkl')
orphan_data.to_pickle(final_data_dir_path + '/' + 'all-orphan.pkl')

final_data.to_excel(final_data_dir_path + '/' + 'all-sched.xlsx', index = False)
orphan_data.to_excel(final_data_dir_path + '/' + 'all-orphan.xlsx', index = False)

final_data.to_csv(final_data_dir_path + '/' + 'all-sched.csv', index = False)
orphan_data.to_csv(final_data_dir_path + '/' + 'all-orphan.csv', index = False)