In [98]:
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import pytz
from datetime import datetime

In [99]:
def get_yes_no_answer():
    '''

    Prompt the user until a yes/no answer is given.

    Accepts 'y', 'n', 'yes' or 'no' in any letter case; surrounding whitespace is ignored.
    Any other input prints a reminder and asks again.

    Returns:
        'y' for an affirmative answer, 'n' for a negative one.

    '''

    # Long and short spellings are normalised to the single letters callers compare against.
    answers = {'y': 'y', 'yes': 'y', 'n': 'n', 'no': 'n'}

    while True:
        reply = str(input('Combine?: (y/n): ')).lower().strip()

        if reply in answers:
            return answers[reply]

        print("Please select 'yes' or 'no'")

In [100]:
def read_format_csv(path):
    '''

    Read a previously stage-1 processed schedule file.

    Despite the function name, the file is loaded with pd.read_pickle, so column dtypes are
    restored exactly as they were saved and no further formatting is applied here.

    Args:
        path: Location of the pickled stage-1 file.

    Returns:
        The DataFrame stored in the file.

    '''

    # NOTE(review): unpickling can execute arbitrary code - only load files produced by this pipeline.
    return pd.read_pickle(path)
    

In [101]:
def drop_dummy_scheds(df):
    '''

    Remove dummy schedules.

    Dummy schedules are identified when their Effective From Date (<eff_from>) is the same as
    their Effective To Date (<eff_to>).

    Args:
        df: Schedule records with <eff_from> and <eff_to> columns.

    Returns:
        A new DataFrame without the dummy schedules, re-indexed from 0. The input is not modified.

    '''

    # A boolean mask works for any index, unlike per-row label lookups.
    is_real = df['eff_from'] != df['eff_to']

    return df[is_real].reset_index(drop = True)

In [102]:
def drop_noncurrent_scheds(df, tz):
    '''

    Remove schedules that are, as of today, yet to commence or are no longer in operations.

    A schedule is kept only when today's date (taken in timezone <tz>) lies strictly between
    its <eff_from> and <eff_to> dates. Schedules that start today or later, or that end today
    or earlier, are dropped.

    Args:
        df: Schedule records with datetime-like <eff_from> and <eff_to> columns.
        tz: tzinfo used to determine today's date.

    Returns:
        A new DataFrame with only the current schedules, re-indexed from 0.

    '''

    today = datetime.now(tz).date()

    # Iterate over values positionally so the result does not depend on df's index labels.
    is_current = pd.Series(
        [start.date() < today < end.date() for start, end in zip(df['eff_from'], df['eff_to'])],
        index = df.index,
        dtype = bool,
    )

    return df[is_current].reset_index(drop = True)

In [103]:
def wrangle_frequency(df):
    '''

    Standardize <frequency> values for all schedules as a list of numbers that can take values from 1 to 7 for the
    weekdays on which a schedule is operational.

    Any string value is replaced by all seven weekdays; any other value is converted to an integer
    and split into its digits (e.g. 135 -> [1, 3, 5]).

    Args:
        df: Schedule records with a <frequency> column.

    Returns:
        The same DataFrame, with <frequency> replaced in place by lists of ints.

    '''

    def to_weekdays(value):
        # A string entry is taken to mean "operates every day of the week".
        digits = '1234567' if isinstance(value, str) else str(int(value))
        return [int(digit) for digit in digits]

    # Build the whole column once and assign it, rather than writing cell by cell through
    # chained indexing, which pandas does not guarantee to write back to df.
    df['frequency'] = pd.Series(
        [to_weekdays(value) for value in df['frequency']],
        index = df.index,
        dtype = object,
    )

    return df

In [104]:
def make_pairs(df, human_intel = 'n'):
    '''

    In raw data, each schedule has two legs of information: at origin and at destination. These two legs are
    present as different records. Identify pairs and combine them into a single record.

    Automated pairing (human_intel == 'n'): record [j] is paired with an earlier record [i] when their
    <flight>, <frequency>, <from> and <to> values are all identical.

    Pairing with human intervention (any other value): the identical <frequency> constraint is relaxed.
    Records with identical <flight>, <from> and <to> whose times are complementary (one has only <from_time>,
    the other only <to_time>) are displayed, and are paired only if the user confirms.

    A record is compared only with subsequent records, and only while neither has already been paired off.

    Embedded helper function combine_pairs_func() appropriately combines records [i] and [j] and marks record [j]
    for deletion.

    Args:
        df: Schedule records; rows are addressed by the labels 0..n-1, so df is expected to carry the
            default index. Its <from_time>/<to_time> cells are updated in place.
        human_intel: 'n' for automated pairing, anything else for pairing with user confirmation.

    Returns:
        A new DataFrame without the paired-off records, re-indexed from 0.

    '''

    drop_idx = set()

    def combine_pairs_func(i, j):
        drop_idx.add(j)

        # Fill in whichever time record [i] is missing. .at writes straight into df instead of
        # going through chained indexing.
        if df.at[i, 'to_time'] == '':
            df.at[i, 'to_time'] = df.at[j, 'to_time']
        else:
            df.at[i, 'from_time'] = df.at[j, 'from_time']

    def same_route(i, j):
        # Both pairing modes require identical flight, origin and destination.
        return (df.at[i, 'flight'] == df.at[j, 'flight']
                and df.at[i, 'from'] == df.at[j, 'from']
                and df.at[i, 'to'] == df.at[j, 'to'])

    def complementary_legs(i, j):
        # One record carries only a <from_time> and the other only a <to_time>.
        i_from, i_to = df.at[i, 'from_time'], df.at[i, 'to_time']
        j_from, j_to = df.at[j, 'from_time'], df.at[j, 'to_time']
        i_departs_j_arrives = i_to == '' and j_from == '' and i_from != '' and j_to != ''
        i_arrives_j_departs = i_to != '' and j_from != '' and i_from == '' and j_to == ''
        return i_departs_j_arrives or i_arrives_j_departs

    n_rows = df.shape[0]

    for i in range(n_rows - 1):
        if i in drop_idx:
            continue

        for j in range(i + 1, n_rows):
            if j in drop_idx or not same_route(i, j):
                continue

            # Automated pairing
            if human_intel == 'n':
                if df.at[i, 'frequency'] == df.at[j, 'frequency']:
                    combine_pairs_func(i, j)

            # Pairing with human intervention by relaxing the identical frequency constraint
            elif complementary_legs(i, j):
                display(df.iloc[i, :8], df.iloc[j, :8])
                display('Possibly Pairs?')

                if get_yes_no_answer() == 'y':
                    combine_pairs_func(i, j)

    return df.drop(sorted(drop_idx), axis = 0).reset_index(drop = True)

In [105]:
def merge_freq(df, human_intel = 'n'):
    '''

    Combine multiple schedule records where the schedules in the set differ only in their frequencies. This
    requires no human intervention (human_intel == 'n'): records with identical <flight>, <from>, <to>,
    <from_time> and <to_time> are merged.

    Combine multiple schedules where the schedules in the set differ not only in their frequencies but vary
    slightly in their <from_time> (or <to_time>) values. This requires human intervention (any other value of
    human_intel): records with identical <flight>, <from>, <to> and either an identical <to_time> or an
    identical <from_time> are displayed, and are merged only if the user confirms.

    Embedded helper function combine_freq_func() appropriately combines records [i] and [j] and marks record [j]
    for deletion.

    Args:
        df: Schedule records whose <frequency> cells are lists of weekday numbers; rows are addressed by the
            labels 0..n-1, so df is expected to carry the default index. <frequency> cells are updated in place.
        human_intel: 'n' for automated merging, anything else for merging with user confirmation.

    Returns:
        A new DataFrame without the merged-away records, re-indexed from 0.

    '''

    drop_idx = set()

    def combine_freq_func(i, j):
        drop_idx.add(j)

        # Union of both weekday lists. sorted() gives a canonical order, so list-equality checks on
        # <frequency> do not depend on set iteration order.
        df.at[i, 'frequency'] = sorted(set(df.at[i, 'frequency'] + df.at[j, 'frequency']))

    def same_flight(i, j):
        # Both merging modes require identical flight, origin and destination.
        return (df.at[i, 'flight'] == df.at[j, 'flight']
                and df.at[i, 'from'] == df.at[j, 'from']
                and df.at[i, 'to'] == df.at[j, 'to'])

    n_rows = df.shape[0]

    for i in range(n_rows - 1):
        if i in drop_idx:
            continue

        for j in range(i + 1, n_rows):
            if j in drop_idx or not same_flight(i, j):
                continue

            same_arrival = df.at[i, 'to_time'] == df.at[j, 'to_time']
            same_departure = df.at[i, 'from_time'] == df.at[j, 'from_time']

            # Automated Merging
            if human_intel == 'n':
                if same_arrival and same_departure:
                    combine_freq_func(i, j)

            # Merging with human intervention by requiring only either to_time or from_time be identical
            elif same_arrival or same_departure:
                display(df.iloc[i, :8], '\n', df.iloc[j, :8])
                display('Probably the same flight.')

                if get_yes_no_answer() == 'y':
                    combine_freq_func(i, j)

    return df.drop(sorted(drop_idx), axis = 0).reset_index(drop = True)

In [106]:
def separate_orphan_scheds(df, orphan_scheds):
    '''

    Move schedules that are still missing a time out of df and into the orphan collection.

    A record is an orphan when its <to_time> or its <from_time> is the empty string.

    Args:
        df: Schedule records with <from_time> and <to_time> columns.
        orphan_scheds: DataFrame of orphans collected so far.

    Returns:
        A tuple of (df without the orphans, re-indexed from 0; orphan_scheds with the new orphans
        appended and re-indexed; the number of orphans moved).

    '''

    # A boolean mask works for any index, unlike per-row label lookups.
    is_orphan = (df['to_time'] == '') | (df['from_time'] == '')

    orphan_scheds = pd.concat([orphan_scheds, df[is_orphan]], ignore_index = True)
    df = df[~is_orphan].reset_index(drop = True)

    return df, orphan_scheds, int(is_orphan.sum())

In [107]:
def to_base10_time(df):
    '''

    Translate <from_time> and <to_time> from 'H:MM' strings into decimal hours.

    For example '7:30' becomes 7.5 and '10:55' becomes 10.92; values are rounded to two decimals.

    Args:
        df: Schedule records whose <from_time> and <to_time> cells are all 'H:MM' strings.

    Returns:
        The same DataFrame, with both time columns replaced in place.

    '''

    def to_decimal_hours(clock):
        parts = clock.split(':')
        return round((int(parts[0]) + int(parts[1]) / 60), 2)

    for column in ('from_time', 'to_time'):
        # Assign the whole column at once instead of cell by cell through chained indexing.
        # dtype = object matches what writing floats one by one into the string column produced,
        # so downstream consumers see the same column type as before.
        df[column] = pd.Series(
            [to_decimal_hours(clock) for clock in df[column]],
            index = df.index,
            dtype = object,
        )

    return df

In [108]:
# Set up defaults
raw_data_dir_path ='./data/raw/'
stage_1_data_dir_path = './data/processed/stage-1'
processed_data_dir_path = './data/processed'
tz = pytz.timezone('Asia/Calcutta')

# File to collect schedules that can't be processed
orphan_file = Path('./data/processed/stage-1/orphan_scheds.csv')
orphan_file_flag = 'y' if orphan_file.exists() else 'n'

# Orphans from earlier runs are loaded once, before the loop, so that orphans collected from one
# input file are carried over to the next instead of being replaced by a fresh read from disk.
# When no orphan file exists yet, the empty collection is created inside the loop, once the
# column layout of the data is known.
orphan_scheds = pd.read_csv(orphan_file) if orphan_file_flag == 'y' else None

# Getting list of files in the directory
_, _, stage_1_file_names = next(os.walk(stage_1_data_dir_path))

# Retaining only CSV files from stage-1 data; the orphan collection lives in the same directory
# and is not a schedule input, so it is excluded.
stage_1_file_names = [value for value in stage_1_file_names
                      if value.endswith('.csv') and value != orphan_file.name]

# NOTE(review): this override discards the directory listing above and restricts the run to a
# single operator - remove it to process every stage-1 file.
stage_1_file_names = ['Vistara.csv']

# Per-file results, concatenated once after the loop
processed_frames = []

# Processing individual files
for file_name in stage_1_file_names:
    print('\nProcessing for %s' % file_name.split('.')[0])
    
    readpath = stage_1_data_dir_path + '/' + file_name
    dat = read_format_csv(readpath)
    print('(Stage VI) Read Stage-1 processed data: has %i records' % dat.shape[0])
       
    dat = drop_dummy_scheds(dat)
    print('(Stage VII) Dropped dummy schedules: now has %i records' % dat.shape[0])
    
    dat = drop_noncurrent_scheds(dat, tz)
    print('(Stage VIII) Dropped non-current schedules: now has %i records' % dat.shape[0])
    
    dat = wrangle_frequency(dat)
    dat = make_pairs(dat)
    print('(Stage IX) Standardized <frequency> data')
    print('           Paired the <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    dat = merge_freq(dat)
    dat = make_pairs(dat)
    print('(Stage X) Merge schedules split only by frequency')
    print('          Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    dat = merge_freq(dat, 'y')
    dat = make_pairs(dat)
    print('(Stage XI) Merge schedules split by frequency but with human confirmation')
    print('           Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    dat = make_pairs(dat, 'y')
    print('(Stage XII) Pair <from> and <to> legs of a schedule but with human confirmantion')
    print('            : now has %i records' % dat.shape[0])
    
    if orphan_scheds is None:
        orphan_scheds = pd.DataFrame(data = None, columns = dat.columns)
        orphan_file_flag = 'y'
        
    dat, orphan_scheds, orphans = separate_orphan_scheds(dat, orphan_scheds)
    print('(Stage XIII) Separated % i orphan schedules: now has %i records' % (orphans, dat.shape[0]))
    
    dat = to_base10_time(dat)
    processed_frames.append(dat)
    
    print('(Stage XIV) Translated <from_time> and <to_time> to base-10 format')
    print('            and added %i records to final processed schedule file' % dat.shape[0])

# Raises ValueError if no file was processed, rather than silently producing an empty schedule.
processed_scheds = pd.concat(processed_frames, ignore_index = True)



Processing for Vistara
(Stage VI) Read Stage-1 processed data: has 515 records
(Stage VII) Dropped dummy schedules: now has 486 records
(Stage VIII) Dropped non-current schedules: now has 462 records
(Stage IX) Standardized <frequency> data
           Paired the <from> and <to> legs of a schedule: now has 292 records
(Stage X) Merge schedules split only by frequency
          Paired resultant <from> and <to> legs of a schedule: now has 253 records


operator         UK
flight       UK 863
aircraft      A 320
frequency       [2]
from            BOM
from_time          
to              BLR
to_time       10:55
Name: 29, dtype: object

'\n'

operator                     UK
flight                   UK 863
aircraft                  A 320
frequency    [1, 3, 4, 5, 6, 7]
from                        BOM
from_time                      
to                          BLR
to_time                   11:10
Name: 30, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator                     UK
flight                   UK 873
aircraft                  B 738
frequency    [1, 3, 4, 5, 6, 7]
from                        BOM
from_time                      
to                          HYD
to_time                    7:35
Name: 196, dtype: object

'\n'

operator         UK
flight       UK 873
aircraft      B 738
frequency       [2]
from            BOM
from_time          
to              HYD
to_time        7:45
Name: 197, dtype: object

'Probably the same flight.'

Combine?: (y/n): y


operator                     UK
flight                   UK 874
aircraft                  B 738
frequency    [1, 3, 4, 5, 6, 7]
from                        HYD
from_time                  8:15
to                          BOM
to_time                        
Name: 198, dtype: object

'\n'

operator         UK
flight       UK 874
aircraft      B 738
frequency       [2]
from            HYD
from_time      8:20
to              BOM
to_time            
Name: 199, dtype: object

'Probably the same flight.'

Combine?: (y/n): y
(Stage XI) Merge schedules split by frequency but with human confirmation
           Paired resultant <from> and <to> legs of a schedule: now has 247 records


operator                        UK
flight                      UK 845
aircraft                     B 738
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           BOM
from_time                         
to                             BLR
to_time                       7:35
Name: 22, dtype: object

operator                     UK
flight                   UK 845
aircraft                  B 738
frequency    [1, 2, 3, 4, 5, 6]
from                        BOM
from_time                  6:00
to                          BLR
to_time                        
Name: 226, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                        UK
flight                      UK 846
aircraft                     B 738
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           BLR
from_time                     8:20
to                             BOM
to_time                           
Name: 23, dtype: object

operator                     UK
flight                   UK 846
aircraft                  B 738
frequency    [1, 2, 3, 4, 5, 6]
from                        BLR
from_time                      
to                          BOM
to_time                    9:55
Name: 229, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                     UK
flight                   UK 667
aircraft                  A 320
frequency    [1, 2, 3, 4, 5, 6]
from                        DEL
from_time                      
to                          IXC
to_time                    8:15
Name: 54, dtype: object

operator                        UK
flight                      UK 667
aircraft                     A 320
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           DEL
from_time                     7:10
to                             IXC
to_time                           
Name: 99, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                     UK
flight                   UK 668
aircraft                  A 320
frequency    [1, 2, 3, 4, 5, 6]
from                        IXC
from_time                  8:50
to                          DEL
to_time                        
Name: 55, dtype: object

operator                        UK
flight                      UK 668
aircraft                     A 320
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           IXC
from_time                         
to                             DEL
to_time                      10:00
Name: 112, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                  UK
flight                UK 943
aircraft               A 320
frequency    [1, 2, 3, 4, 5]
from                     DEL
from_time               7:30
to                       BOM
to_time                     
Name: 102, dtype: object

operator                        UK
flight                      UK 943
aircraft                     A 320
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           DEL
from_time                         
to                             BOM
to_time                       9:45
Name: 228, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y


operator                  UK
flight                UK 930
aircraft               A 320
frequency    [1, 2, 3, 4, 5]
from                     BOM
from_time                   
to                       DEL
to_time                 9:35
Name: 110, dtype: object

operator                        UK
flight                      UK 930
aircraft                     A 320
frequency    [1, 2, 3, 4, 5, 6, 7]
from                           BOM
from_time                     7:30
to                             DEL
to_time                           
Name: 227, dtype: object

'Possibly Pairs?'

Combine?: (y/n): y
(Stage XII) Pair <from> and <to> legs of a schedule but with human confirmantion
            : now has 241 records
(Stage XIII) Separated  49 orphan schedules: now has 192 records
(Stage XIV) Translated <from_time> and <to_time> to base-10 format
            and added 192 records to final processed schedule file
Orphan schedules data for further processing in ./data/processed/stage-1/orphan_scheds.csv
Finished processing. The final schedule has 192 records


In [119]:
# Persist the orphan schedules so that they can be picked up for further processing.
orphan_scheds.to_csv(stage_1_data_dir_path + '/orphan_scheds.csv', index = False)
print('\nOrphan schedules data for further processing in %s' % (stage_1_data_dir_path + '/' + 'orphan_scheds.csv'))

# float_format must be a %-style format string; '%0.2f' writes float-dtype columns with two decimals.
processed_scheds.to_csv(processed_data_dir_path + '/processed_scheds.csv', index = False, float_format = '%0.2f')
print('\nFinished processing. The final schedule has %i records' % processed_scheds.shape[0])


Orphan schedules data for further processing in ./data/processed/stage-1/orphan_scheds.csv

Finished processing. The final schedule has 192 records
