In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import pytz
from datetime import datetime

In [2]:
def get_yes_no_answer():
    '''
    
    Prompt on standard input until the user confirms or declines.
    
    Accepted replies are 'y', 'n', 'yes' and 'no' in any letter case; surrounding whitespace is ignored. Any
    other reply prints a hint and asks again.
    
    Returns 'y' for an affirmative reply and 'n' for a negative one, so callers can keep testing the result
    with == 'y'.
    
    '''
    
    # Map every accepted spelling onto the one-letter code that callers compare against.
    normalized = {'y': 'y', 'yes': 'y', 'n': 'n', 'no': 'n'}
    
    while True:
        reply = str(input('Combine?: (y/n): ')).lower().strip()
        
        if reply in normalized:
            return normalized[reply]
        
        print("Please select 'yes' or 'no'")

In [4]:
def drop_dummy_scheds(df):
    '''
    
    Remove dummy schedules and renumber the remaining rows from 0.
    
    Dummy schedules are identified when their Effective From Date (<eff_from>) is the same as their Effective 
    To Date (<eff_to>).
    
    The two columns are compared as whole columns, so the function does not depend on the incoming row index
    being 0..n-1. The input frame is not modified; a new frame is returned.
    
    '''
    
    # Keep only rows whose validity window is not a single point.
    is_real_sched = df['eff_from'] != df['eff_to']
    
    # drop = True discards the old index instead of materializing it as an 'index' column first.
    return df[is_real_sched].reset_index(drop = True)

In [5]:
def drop_noncurrent_scheds(df, tz):
    '''
    
    Remove schedules that are, as of today, yet to commence or are no longer in operations, and renumber the
    remaining rows from 0.
    
    df : frame with <eff_from> and <eff_to> columns holding datetime-like values.
    tz : tzinfo used to decide what "today" is.
    
    Rows are selected by position, so the function does not depend on the incoming row index being 0..n-1.
    The input frame is not modified; a new frame is returned.
    
    NOTE(review): both comparisons are inclusive, i.e. a schedule whose <eff_from> or <eff_to> is exactly
    today is dropped as well. That is the existing behaviour and is preserved here; confirm that it is
    intended, since such a schedule is arguably in operation today.
    
    '''
    
    today = datetime.now(tz).date()
    
    # Positions (not labels) of the rows whose validity window strictly contains today.
    keep_pos = [
        pos
        for pos, (starts, ends) in enumerate(zip(df['eff_from'], df['eff_to']))
        if not (today <= datetime.date(starts) or today >= datetime.date(ends))
    ]
    
    return df.iloc[keep_pos].reset_index(drop = True)

In [6]:
def wrangle_frequency(df):
    '''
    
    Standardize <frequency> values for all schedules as a list of numbers, one number per digit of the raw
    value, standing for the weekdays on which a schedule is operational.
    
    Conversion rule per record:
      - any string value is replaced by all seven weekdays, [1, 2, 3, 4, 5, 6, 7];
      - any other value is converted with int() and split into its decimal digits, e.g. 135.0 -> [1, 3, 5].
    
    The <frequency> column of the passed frame is replaced in place and the same frame is returned. The
    column is rewritten in a single assignment rather than cell by cell through df.frequency[i], which is
    chained assignment and does not reliably write back to the frame.
    
    '''

    def _as_weekday_list(raw):
        # Convert frequency into a string of numbers, then into a list of numbers.
        digits = '1234567' if isinstance(raw, str) else str(int(raw))
        return [int(digit) for digit in digits]

    df['frequency'] = df['frequency'].map(_as_weekday_list)
    
    return df

In [77]:
def make_pairs(df, human_intel = 'n'):
    '''
    
    In raw data, each schedule has two legs of information: at origin and at destination. These two legs are
    present as different records. Identify pairs and combine them into a single record.
    
    Allow for human intervention to identify possible pairs when poor/missing data does not allow for 
    automated identification.
    
    human_intel == 'n' : records [i] and [j] are paired automatically when <flight>, <frequency>, <from> and
                         <to> are all identical.
    anything else      : the identical <frequency> constraint is relaxed; records with identical <flight>,
                         <from> and <to> whose <from_time>/<to_time> blanks complement each other are shown
                         and the user is asked to confirm the pairing.
    
    Requirements on df, as relied upon by the code below:
      - the row index is 0..n-1, because records are addressed by their row number as a label;
      - records of the same <flight> are contiguous, because the scan for a partner of record [i] stops at
        the first later record with a different <flight>.
    
    Time cells of the passed frame are filled in place; the returned frame is a new one with the absorbed
    records removed and the rows renumbered from 0.
    
    Embedded helper function combine_pairs_func() appropriately combines records [i] and [j] and marks 
    record [j] for deletion.
    
    '''
    
    def combine_pairs_func(i, j, drop_idx):
        # Record [j] is absorbed into record [i].
        drop_idx.add(j)
        
        # .at writes straight into the frame; df.to_time[i] = ... is chained assignment and does not 
        # reliably write back.
        if df.at[i, 'to_time'] == '': 
            df.at[i, 'to_time'] = df.at[j, 'to_time']
        else: 
            df.at[i, 'from_time'] = df.at[j, 'from_time']
            
        return drop_idx
    
    def legs_complement_each_other(i, j):
        # True when one record carries only <from_time> and the other carries only <to_time>.
        i_from, i_to = df.at[i, 'from_time'], df.at[i, 'to_time']
        j_from, j_to = df.at[j, 'from_time'], df.at[j, 'to_time']
        
        i_departs_j_arrives = i_to == '' and j_from == '' and i_from != '' and j_to != ''
        i_arrives_j_departs = i_to != '' and j_from != '' and i_from == '' and j_to == ''
        
        return i_departs_j_arrives or i_arrives_j_departs
    
    # A set gives constant-time "already paired?" checks inside the double loop.
    drop_idx = set()
    n_records = df.shape[0]
    
    # Compare a record with only subsequent records and as long as the subsequent record has not already been
    # paired with some other record.
    for i in range(n_records - 1):
        if i in drop_idx:
            continue
            
        for j in range(i + 1, n_records):
            if j in drop_idx:
                continue
                
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break
                
            # Both modes require the same origin and destination.
            if df.at[i, 'from'] != df.at[j, 'from'] or df.at[i, 'to'] != df.at[j, 'to']:
                continue
                
            # Automated pairing
            if human_intel == 'n':
                if df.at[i, 'frequency'] == df.at[j, 'frequency']:
                    drop_idx = combine_pairs_func(i, j, drop_idx)
                    
            # Pairing with human intervention by relaxing the identical frequency constraint
            elif legs_complement_each_other(i, j):
                display(df.iloc[i, :8], df.iloc[j, :8])
                display('Possibly Pairs?')
                reply = get_yes_no_answer()
                
                if reply == 'y':
                    drop_idx = combine_pairs_func(i, j, drop_idx)
    
    df = df.drop(sorted(drop_idx), axis = 0)
    df = df.reset_index(drop = True)
    
    return(df)

In [79]:
def merge_freq(df, human_intel = 'n'):
    
    '''
    
    Combine multiple schedule records where the schedules in the set differ only in their frequencies. This 
    requires no human intervention (human_intel == 'n'): records [i] and [j] are merged when <flight>, <from>,
    <to>, <from_time> and <to_time> are all identical.

    Combine multiple schedules where the schedules in the set differ not only in their frequencies but vary 
    slightly in their <from_time> (or <to_time>) values. This requires human intervention (any other value of
    human_intel): candidates need identical <flight>, <from>, <to> and either identical <to_time> or identical
    <from_time>; each candidate pair is shown and the user is asked to confirm.
    
    Requirements on df, as relied upon by the code below:
      - the row index is 0..n-1, because records are addressed by their row number as a label;
      - records of the same <flight> are contiguous, because the scan for record [i] stops at the first later
        record with a different <flight>;
      - <frequency> cells hold lists, which are concatenated when two records are merged.
    
    <frequency> cells of the passed frame are updated in place; the returned frame is a new one with the
    absorbed records removed and the rows renumbered from 0.
    
    Embedded helper function combine_freq_func() appropriately combines records [i] and [j] and marks record 
    [j] for deletion.
    
    '''
    
    def combine_freq_func(i, j, drop_idx):
        drop_idx.add(j)
        
        # Union of both weekday lists, sorted so the result does not depend on set iteration order.
        # .at stores the list as a single cell; df.frequency[i] = ... is chained assignment and does not
        # reliably write back to the frame.
        df.at[i, 'frequency'] = sorted(set(df.at[i, 'frequency'] + df.at[j, 'frequency']))
        return drop_idx
        
    # A set gives constant-time "already merged?" checks inside the double loop.
    drop_idx = set()
    n_records = df.shape[0]
    
    for i in range(n_records - 1):
        if i in drop_idx:
            continue
            
        for j in range(i + 1, n_records):
            if j in drop_idx:
                continue
                
            if df.at[i, 'flight'] != df.at[j, 'flight']:
                break
                
            # Both modes require the same origin and destination.
            if df.at[i, 'from'] != df.at[j, 'from'] or df.at[i, 'to'] != df.at[j, 'to']:
                continue
                
            same_to_time = df.at[i, 'to_time'] == df.at[j, 'to_time']
            same_from_time = df.at[i, 'from_time'] == df.at[j, 'from_time']
            
            # Automated Merging
            if human_intel == 'n':
                if same_to_time and same_from_time:
                    drop_idx = combine_freq_func(i, j, drop_idx)

            # Merging with human intervention by requiring only either to_time or from_time be identical  
            elif same_to_time or same_from_time:
                display(df.iloc[i, :8], '\n', df.iloc[j, :8])
                display('Probably the same flight.')

                reply = get_yes_no_answer()

                if reply == 'y':
                    drop_idx = combine_freq_func(i, j, drop_idx)
      
    df = df.drop(sorted(drop_idx), axis = 0)
    df = df.reset_index(drop = True)
    
    return(df)

In [9]:
def separate_orphan_scheds(df, orphan_scheds):
    '''
    
    Move orphan schedules out of df and append them to orphan_scheds.
    
    A record is an orphan when its <to_time> or its <from_time> is the empty string.
    
    Returns a tuple of three items: df without the orphan records (rows renumbered from 0), orphan_scheds
    with the orphan records appended (rows renumbered from 0), and the number of orphan records moved.
    
    One boolean mask is used both to pick the orphans and to remove them, so the function does not depend
    on the incoming row index being 0..n-1. Neither input frame is modified.
    
    '''
    
    is_orphan = (df['to_time'] == '') | (df['from_time'] == '')
    orphan_scheds = pd.concat([orphan_scheds, df[is_orphan]], ignore_index = True)        

    df = df[~is_orphan].reset_index(drop = True)

    return df, orphan_scheds, int(is_orphan.sum())

In [11]:
def to_base10_time(df):
    '''
    
    Translate <from_time> and <to_time> from 'H:M' strings into decimal hours rounded to two places,
    e.g. '10:30' -> 10.5.
    
    Both columns of the passed frame are replaced in place and the same frame is returned. Each column is
    rewritten in a single assignment rather than cell by cell through df.from_time[i], which is chained
    assignment and does not reliably write back to the frame.
    
    Every value must be a string with at least two ':'-separated integer parts; anything after the second
    part is ignored.
    
    '''
    
    def _to_decimal_hours(clock):
        parts = clock.split(':')
        return round((int(parts[0]) + int(parts[1])/60), 2)
    
    for column in ('from_time', 'to_time'):
        df[column] = df[column].map(_to_decimal_hours)
        
    return df

In [82]:
# Set up defaults
stage_1_data_dir_path = './data/processed/stage-1'
processed_data_dir_path = './data/processed'
orphan_data_dir_path = './data/processed/orphan_scheds'
tz = pytz.timezone('Asia/Calcutta')

# Every iteration below writes into the orphan directory; make sure it exists before any work is done so a
# long interactive run cannot fail at the very last step.
os.makedirs(orphan_data_dir_path, exist_ok = True)

# Getting list of files in the stage-1 directory and processed file directory
_, _, stage_1_file_names = next(os.walk(stage_1_data_dir_path))
_, _, processed_file_names = next(os.walk(processed_data_dir_path))

# Retaining only pkl files from stage-1 directory and processed file directory. The check is on the file
# extension, so names that merely contain '.pkl' somewhere (e.g. 'x.pkl.bak') are not picked up.
stage_1_file_names = [value for value in stage_1_file_names if value.endswith('.pkl')]
processed_file_names = [value for value in processed_file_names if value.endswith('.pkl')]

# Stage-1 files that do not yet have a processed counterpart
pending_file_names = [value for value in stage_1_file_names if value not in processed_file_names]

# NOTE(review): the run is pinned to a single file, which takes precedence over the pending list computed
# above (and re-processes the file even if a processed copy already exists). Set pinned_file_names to an
# empty list to process every pending file instead.
pinned_file_names = ['TruJet.pkl']
file_names = pinned_file_names or pending_file_names

# Processing individual files
for file_name in file_names:
    print('\nProcessing for %s' % file_name.split('.')[0])
    
    # Stage-1 pickles are read with pd.read_pickle; only feed this directory with files this pipeline wrote.
    readpath = stage_1_data_dir_path + '/' + file_name
    dat = pd.read_pickle(readpath)  
    dat = drop_dummy_scheds(dat)
    dat = drop_noncurrent_scheds(dat, tz)
    print('(Stage VI) Read Stage-1 processed data')
    print('           Dropped dummy schedules')
    print('           Dropped non-current schedules: now has %i records' % dat.shape[0])
    
    print('\nStandardizing frequency and finding <from> and <to> legs of schedules... (will take time)')
    dat = wrangle_frequency(dat)
    
    # Sorting the records on flight number. make_pairs() and merge_freq() stop scanning at the first record
    # with a different flight, so records of one flight must be contiguous; ignore_index also gives them the
    # 0..n-1 row index they address records by.
    dat = dat.astype({'flight' : 'str'})
    dat = dat.sort_values(by = ['flight'], axis = 0, ignore_index = True)
    
    dat = make_pairs(dat)
    print('(Stage VII) Standardized <frequency> data')
    print('            Paired the <from> and <to> legs of schedules: now has %i records' % dat.shape[0])
    
    print('\nFinding schedules split only by frequency... (will take time)')
    dat = merge_freq(dat)
    dat = make_pairs(dat)
    
    # Doing it once over again.....may not be required..... 
    dat = merge_freq(dat)
    dat = make_pairs(dat)
    print('(Stage VIII) Merged schedules split only by frequency')
    print('             Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    print('\nFinding schedules split by frequency for human confirmation\n')
    dat = merge_freq(dat, 'y')
    dat = make_pairs(dat)
    print('(Stage IX) Merge schedules split by frequency but with human confirmation')
    print('           Paired resultant <from> and <to> legs of a schedule: now has %i records' % dat.shape[0])
    
    print('\nFinding possible pairs for human confirmation\n')
    dat = make_pairs(dat, 'y')
    print('(Stage X) Pair <from> and <to> legs of a schedule but with human confirmantion')
    print('          : now has %i records' % dat.shape[0])
    
    # Start from an empty frame with the same columns; orphans are collected per input file.
    orphan_scheds = pd.DataFrame(data = None, columns = dat.columns)
        
    dat, orphan_scheds, orphans = separate_orphan_scheds(dat, orphan_scheds)
    print('(Stage XI) Separated % i orphan schedules: now has %i records' % (orphans, dat.shape[0]))
    
    dat = to_base10_time(dat)
    dat.to_pickle(processed_data_dir_path + '/' + file_name)
    orphan_scheds.to_pickle(orphan_data_dir_path + '/orphan_' + file_name)
    
    
    print('(Stage XII) Translated <from_time> and <to_time> to base-10 format')
    print('            and added %i records to final processed schedule file' % dat.shape[0])



Processing for TruJet
(Stage VI) Read Stage-1 processed data
           Dropped dummy schedules
           Dropped non-current schedules: now has 160 records

Standardizing frequency and finding <from> and <to> legs of schedules... (will take time)
(Stage VII) Standardized <frequency> data
            Paired the <from> and <to> legs of schedules: now has 104 records

Finding schedules split only by frequency... (will take time)
(Stage VIII) Merged schedules split only by frequency
             Paired resultant <from> and <to> legs of a schedule: now has 94 records

Finding schedules split by frequency for human confirmation



operator        TRJ
flight       2T 110
aircraft     ATR 72
frequency       [2]
from            BLR
from_time     11:35
to              HYD
to_time            
Name: 3, dtype: object

'\n'

operator                    TRJ
flight                   2T 110
aircraft                 ATR 72
frequency    [1, 3, 4, 5, 6, 7]
from                        BLR
from_time                 12:10
to                          HYD
to_time                        
Name: 5, dtype: object

'Probably the same flight.'

KeyboardInterrupt: 

In [91]:
# Set up defaults
processed_data_dir_path = './data/processed'
orphan_data_dir_path = './data/processed/orphan_scheds'

# Names of the combined outputs written at the end of this cell. They live in the very directories that are
# scanned for inputs, so they must be excluded from the inputs; otherwise every re-run would concatenate the
# previous combined file into the new one and duplicate all records.
all_sched_pkl_name = 'all-sched.pkl'
all_orphan_pkl_name = 'all-orphan.pkl'

# Getting list of files in the processed data directory and orphan data directory
_, _, processed_file_names = next(os.walk(processed_data_dir_path))
_, _, orphan_file_names = next(os.walk(orphan_data_dir_path))


# Retaining only pkl files from processed data directory and orphan data directory, minus the combined
# outputs. Sorting makes the row order of the combined files independent of directory listing order.
processed_file_names = sorted(value for value in processed_file_names 
                              if value.endswith('.pkl') and value != all_sched_pkl_name)
orphan_file_names = sorted(value for value in orphan_file_names 
                           if value.endswith('.pkl') and value != all_orphan_pkl_name)

# Read every file first and concatenate once. This does not rely on a <dat> frame left over from an earlier
# cell, so the cell also runs on a fresh kernel.
processed_frames = [pd.read_pickle(processed_data_dir_path + '/' + file_name) 
                    for file_name in processed_file_names]
orphan_frames = [pd.read_pickle(orphan_data_dir_path + '/' + file_name) 
                 for file_name in orphan_file_names]

# pd.concat() rejects an empty list, so fall back to an empty frame when a directory has no input files.
processed_data = pd.concat(processed_frames, ignore_index = True) if processed_frames else pd.DataFrame()
orphan_data = pd.concat(orphan_frames, ignore_index = True) if orphan_frames else pd.DataFrame()


processed_data.to_pickle(processed_data_dir_path + '/' + all_sched_pkl_name)
orphan_data.to_pickle(orphan_data_dir_path + '/' + all_orphan_pkl_name)

processed_data.to_excel(processed_data_dir_path + '/' + 'all-sched.xlsx')
orphan_data.to_excel(orphan_data_dir_path + '/' + 'all-orphan.xlsx')
                     
processed_data.to_csv(processed_data_dir_path + '/' + 'all-sched.csv')
orphan_data.to_csv(orphan_data_dir_path + '/' + 'all-orphan.csv')