In [17]:
import numpy as np
import pandas as pd
import os
import pickle

In [None]:
def get_yes_no_answer():
    '''
    
    Prompt the user until a yes/no answer is given.
    
    Accepts 'y', 'n', 'yes' or 'no' in any letter case; surrounding whitespace is ignored.
    
    Returns
    -------
    str
        'y' for an affirmative answer and 'n' for a negative one. The long forms are normalised to the
        single letters so that callers only ever need to compare against 'y' / 'n'.
    
    '''
    
    # Every accepted spelling mapped to the single-letter form that is returned.
    accepted = {'y': 'y', 'yes': 'y', 'n': 'n', 'no': 'n'}
    
    while True:
        reply = input('Combine?: (y/n): ').lower().strip()
        
        if reply in accepted:
            return accepted[reply]
        
        # Anything else: explain what is expected and ask again.
        print("Please select 'yes' or 'no'")

In [18]:
def drop_dummy_scheds(df):
    '''
    
    Remove dummy schedules from the data.
    
    Dummy schedules are identified when their Effective From Date is the same as Effective To Date.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns <eff_from> and <eff_to>.
    
    Returns
    -------
    pandas.DataFrame
        A new frame without the dummy rows, re-indexed 0..n-1. The frame passed in is not modified.
    
    '''
    
    # One vectorised comparison instead of looking rows up by the labels 0..n-1, so the result does not
    # depend on which index the incoming frame happens to carry.
    is_dummy = df['eff_from'] == df['eff_to']
    
    # drop = True discards the old index outright; the earlier reset_index() + drop('index') pair failed
    # whenever the frame already had a column called 'index'.
    return df.loc[~is_dummy].reset_index(drop = True)

In [19]:
def drop_noncurrent_scheds(df, tz):
    '''
    
    Remove schedules that are, as of today, yet to commence or are no longer in operations.
    
    A record is kept only when eff_from < today < eff_to, comparing calendar dates. Both bounds are
    exclusive, so a schedule whose <eff_from> or <eff_to> is exactly today is dropped.
    NOTE(review): confirm that excluding the boundary days is intended.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns <eff_from> and <eff_to> holding datetime values.
    tz : datetime.tzinfo or None
        Time zone passed to datetime.now() to decide what "today" is.
    
    Returns
    -------
    pandas.DataFrame
        A new frame with only the current schedules, re-indexed 0..n-1. The input is not modified.
    
    '''
    
    # Imported locally: the notebook's import cell does not bring `datetime` into scope, so on a fresh
    # kernel the function would otherwise fail with a NameError.
    from datetime import datetime
    
    today = datetime.now(tz).date()
    
    # Walk the two columns positionally rather than by the labels 0..n-1, so any index is accepted.
    current_rows = [
        pos
        for pos, (start, end) in enumerate(zip(df['eff_from'], df['eff_to']))
        if not (today <= datetime.date(start) or today >= datetime.date(end))
    ]
    
    return df.iloc[current_rows].reset_index(drop = True)

In [20]:
def wrangle_frequency(df):
    '''
    
    Standardize <frequency> values for all schedules as a list of numbers that can take values from 1 to 7 for the
    weekdays on which a schedule is operational.
    
    Any textual entry is treated as "operates every day" and becomes [1, 2, 3, 4, 5, 6, 7]. Any other
    entry is converted with int() and split into its digits, e.g. 135 -> [1, 3, 5].
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column <frequency>.
    
    Returns
    -------
    pandas.DataFrame
        The same frame object, with <frequency> replaced by the lists described above.
    
    Raises
    ------
    ValueError
        If a non-text entry cannot be converted with int() (for example a missing value).
    
    '''
    
    def to_weekday_list(value):
        # Convert frequency into a string of digits, then split that string into a list of ints.
        digits = '1234567' if isinstance(value, str) else str(int(value))
        return [int(digit) for digit in digits]
    
    weekday_lists = [to_weekday_list(value) for value in df['frequency']]
    
    # Replace the column in a single assignment. Writing cell by cell through df.frequency[i] is chained
    # assignment, which pandas does not guarantee to write back into the frame, and it also required the
    # index labels to be exactly 0..n-1.
    df['frequency'] = pd.Series(weekday_lists, index = df.index, dtype = object)
    
    return df

In [21]:
def combine_pairs(df, human_intel = 'n'):
    '''
    
    In raw data, each schedule has two legs of information: at origin and at destination. These two legs are
    present as different records. Identify pairs and combine them into a single record.
    
    Allow for human intervention to identify possible pairs when poor/missing data does not allow for automated 
    identification.
    
    Embedded helper function combine_pairs_func() appropriately combines records [i] and [j] and marks record [j] 
    for deletion.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns <flight>, <frequency>, <from>, <to>, <from_time> and <to_time>. A missing
        time is recognised by the empty string ''.
    human_intel : str, default 'n'
        'n' pairs records automatically: two records are combined when <flight>, <frequency>, <from> and
        <to> are all identical. Any other value switches to interactive mode: the identical <frequency>
        requirement is relaxed, candidate records that each miss the time the other one has are displayed,
        and the user confirms each pairing through get_yes_no_answer().
    
    Returns
    -------
    pandas.DataFrame
        A new frame in which every combined pair is a single record, re-indexed 0..n-1. The frame passed
        in is not modified.
    
    '''
    
    # Work on a re-indexed copy: row positions and labels then agree whatever index the caller's frame
    # carried, and the caller's frame is left untouched.
    df = df.reset_index(drop = True)
    n_rows = df.shape[0]
    
    # Records already merged into an earlier record. A set keeps the membership tests in the loops cheap.
    drop_idx = set()
    
    def combine_pairs_func(i, j):
        drop_idx.add(j)
        
        # .at writes straight into the frame; chained assignment such as df.to_time[i] = ... is not
        # guaranteed by pandas to do so.
        if df.at[i, 'to_time'] == '': 
            df.at[i, 'to_time'] = df.at[j, 'to_time']
        else: 
            df.at[i, 'from_time'] = df.at[j, 'from_time']
    
    def same_flight_and_route(i, j):
        return (df.at[i, 'flight'] == df.at[j, 'flight']
                and df.at[i, 'from'] == df.at[j, 'from']
                and df.at[i, 'to'] == df.at[j, 'to'])
    
    def complementary_legs(i, j):
        # True when each record has exactly the one time that the other record is missing.
        i_from, i_to = df.at[i, 'from_time'], df.at[i, 'to_time']
        j_from, j_to = df.at[j, 'from_time'], df.at[j, 'to_time']
        return ((i_to == '' and j_from == '' and i_from != '' and j_to != '')
                or (i_to != '' and j_from != '' and i_from == '' and j_to == ''))
    
    # Compare a record with only subsequent records and as long as the subsequent record has not already been
    # paired with some other record.
    for i in range(n_rows - 1):
        if i in drop_idx:
            continue
        
        for j in range(i + 1, n_rows):
            if j in drop_idx or not same_flight_and_route(i, j):
                continue
            
            # Automated pairing
            if human_intel == 'n':
                if df.at[i, 'frequency'] == df.at[j, 'frequency']:
                    combine_pairs_func(i, j)
            
            # Pairing with human intervention by relaxing the identical frequency constraint
            elif complementary_legs(i, j):
                display(df.iloc[i, :8], df.iloc[j, :8])
                display('Possibly Pairs?')
                
                if get_yes_no_answer() == 'y':
                    combine_pairs_func(i, j)
    
    return df.drop(sorted(drop_idx), axis = 0).reset_index(drop = True)

In [22]:
def combine_freq(df, human_intel = 'n'):
    
    '''
    
    Combine multiple schedule records where the schedules in the set differ only in their frequencies. This 
    requires no human intervention.

    Combine multiple schedules where the schedules in the set differ not only in their frequencies but vary 
    slightly in their <from_time> (or <to_time>) values. This requires human intervention
    
    Embedded helper function combine_freq_func() appropriately combines records [i] and [j] and marks record [j] 
    for deletion.
    
    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns <flight>, <from>, <to>, <from_time>, <to_time> and <frequency>, where
        <frequency> holds lists.
    human_intel : str, default 'n'
        'n' merges automatically when <flight>, <from>, <to>, <from_time> and <to_time> are all identical.
        Any other value switches to interactive mode: only one of <from_time> / <to_time> has to match,
        and the user confirms each merge through get_yes_no_answer().
    
    Returns
    -------
    pandas.DataFrame
        A new frame in which each merged set is one record whose <frequency> is the sorted union of the
        merged frequencies, re-indexed 0..n-1. The frame passed in is not modified.
    
    '''
    
    # Work on a re-indexed copy: row positions and labels then agree whatever index the caller's frame
    # carried, and the caller's frame is left untouched.
    df = df.reset_index(drop = True)
    n_rows = df.shape[0]
    
    # Records already merged into an earlier record. A set keeps the membership tests in the loops cheap.
    drop_idx = set()
    
    def combine_freq_func(i, j):
        drop_idx.add(j)
        # sorted() gives the merged weekdays a guaranteed order (iteration order of a set is not a
        # documented guarantee). .at is used because it can store a list in a single cell and writes
        # straight into the frame, unlike chained assignment.
        df.at[i, 'frequency'] = sorted(set(df.at[i, 'frequency'] + df.at[j, 'frequency']))
    
    def same_flight_and_route(i, j):
        return (df.at[i, 'flight'] == df.at[j, 'flight']
                and df.at[i, 'from'] == df.at[j, 'from']
                and df.at[i, 'to'] == df.at[j, 'to'])
    
    for i in range(n_rows - 1):
        if i in drop_idx:
            continue
        
        for j in range(i + 1, n_rows):
            if j in drop_idx or not same_flight_and_route(i, j):
                continue
            
            same_to_time = df.at[i, 'to_time'] == df.at[j, 'to_time']
            same_from_time = df.at[i, 'from_time'] == df.at[j, 'from_time']
            
            # Automated Merging
            if human_intel == 'n':
                if same_to_time and same_from_time:
                    combine_freq_func(i, j)
            
            # Merging with human intervention by requiring only either to_time or from_time be identical  
            elif same_to_time or same_from_time:
                display(df.iloc[i, :8], '\n', df.iloc[j, :8])
                display('Probably the same flight.')
                
                if get_yes_no_answer() == 'y':
                    combine_freq_func(i, j)
    
    return df.drop(sorted(drop_idx), axis = 0).reset_index(drop = True)

In [None]:
def drop_orphan_scheds(dat):
    '''
    
    Remove orphan schedules: records whose <to_time> or <from_time> is still the empty string ''.
    
    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns <to_time> and <from_time>.
    
    Returns
    -------
    pandas.DataFrame
        A new frame without the orphan records, re-indexed 0..n-1. The frame passed in is not modified.
    
    '''
    
    is_orphan = (dat['to_time'] == '') | (dat['from_time'] == '')
    
    # The filtered frame has to be returned: previously the function ended on a bare `dat.shape`
    # expression, so callers received None and the cleaned data was lost.
    return dat.loc[~is_orphan].reset_index(drop = True)

In [23]:
# Stage-1 driver: every raw Excel file that does not yet have a processed counterpart in the stage-1
# directory is read, cleaned and, if all city names resolve to IATA codes, saved to stage-1.
# NOTE(review): read_excel_data, wrangle_rows, wrangle_cols, fix_city_spelling and wrangle_iata_codes are
# not defined in the visible part of this notebook; they must be defined before this cell is run.
raw_data_dir_path ='./data/raw/excel'
stage_1_data_dir_path = './data/processed/stage-1'

if not os.path.isdir(raw_data_dir_path):
    raise FileNotFoundError('Raw data directory not found: %s' % raw_data_dir_path)

# On a first run the stage-1 directory may not exist yet. Create it so that listing it does not fail
# (next() on an empty os.walk raises StopIteration) and so that processed files can be saved later.
os.makedirs(stage_1_data_dir_path, exist_ok = True)

# Getting list of files in the directory
_, _, raw_file_names = next(os.walk(raw_data_dir_path))
_, _, stage_1_file_names = next(os.walk(stage_1_data_dir_path))

# Retaining only Excel files from raw and stage-1 data
# (matched on the file extension rather than on '.xlsx' appearing anywhere in the name)
raw_file_names = [name for name in raw_file_names if name.endswith('.xlsx')]
stage_1_file_names = {name for name in stage_1_file_names if name.endswith('.xlsx')}

# Loading the required dictionaries
# NOTE: pickle can execute arbitrary code while loading; only use dictionary files from a trusted source.
##### Correcting known city name spelling errors
with open('./data/processed/dicts/city_spelling_corrected_dict.txt', 'rb') as handle:
    city_names = pickle.load(handle)
##### Mapping city names to corresponding IATA codes
with open('./data/processed/dicts/city_to_codes_dict.txt', 'rb') as handle:
    city_to_codes = pickle.load(handle)

# Only raw files without a stage-1 counterpart are processed, so re-running the cell skips finished files.
file_names = [name for name in raw_file_names if name not in stage_1_file_names]

# Processing individual files
for file_name in file_names:
    print('\nProcessing for %s' % file_name.split('.')[0])
    
    readpath = os.path.join(raw_data_dir_path, file_name)
    dat = read_excel_data(readpath)
    print('(Stage I) Read raw data: has %i records' % dat.shape[0])
    
    dat = wrangle_rows(dat)
    print('(Stage II) Deleted header rows: now has %i records' % dat.shape[0])
    
    dat = wrangle_cols(dat)
    print('(Stage III) Updated the <From>(or <To>) column with Station name')
    
    dat = fix_city_spelling(dat, city_names)
    print('(Stage IV) Fixed known city name spelling errors')
    
    flag, missing_city_names, dat = wrangle_iata_codes(dat, city_to_codes)
    if flag == 'up':
        # Save next to the other stage-1 files, using the directory variable instead of a second literal.
        savepath = os.path.join(stage_1_data_dir_path, file_name)
        dat.to_excel(savepath)
        print('(Stage V) Replaced city names with corresponding IATA codes and saved the processed file')
    else:
        # The file is left unsaved, so it is picked up again the next time this cell is run.
        print('(Stage V)There are missing IATA codes and/or incorrectly spelt city names.')
        print('File partially processed and not saved.')
        print('The list of city names either with wrong spelling or missing IATA codes\n', missing_city_names)
        print('Fix errors in the data dictionaries and process again.')
        print(' Process will begin from the file that had missing IATA codes')



Processing for Vistara
(Stage I) Read raw data: has 554 records
(Stage II) Deleted header rows: now has 515 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for SpiceJet
(Stage I) Read raw data: has 1680 records
(Stage II) Deleted header rows: now has 1592 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for TruJet
(Stage I) Read raw data: has 201 records
(Stage II) Deleted header rows: now has 174 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for Air_India
(Stage I) Re