In [17]:
import numpy as np
import pandas as pd
import os
import pickle

In [18]:
def read_excel_data(file_name):
    '''
    Read one raw schedule workbook and return it with standardized, reordered columns.

    Parameters
    ----------
    file_name : path (or any other input accepted by pandas.read_excel) of the raw file.
        The sheet must have exactly 11 columns; any other count makes the column
        renaming raise ValueError.

    Returns
    -------
    pandas.DataFrame whose columns are, in order:
    num, operator, flight, aircraft, frequency, from, from_time, to, to_time,
    eff_from, eff_to.
    Empty cells are read as '' rather than NaN because na_filter is switched off.
    '''

    try:
        dat = pd.read_excel(file_name, na_filter = False, convert_float = False)
    except TypeError:
        # pandas >= 2.0 removed the `convert_float` keyword, so the call above
        # fails with TypeError there. Retry without it.
        # NOTE(review): without convert_float=False, integral cells may come back
        # as int instead of float - confirm downstream steps tolerate that.
        dat = pd.read_excel(file_name, na_filter = False)

    # Names are assigned by position, i.e. in the order the columns appear in the sheet
    dat.columns = ['num', 'flight', 'operator', 'aircraft', 'frequency', 'from', 'to_time', 'to',
                   'from_time', 'eff_from', 'eff_to']

    # Reorder for human readability: operator first, each station next to its time
    readable_order = ['num', 'operator', 'flight', 'aircraft', 'frequency', 'from', 'from_time',
                      'to', 'to_time', 'eff_from', 'eff_to']

    return dat[readable_order]

In [19]:
def wrangle_rows(df):
    '''
    Remove repeated header rows and city sub-header rows from a raw schedule frame.

    Columns are addressed by position: column 0 (<num>) and column 4 (<frequency>),
    so the frame needs at least five columns and a column named 'num'.

    Steps
    -----
    1. Drop every row whose first cell is not a float and whose fifth cell is the
       literal text 'Frequency' (a repeated header row).
    2. Treat every remaining row whose first cell is a string as a city sub-header:
       copy that string into the first cell of each row below it, up to the next
       sub-header (or the end of the frame), then drop the sub-header row itself.
       Rows that precede the first sub-header keep their original first cell.
    3. Rename <num> to <station>.

    The input frame is not modified. The result has a fresh 0..n-1 index.
    A frame with no sub-header rows is returned with only steps 1 and 3 applied.

    Raises
    ------
    KeyError if there is no column called 'num' to rename.
    '''

    n_rows = df.shape[0]

    # Step 1: positions of the superfluous header rows
    header_rows = {pos for pos, (first, fifth) in enumerate(zip(df.iloc[:, 0], df.iloc[:, 4]))
                   if not isinstance(first, float) and fifth == 'Frequency'}

    # Select survivors by position so this works whatever index the caller's frame has
    kept_rows = [pos for pos in range(n_rows) if pos not in header_rows]
    df = df.iloc[kept_rows].reset_index(drop = True)

    # Step 2: positions of the city-name sub-header rows
    city_rows = [pos for pos, first in enumerate(df.iloc[:, 0]) if isinstance(first, str)]

    # Each sub-header governs the rows up to the next sub-header; the last one runs
    # to the end of the frame. With no sub-headers the loop simply does not execute.
    section_ends = city_rows[1:] + [df.shape[0]]
    for start, stop in zip(city_rows, section_ends):
        df.iloc[start + 1: stop, 0] = df.iloc[start, 0]

    df = df.drop(df.index[city_rows], axis = 0).reset_index(drop = True)

    # Step 3
    df = df.rename(columns = {'num': 'station'}, errors = 'raise')

    return df

In [20]:
def wrangle_cols(df):
    '''
    Fill blank <from> / <to> cells with the row's station name, then drop <station>.

    A cell counts as blank when it equals the empty string ''.

    Parameters
    ----------
    df : DataFrame with 'station', 'from' and 'to' columns.

    Returns
    -------
    A new DataFrame without the 'station' column. The frame passed in is left
    untouched (the work is done on a copy), so re-running this step on the same
    input gives the same result.
    '''
    filled = df.copy()

    for col in ('from', 'to'):
        blank = filled[col] == ''
        filled.loc[blank, col] = filled.loc[blank, 'station']

    return filled.drop(['station'], axis = 1)

In [21]:
def fix_city_spelling(df, city_names):
    '''
    Replace known misspelt city names in the <from> and <to> columns.

    Parameters
    ----------
    df : DataFrame with 'from' and 'to' columns.
    city_names : dict mapping a misspelt name to its corrected spelling.
        Values that are not keys of this dict are left as they are.

    Returns
    -------
    The same DataFrame object that was passed in, with both columns updated
    in place.
    '''

    for col in ('from', 'to'):
        # Whole-column assignment instead of df[col][i] = ...: chained assignment
        # can end up writing to a temporary copy and leave df unchanged.
        df[col] = df[col].map(lambda city: city_names.get(city, city))

    return df

In [22]:
def wrangle_iata_codes(df, city_to_codes):
    '''
    Convert city names in the <from> and <to> columns to their IATA codes.

    A cell is looked up in city_to_codes only when str.isupper() is False for it;
    cells that are already all upper-case are left alone. Every cell must therefore
    be a string. Cells with no entry in city_to_codes are left unchanged and their
    value is recorded.

    Parameters
    ----------
    df : DataFrame with 'from' and 'to' columns.
    city_to_codes : dict mapping a city name to its IATA code.

    Returns
    -------
    (flag, missing_city_names, df)
        flag : 'up' when every lookup succeeded, 'down' when at least one failed.
        missing_city_names : list of distinct values with no IATA code, in order of
            first appearance (all of <from> first, then <to>).
        df : the same DataFrame object that was passed in, updated in place.
    '''

    # Cell values in the df for which an equivalent IATA code is missing
    missing_city_names = []

    for col in ('from', 'to'):
        converted = []
        for name in df[col]:
            if not name.isupper():
                try:
                    name = city_to_codes[name]
                except KeyError:
                    if name not in missing_city_names:
                        missing_city_names.append(name)
            converted.append(name)

        # Whole-column assignment instead of df[col][i] = ...: chained assignment
        # can end up writing to a temporary copy and leave df unchanged.
        df[col] = converted

    # Flag to track if there are missing city names
    flag = 'down' if missing_city_names else 'up'

    return flag, missing_city_names, df

In [23]:
# Driver: run every raw Excel file that has no stage-1 counterpart yet through the
# wrangling steps defined above and save the result under the same file name.
raw_data_dir_path ='./data/raw/excel'
stage_1_data_dir_path = './data/processed/stage-1'

# Getting the list of files sitting directly in each directory (no recursion)
_, _, raw_file_names = next(os.walk(raw_data_dir_path))
_, _, stage_1_file_names = next(os.walk(stage_1_data_dir_path))

# Retaining only Excel files from raw and stage-1 data.
# Match on the extension itself so that e.g. 'x.xlsx.bak' is not picked up.
raw_file_names = [i for i in raw_file_names if i.endswith('.xlsx')]
stage_1_file_names = [i for i in stage_1_file_names if i.endswith('.xlsx')]

# Loading the required dictionaries
# NOTE(review): unpickling executes arbitrary code - only load dictionary files
# that come from a trusted source.
##### Correcting known city name spelling errors
with open('./data/processed/dicts/city_spelling_corrected_dict.txt', 'rb') as handle:
    city_names = pickle.load(handle)
##### Mapping city names to corresponding IATA codes
with open('./data/processed/dicts/city_to_codes_dict.txt', 'rb') as handle:
    city_to_codes = pickle.load(handle)

# Only files that have not been saved to stage-1 yet need processing
file_names = [value for value in raw_file_names if value not in stage_1_file_names]

# Processing individual files
for file_name in file_names:
    print('\nProcessing for %s' % file_name.split('.')[0])

    readpath = os.path.join(raw_data_dir_path, file_name)
    dat = read_excel_data(readpath)
    print('(Stage I) Read raw data: has %i records' % dat.shape[0])

    dat = wrangle_rows(dat)
    print('(Stage II) Deleted header rows: now has %i records' % dat.shape[0])

    dat = wrangle_cols(dat)
    print('(Stage III) Updated the <From>(or <To>) column with Station name')

    dat = fix_city_spelling(dat, city_names)
    print('(Stage IV) Fixed known city name spelling errors')

    flag, missing_city_names, dat = wrangle_iata_codes(dat, city_to_codes)
    if flag == 'up':
        # Save next to the other stage-1 files; the directory is defined once above
        savepath = os.path.join(stage_1_data_dir_path, file_name)
        dat.to_excel(savepath)
        print('(Stage V) Replaced city names with corresponding IATA codes and saved the processed file')
    else:
        # Nothing is written for this file, so it is picked up again on the next run
        print('(Stage V)There are missing IATA codes and/or incorrectly spelt city names.')
        print('File partially processed and not saved.')
        print('The list of city names either with wrong spelling or missing IATA codes\n', missing_city_names)
        print('Fix errors in the data dictionaries and process again.')
        print(' Process will begin from the file that had missing IATA codes')



Processing for Vistara
(Stage I) Read raw data: has 554 records
(Stage II) Deleted header rows: now has 515 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for SpiceJet
(Stage I) Read raw data: has 1680 records
(Stage II) Deleted header rows: now has 1592 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for TruJet
(Stage I) Read raw data: has 201 records
(Stage II) Deleted header rows: now has 174 records
(Stage III) Updated the <From>(or <To>) column with Station name
(Stage IV) Fixed known city name spelling errors
(Stage V) Replaced city names with corresponding IATA codes and saved the processed file

Processing for Air_India
(Stage I) Re