In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Relative path of the folder that holds the raw Excel workbooks read below.
dir_path ='./data/raw/excel'

In [3]:
def read_excel_data(file_name):
    """Read a raw schedule workbook and give it the notebook's column layout.

    Parameters
    ----------
    file_name : str
        Path of the Excel file; passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet with its eleven columns renamed and then reordered to
        num, flight, operator, aircraft, frequency, from, from_time, to,
        to_time, eff_from, eff_to.
    """
    # Names in the order the columns are stored in the sheet.
    sheet_columns = ['num', 'flight', 'operator', 'aircraft', 'frequency', 'from',
                     'to_time', 'to', 'from_time', 'eff_from', 'eff_to']

    # na_filter=False switches off missing-value detection; convert_float=False
    # asks pandas not to turn integral floats into ints.
    dat = pd.read_excel(file_name, na_filter=False, convert_float=False)
    dat.columns = sheet_columns

    # Reposition the columns: from_time goes to slot 6, then to goes to slot 7.
    for slot, column in ((6, 'from_time'), (7, 'to')):
        dat.insert(slot, column, dat.pop(column))

    return dat

In [4]:
# Choose the workbook, build its path under dir_path and load it.
file_name = 'Vistara.xlsx'

file_name = '/'.join((dir_path, file_name))
dat = read_excel_data(file_name)
dat.head()

Unnamed: 0,num,flight,operator,aircraft,frequency,from,from_time,to,to_time,eff_from,eff_to
0,Ahmedabad-changed,,,,,,,,,,
1,1,UK 939,UK,A 320,Daily,DEL,,,6:25,2019-11-25 00:00:00,2020-03-28 00:00:00
2,2,UK 936,UK,A 320,Daily,,7:05,DEL,,2019-11-25 00:00:00,2020-03-28 00:00:00
3,3,UK 959,UK,A 320,Daily,DEL,,,8:00,2019-10-27 00:00:00,2020-03-28 00:00:00
4,4,UK 946,UK,A 320,Daily,,8:40,DEL,,2019-10-27 00:00:00,2020-03-28 00:00:00


In [5]:
# (rows, columns) of the sheet as loaded, before any cleaning.
dat.shape

(554, 11)

In [6]:
def wrangle_rows(df):
    """Remove header-like and city sub-header rows, keeping the city as a column.

    Two kinds of rows are removed:

    * rows whose first cell is not a float and whose fifth cell equals
      ``'Frequency'`` (superfluous header rows);
    * rows whose first cell is a string (city-name sub-headers).

    Before the sub-header rows are removed, each one's first cell is copied
    into the first column of every row that follows it, up to the next
    sub-header or the end of the frame.  Finally the ``num`` column is renamed
    to ``station``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame laid out as produced by ``read_excel_data``.  Rows are located
        by position and dropped by label, so a default 0..n-1 index is
        expected.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.  When there is no sub-header
        row at all, the first column is left as it is (previously this case
        raised ``IndexError``).

    Raises
    ------
    KeyError
        If there is no ``num`` column to rename.
    """
    n_rows = df.shape[0]

    # Superfluous header rows: non-float first cell and the literal header
    # text in the frequency position (column 4).
    header_rows = [i for i in range(n_rows)
                   if (not isinstance(df.iloc[i, 0], float)) and (df.iloc[i, 4] == 'Frequency')]
    df = df.drop(header_rows, axis=0).reset_index(drop=True)

    # City-name sub-header rows: the first cell is a string.
    subhead_rows = [i for i in range(df.shape[0]) if isinstance(df.iloc[i, 0], str)]

    # Each section runs from just after its sub-header to the next sub-header,
    # the last one to the end of the frame.  With no sub-headers the zip is
    # empty and nothing is written.
    section_ends = subhead_rows[1:] + [df.shape[0]]
    for start, end in zip(subhead_rows, section_ends):
        df.iloc[start + 1:end, 0] = df.iloc[start, 0]

    df = df.drop(subhead_rows, axis=0).reset_index(drop=True)

    return df.rename(columns={'num': 'station'}, errors='raise')

In [7]:
# Remove header and sub-header rows; the shape shows how many rows remain.
dat = wrangle_rows(dat)
dat.shape

(515, 11)

In [8]:
def wrangle_cols(df):
    """Fill blank ``from``/``to`` cells with the row's station, then drop ``station``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``station``, ``from`` and ``to`` columns.  The blank cells
        of ``from`` and ``to`` are filled on this frame itself.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``station`` column.
    """
    for endpoint in ('from', 'to'):
        # An empty string marks the endpoint that is the station itself.
        blank = df[endpoint] == ''
        df.loc[blank, endpoint] = df.loc[blank, 'station']

    return df.drop(columns=['station'])

In [9]:
# Fill the blank endpoint of each row with its station and preview the result.
dat = wrangle_cols(dat)
dat.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,from_time,to,to_time,eff_from,eff_to
0,UK 939,UK,A 320,Daily,DEL,,Ahmedabad-changed,6:25,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,Daily,Ahmedabad-changed,7:05,DEL,,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 959,UK,A 320,Daily,DEL,,Ahmedabad-changed,8:00,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,Daily,Ahmedabad-changed,8:40,DEL,,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,123456,DEL,,Ahmedabad-changed,10:20,2019-10-28 00:00:00,2020-03-28 00:00:00


In [10]:
def fix_city_spelling(df):
    """Replace known alternative city spellings in ``from`` and ``to``.

    Values that exactly match a key of the spelling table are replaced by the
    corresponding value; everything else is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``from`` and ``to`` columns.  Both columns are replaced on
        this frame itself.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    city_names = {'Delhi' : 'New Delhi', 'Bhubaneshwar' : 'Bhubaneswar',
                  'Trivandrum' : 'Thiruvananthapuram', 'Pondicherry' : 'Puducherry',
                  'Porbander' : 'Porbandar', 'Tirupathi' : 'Tirupati',
                  'Tuticorin' : 'Thoothukudi', 'Vizag' : 'Visakhapatnam',
                  'Cuddapah': 'Kadapa', 'Jalgoan' : 'Jalgaon',
                  'Rajamundry' : 'Rajahmundry', 'Aizawal' : 'Aizawl',
                  'Trichy' : 'Tiruchirappally', 'Bathinda' : 'Bhatinda',
                  'Passighat' : 'Pasighat'}

    # Whole-column assignment instead of `df[col][i] = ...`: chained indexing
    # may write to a temporary copy, and it looked rows up by label although
    # `i` was a position.
    for column in ('from', 'to'):
        df[column] = df[column].map(lambda city: city_names.get(city, city))

    return df

In [11]:
# Normalise the city spellings and preview the result.
dat = fix_city_spelling(dat)
dat.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,from_time,to,to_time,eff_from,eff_to
0,UK 939,UK,A 320,Daily,DEL,,Ahmedabad-changed,6:25,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,Daily,Ahmedabad-changed,7:05,DEL,,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 959,UK,A 320,Daily,DEL,,Ahmedabad-changed,8:00,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,Daily,Ahmedabad-changed,8:40,DEL,,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,123456,DEL,,Ahmedabad-changed,10:20,2019-10-28 00:00:00,2020-03-28 00:00:00


In [25]:
def wrangle_iata_codes(df, city_to_codes=None):
    """Replace city names in ``from`` and ``to`` with codes from a lookup table.

    A value for which ``str.isupper()`` is true is left untouched.  Every
    other value is looked up in ``city_to_codes``; if it is found it is
    replaced, otherwise it is kept and recorded as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued ``from`` and ``to`` columns.  Both columns
        are replaced on this frame itself.
    city_to_codes : mapping, optional
        City name -> code.  When omitted, the previously created dictionary
        is unpickled from ``./data/processed/city_to_codes_dict.txt``.

    Returns
    -------
    (list, pandas.DataFrame)
        The distinct names without a code, in order of first appearance
        (``from`` column first, then ``to``), and the same frame object that
        was passed in.
    """
    if city_to_codes is None:
        import pickle

        # NOTE(review): unpickling can execute arbitrary code; only load this
        # file if it comes from a trusted source.
        with open('./data/processed/city_to_codes_dict.txt', 'rb') as handle:
            city_to_codes = pickle.load(handle)

    # City names in df for which an equivalent code is missing.
    missing_city_names = []

    for column in ('from', 'to'):
        resolved = []
        for name in df[column]:
            if not name.isupper():
                try:
                    name = city_to_codes[name]
                except KeyError:
                    if name not in missing_city_names:
                        missing_city_names.append(name)
            resolved.append(name)
        # Assign the whole column at once; per-cell chained assignment
        # (`df[col][i] = ...`) may write to a temporary copy.
        df[column] = resolved

    return missing_city_names, df

In [26]:
# Convert city names to codes and report the names that could not be converted.
missing_city_names, dat = wrangle_iata_codes(dat)

print('The list of city names either with wrong spelling or missing IATA codes\n', missing_city_names)

The list of city names either with wrong spelling or missing IATA codes
 ['Ahmedabad-changed', 'Chennai-changed', 'Varanasi-changed']


In [70]:
# Second run of the same conversion step.
# NOTE(review): this cell's execution count (70) is far from the previous
# cell's (26), and its saved output shows codes such as AMD for rows that the
# previous run reported as missing. Presumably the input data or the lookup
# file changed between the two runs - confirm that a fresh top-to-bottom run
# reproduces this output.
missing_city_names, dat = wrangle_iata_codes(dat)
print('The list of city names either with wrong spelling or missing IATA codes\n', missing_city_names)
dat.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,from_time,to,to_time,eff_from,eff_to
0,UK 939,UK,A 320,Daily,DEL,,AMD,6:25,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,Daily,AMD,7:05,DEL,,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 959,UK,A 320,Daily,DEL,,AMD,8:00,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,Daily,AMD,8:40,DEL,,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,123456,DEL,,AMD,10:20,2019-10-28 00:00:00,2020-03-28 00:00:00


In [71]:
def drop_dummy_scheds(df):
    """Discard rows whose ``eff_from`` and ``eff_to`` values are equal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``eff_from`` and ``eff_to`` columns and a default
        0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining rows, re-indexed from 0.
    """
    starts_and_ends_together = df['eff_from'] == df['eff_to']
    remaining = df.loc[~starts_and_ends_together]

    return remaining.reset_index(drop=True)

In [72]:
# Remove the rows whose effective-from and effective-to values coincide.
dat = drop_dummy_scheds(dat)
dat.shape

(486, 10)

In [73]:
def drop_noncurrent_scheds(df, tz, today=None):
    """Keep only the rows whose effective period strictly contains today.

    A row is dropped when ``today <= eff_from`` or ``today >= eff_to``, with
    both bounds compared as calendar dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with datetime-valued ``eff_from`` and ``eff_to`` columns.
    tz : datetime.tzinfo or None
        Time zone used to determine the current date; only consulted when
        ``today`` is not given.
    today : datetime.date, optional
        Reference date.  Defaults to the current date in ``tz``.  Passing it
        explicitly makes the result reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows, re-indexed from 0.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having run its imports first.
    from datetime import datetime

    if today is None:
        today = datetime.now(tz).date()

    # NOTE(review): a schedule that starts or ends exactly today is dropped
    # (<= / >=) - confirm whether the bounds are meant to be inclusive.
    keep = [
        position
        for position, (starts, ends) in enumerate(zip(df['eff_from'], df['eff_to']))
        if not ((today <= datetime.date(starts)) or (today >= datetime.date(ends)))
    ]

    # Select by position, so the result does not depend on the index labels.
    return df.iloc[keep].reset_index(drop=True)

In [74]:
# NOTE(review): these imports sit in the middle of the notebook; the function
# defined in the previous cell uses `datetime`, so this cell must run before
# that function is called.
from datetime import datetime
import pytz

# Time zone used to decide what "today" is when filtering the schedules.
tz = pytz.timezone('Asia/Calcutta')
dat = drop_noncurrent_scheds(dat, tz)
dat.shape

(462, 10)

In [75]:
def wrangle_frequency(df):
    """Turn every ``frequency`` entry into a list of integer digits.

    * A string entry (e.g. ``'Daily'``) becomes ``[1, 2, 3, 4, 5, 6, 7]``.
    * Any other entry is converted with ``int`` and split into its decimal
      digits, e.g. ``123456.0`` -> ``[1, 2, 3, 4, 5, 6]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``frequency`` column.  The column is replaced on this
        frame itself.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    def _as_digit_list(value):
        # NOTE(review): every string is treated as "all seven digits", not
        # only 'Daily' - confirm no other string values occur in the data.
        digits = '1234567' if isinstance(value, str) else str(int(value))
        return [int(digit) for digit in digits]

    # Build the whole column and assign it once. The previous per-cell
    # chained assignment (`df.frequency[i] = ...`) may write to a temporary
    # copy and addressed rows by label although `i` was a position.
    df['frequency'] = pd.Series([_as_digit_list(value) for value in df['frequency']],
                                index=df.index, dtype=object)

    return df

In [76]:
# Convert the frequency column to lists of integers and preview the result.
dat = wrangle_frequency(dat)

dat.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,from_time,to,to_time,eff_from,eff_to
0,UK 939,UK,A 320,"[1, 2, 3, 4, 5, 6, 7]",DEL,,AMD,6:25,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,"[1, 2, 3, 4, 5, 6, 7]",AMD,7:05,DEL,,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 959,UK,A 320,"[1, 2, 3, 4, 5, 6, 7]",DEL,,AMD,8:00,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,"[1, 2, 3, 4, 5, 6, 7]",AMD,8:40,DEL,,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,"[1, 2, 3, 4, 5, 6]",DEL,,AMD,10:20,2019-10-28 00:00:00,2020-03-28 00:00:00


In [77]:
def combine_pairs(df):
    """Merge rows that share flight, frequency, origin and destination.

    For each group of rows with equal ``flight``, ``frequency``, ``from`` and
    ``to``, the first row is kept and every later row is dropped.  For each
    dropped row, in order: if the kept row's ``to_time`` is currently ``''``
    it takes the dropped row's ``to_time``; otherwise the kept row's
    ``from_time`` is overwritten with the dropped row's ``from_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``flight``, ``frequency``, ``from``, ``to``, ``from_time``
        and ``to_time`` columns.  The two time columns are updated on this
        frame itself.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped rows, re-indexed from 0.
    """
    # Work on plain Python lists, addressed by position. This avoids chained
    # assignment (`df.to_time[i] = ...`), which may write to a temporary copy.
    flights = df['flight'].tolist()
    frequencies = df['frequency'].tolist()
    origins = df['from'].tolist()
    destinations = df['to'].tolist()
    to_times = df['to_time'].tolist()
    from_times = df['from_time'].tolist()

    n_rows = len(flights)
    dropped = set()  # a set gives O(1) membership checks inside the double loop

    for i in range(n_rows - 1):
        if i in dropped:
            continue
        for j in range(i + 1, n_rows):
            if j in dropped:
                continue
            same_leg = (flights[i] == flights[j] and frequencies[i] == frequencies[j]
                        and origins[i] == origins[j] and destinations[i] == destinations[j])
            if not same_leg:
                continue
            dropped.add(j)
            if to_times[i] == '':
                to_times[i] = to_times[j]
            else:
                from_times[i] = from_times[j]

    df['to_time'] = to_times
    df['from_time'] = from_times

    keep = [position for position in range(n_rows) if position not in dropped]
    return df.iloc[keep].reset_index(drop=True)

In [78]:
# First pass of merging rows that describe the same flight leg.
dat = combine_pairs(dat)

dat.shape

(292, 10)

In [79]:
def combine_freq(df):
    """Merge rows that differ only in ``frequency`` by uniting their frequencies.

    Rows with equal ``flight``, ``from``, ``to``, ``to_time`` and
    ``from_time`` are collapsed into the first such row.  Its ``frequency``
    becomes the sorted list of distinct values found in the merged rows'
    frequency lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``flight``, ``frequency`` (lists), ``from``, ``to``,
        ``from_time`` and ``to_time`` columns.  The ``frequency`` column is
        replaced on this frame itself.

    Returns
    -------
    pandas.DataFrame
        A new frame without the merged-away rows, re-indexed from 0.
    """
    # Work on plain Python lists, addressed by position. This avoids chained
    # assignment (`df.frequency[i] = ...`), which may write to a temporary copy.
    flights = df['flight'].tolist()
    origins = df['from'].tolist()
    destinations = df['to'].tolist()
    to_times = df['to_time'].tolist()
    from_times = df['from_time'].tolist()
    frequencies = df['frequency'].tolist()

    n_rows = len(flights)
    dropped = set()  # a set gives O(1) membership checks inside the double loop

    for i in range(n_rows - 1):
        if i in dropped:
            continue
        for j in range(i + 1, n_rows):
            if j in dropped:
                continue
            same_schedule = (flights[i] == flights[j] and origins[i] == origins[j]
                             and destinations[i] == destinations[j]
                             and to_times[i] == to_times[j]
                             and from_times[i] == from_times[j])
            if same_schedule:
                dropped.add(j)
                # sorted() makes the order of the merged list deterministic
                # instead of depending on set iteration order.
                frequencies[i] = sorted(set(frequencies[i] + frequencies[j]))

    df['frequency'] = pd.Series(frequencies, index=df.index, dtype=object)

    keep = [position for position in range(n_rows) if position not in dropped]
    return df.iloc[keep].reset_index(drop=True)

In [80]:
# Merge rows that are identical apart from their frequency lists.
dat = combine_freq(dat)

dat.shape

(270, 10)

In [81]:
# Second pass of combine_pairs, now that the frequency lists have been merged.
dat = combine_pairs(dat)

dat.shape

(253, 10)

In [82]:
# Write the processed frame to the processed-data folder.
dat.to_excel('./data/processed/vistara.xlsx')

#### BEGIN HERE ####

Next, we consider forming the flight pairs with human intervention.

In [113]:
# NOTE(review): this cell has execution count 113, but `foo` is only created
# in the cells further down (execution counts 99 and 59); on a top-to-bottom
# run `foo` would not exist yet at this point.
foo.to_excel('./data/processed/foo.xlsx')

In [99]:
# NOTE(review): `vistara` is not defined in the visible part of the notebook;
# it has to come from an earlier (possibly deleted) cell.
foo = vistara.copy()


def _split_frequency(value):
    """Return 'Daily' unchanged; split any other entry into a list of int digits."""
    if isinstance(value, float):
        value = int(value)  # e.g. 123456.0 -> 123456, so no '.0' ends up in the text
    text = str(value)
    if text == 'Daily':
        return text
    return [int(digit) for digit in text]  # split into individual days of week


# One whole-column assignment instead of per-cell chained assignment
# (`foo.frequency[i] = ...`), which may write to a temporary copy.
foo['frequency'] = foo['frequency'].map(_split_frequency)

foo.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,time_at_to,to,time_at_from,eff_from,eff_to
0,UK 939,UK,A 320,Daily,DEL,6:25,AMD,,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,Daily,AMD,,DEL,7:05,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 959,UK,A 320,Daily,DEL,8:00,AMD,,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,Daily,AMD,,DEL,8:40,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,"[1, 2, 3, 4, 5, 6]",DEL,10:20,AMD,,2019-10-28 00:00:00,2020-03-28 00:00:00


In [100]:
# Spot check: element type inside the frequency list of the row labelled 4.
type(foo.frequency[4][0])

int

In [139]:
# Rows with a blank time_at_to whose frequency is not 'Daily'.
idx = [row for row in range(foo.shape[0])
       if foo.time_at_to[row] == '' and foo.frequency[row] != 'Daily']
check_idx = []
trash_idx = []

# Collect every row of `idx` that shares flight, origin and destination with
# another row of `idx`.
for i in idx:
    if i not in check_idx:
        partners = [
            other for other in idx
            if other not in check_idx and other != i
            and foo.flight[i] == foo.flight[other]
            and foo.to[i] == foo.to[other]
            and foo['from'][i] == foo['from'][other]
        ]
        for j in partners:
            check_idx.append(j)
            check_idx.append(i)
            # print('New Pair\n', foo.iloc[i, 0:4], '\n', foo.iloc[j, 0:4], '\n')

    # De-duplicate after every outer step.
    check_idx = list(set(check_idx))

In [140]:
# Number of rows that have at least one matching partner.
len(check_idx)

37

In [141]:
# Number of candidate rows that were examined.
len(idx)

57

In [59]:
# NOTE(review): `vistara` is not defined in the visible part of the notebook;
# it has to come from an earlier (possibly deleted) cell.
foo = vistara.copy()

# Rows whose time_at_to is blank; each tries to take it from a matching row.
idx = [i for i in range(foo.shape[0]) if foo.at[i, 'time_at_to'] == '']
trash_idx = []

for i in idx:
    for j in range(foo.shape[0]):
        if (j in trash_idx) or (j in idx):
            continue
        if ((foo.at[i, 'flight'] == foo.at[j, 'flight'])
                and (foo.at[i, 'frequency'] == foo.at[j, 'frequency'])
                and (foo.at[i, 'from'] == foo.at[j, 'from'])
                and (foo.at[i, 'to'] == foo.at[j, 'to'])):
            trash_idx.append(j)
            # .at writes into `foo` itself; the chained form
            # `foo['time_at_to'][i] = ...` may write to a temporary copy.
            foo.at[i, 'time_at_to'] = foo.at[j, 'time_at_to']

# Drop the rows whose time was merged into another row and renumber the rest.
foo = foo.drop(trash_idx, axis=0).reset_index(drop=True)

foo.shape

(303, 10)

In [60]:
# Preview the frame after the merge above.
foo.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,time_at_to,to,time_at_from,eff_from,eff_to
0,UK 939,UK,A 320,Daily,DEL,6:25,AMD,,2019-11-25 00:00:00,2020-03-28 00:00:00
1,UK 936,UK,A 320,Daily,AMD,,DEL,7:05,2019-11-25 00:00:00,2020-03-28 00:00:00
2,UK 946,UK,A 320,Daily,AMD,10:25,DEL,8:40,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 976,UK,A 320,123456,AMD,12:35,DEL,10:55,2019-10-28 00:00:00,2020-03-28 00:00:00
4,UK 966,UK,A 320,Daily,AMD,,DEL,20:15,2019-10-27 00:00:00,2020-03-28 00:00:00


In [70]:
# Positions of the rows that still have a blank time_at_to / time_at_from.
idx_to = [i for i in range(foo.shape[0]) if foo.time_at_to[i] == '']
idx_from = [i for i in range(foo.shape[0]) if foo.time_at_from[i] == '']

# Report how many rows are affected in each case, followed by the positions.
print('total idx_to:', len(idx_to), '\n', idx_to)
print('total idx_from:', len(idx_from), '\n', idx_from)

total idx_to: 74 
 [1, 4, 6, 10, 17, 24, 25, 26, 27, 29, 36, 43, 50, 52, 55, 59, 71, 72, 76, 77, 79, 82, 90, 91, 92, 100, 101, 102, 111, 112, 124, 125, 134, 135, 142, 147, 157, 159, 161, 163, 165, 168, 169, 171, 178, 179, 186, 189, 192, 199, 202, 206, 208, 213, 216, 217, 221, 223, 225, 228, 230, 239, 240, 241, 245, 258, 259, 265, 266, 272, 281, 283, 290, 293]
total idx_from: 70 
 [0, 5, 7, 8, 16, 20, 21, 22, 23, 28, 34, 37, 40, 42, 54, 57, 60, 86, 88, 94, 95, 98, 99, 106, 114, 123, 126, 140, 150, 151, 152, 153, 154, 155, 158, 160, 162, 164, 166, 167, 170, 172, 176, 177, 185, 188, 198, 201, 204, 207, 209, 212, 218, 232, 233, 234, 235, 237, 244, 246, 251, 253, 262, 271, 274, 280, 289, 292, 297, 300]


In [79]:
# idx_from rows that share flight, destination and origin with some idx_to row
# (a row is listed once per matching idx_to row).
idx_fr_mat = [
    j
    for i in idx_to
    for j in idx_from
    if (foo.flight[i] == foo.flight[j]) and (foo.to[i] == foo.to[j]) and (foo['from'][i] == foo['from'][j])
]

# Show the idx_from rows for which no such match was found.
print([i for i in idx_from if i not in idx_fr_mat])

[0, 5, 8, 28, 34, 37, 54, 57, 60, 94, 95, 106, 123, 126, 158, 160, 162, 164, 166, 167, 170, 172, 185, 188, 198, 201, 204, 207, 212, 218, 233, 237, 251, 253, 271, 274, 280, 289, 292, 300]


In [80]:
# Inspect one of the rows reported as unmatched above (position 207).
foo.iloc[207,:]

flight                       UK 745
operator                         UK
aircraft                      A 320
frequency                     Daily
from                            DEL
time_at_to                    18:35
to                              CCU
time_at_from                       
eff_from        2019-11-15 00:00:00
eff_to          2020-03-28 00:00:00
Name: 207, dtype: object