In [3]:
import numpy as np
import pandas as pd
import os

Reading the schedule details for each carrier, stored in separate files and concatenating all files into a single data file called dat.

In [4]:
dir_path = './raw-data/Excel/'

# Getting list of files in the directory
_, _, file_names = next(os.walk(dir_path)) 

# Retaining only Excel files
file_names = [i for i in file_names if '.xlsx' in i]

# Setting the column names for the data file
col_names = ['num', 'flight', 'operator', 'aircraft', 'frequency', 'from', 'time_at_to', 'to', 'time_at_from', 
             'eff_from', 'eff_to']

# Reading the first file
dat = pd.read_excel(dir_path + file_names[0], na_filter = False, convert_float = False, names = col_names)

# Reading the remaining files, one at a time, and concatenating to form a single dataframe
for i in range(1, len(file_names)):
    tmp = pd.read_excel(dir_path + file_names[i], na_filter = False, convert_float = False, names = col_names)
    dat = dat.append(tmp)
    
dat = dat.reset_index() # resetting the index
dat = dat.drop('index', axis = 1) # dropping the previous index

dat.shape

(10413, 11)

In [5]:
def wrangle_rows(df):
    # dropping superflows header rows
    
    r_drop_idx = [i for i in range(df.shape[0]) if ((not isinstance(df.iloc[i, 0], float)) and 
                                                    (df.iloc[i, 4] == 'Frequency'))]
      
    df = df.drop(r_drop_idx, axis = 0)
    
    df = df.reset_index() # resetting the index after dropping rows
    df = df.drop('index', axis = 1) # dropping the previous index
    
    # dropping city names subheader rows
    r_drop_idx = [i for i in range(df.shape[0]) if isinstance(df.iloc[i, 0], str)]
    
    for i in range(len(r_drop_idx)-1):
        df.iloc[r_drop_idx[i] + 1: r_drop_idx[i+1], 0] = df.iloc[r_drop_idx[i], 0]
    
    # Separately for the last sub-head in the list
    df.iloc[r_drop_idx[-1] + 1: df.shape[0], 0] = df.iloc[r_drop_idx[-1], 0]
        
    df = df.drop(r_drop_idx, axis = 0) 
    df = df.reset_index() # resetting the index after dropping rows
    df = df.drop('index', axis = 1) # dropping the previous index
    
    df = df.rename(columns = {'num': 'station'}, errors = 'raise')
    
    return df

In [6]:
dat = wrangle_rows(dat)
dat.shape

(9876, 11)

In [7]:
def wrangle_cols(df):
    idx = df.loc[:,'from'] == ''
    df.loc[idx, 'from'] = df.loc[idx, 'station']

    idx = df.loc[:, 'to'] == ''
    df.loc[idx, 'to'] = df.loc[idx, 'station']

    df = df.drop(['station'], axis = 1)

    return df

In [8]:
dat = wrangle_cols(dat)
dat.shape

(9876, 10)

In [9]:
def wrangle_city_names(df):
    city_names = {'Delhi' : 'New Delhi', 'Bhubaneshwar' : 'Bhubaneswar',
                  'Trivandrum' : 'Thiruvananthapuram', 'Pondicherry' : 'Puducherry',
                  'Porbander' : 'Porbandar', 'Tirupathi' : 'Tirupati',
                  'Tuticorin' : 'Thoothukudi', 'Vizag' : 'Visakhapatnam',
                  'Cuddapah': 'Kadapa', 'Jalgoan' : 'Jalgaon',
                  'Rajamundry' : 'Rajahmundry', 'Aizawal' : 'Aizawl',
                  'Trichy' : 'Tiruchirappally', 'Bathinda' : 'Bhatinda',
                  'Passighat' : 'Pasighat'}
    
    for i in range(df.shape[0]):
        if df['from'][i] in list(city_names.keys()): df['from'][i] = city_names[df['from'][i]]
        if df['to'][i] in list(city_names.keys()): df['to'][i] = city_names[df['to'][i]]

    return df
    

In [10]:
dat = wrangle_city_names(dat)
dat.shape

(9876, 10)

In [11]:
# Creating Dictionaries <a> mapping city names to iata codes, and <b> mapping iata codes to city names

city_codes = pd.read_csv('./raw-data/Excel/airports.csv')
city_to_codes = dict(zip(city_codes.city, city_codes.iata_code))
codes_to_city = dict(zip(city_codes.iata_code, city_codes.city))

In [12]:
def wrangle_iata_codes(df):
    idx = [i for i in range(df.shape[0]) if len(df['from'][i]) !=3]
    for i in idx: df['from'][i] = city_to_codes[df['from'][i]]
        
    idx = [i for i in range(df.shape[0]) if len(df['to'][i]) !=3]
    for i in idx: df['to'][i] = city_to_codes[df['to'][i]]
        
    return df

In [13]:
dat = wrangle_iata_codes(dat)
dat.shape

(9876, 10)

In [14]:
def drop_dummy_scheds(df):
    idx = [i for i in range(df.shape[0]) if df['eff_from'][i] == df['eff_to'][i]]
    df = df.drop(idx, axis = 0)
    df = df.reset_index()
    df = df.drop(['index'], axis = 1)
    
    return df

In [15]:
dat = drop_dummy_scheds(dat)
dat.shape

(9271, 10)

In [16]:
def drop_noncurrent_scheds(df, tz):
    today = datetime.now(tz).date()
    idx = [i for i in range(df.shape[0]) 
           if (today <=  datetime.date(df['eff_from'][i])) or (today >= datetime.date(df['eff_to'][i]))]
    df = df.drop(idx, axis = 0)
    df = df.reset_index()
    df = df.drop('index', axis = 1)
    
    return df

In [17]:
from datetime import datetime
import pytz

tz = pytz.timezone('Asia/Calcutta')

dat = drop_noncurrent_scheds(dat, tz)
dat.shape

(8286, 10)

KeyboardInterrupt: 

In [101]:
foo = dat.copy()

foo = foo.drop_duplicates(['flight', 'frequency', 'from', 'to'], keep = False)
print(foo.shape)
print(foo.head(25))

(2760, 10)
    flight operator aircraft frequency from time_at_to   to time_at_from  \
0   UK 939       UK    A 320     Daily  DEL       6:25  AMD                
1   UK 936       UK    A 320     Daily  AMD             DEL         7:05   
7   UK 966       UK    A 320     Daily  AMD             DEL        20:15   
8   UK 691       UK    A 320     Daily  DEL      12:50  ATQ                
9   UK 508       UK    A 320     Daily  ATQ             DEL        13:25   
10  UK 976       UK    A 320     Daily  DEL      14:50  ATQ                
11  UK 507       UK    A 320     Daily  DEL      15:05  ATQ                
13  UK 692       UK    A 320     Daily  ATQ             DEL        15:40   
22  UK 845       UK    B 738     Daily  BOM       7:35  BLR                
23  UK 846       UK    B 738     Daily  BLR             BOM         8:20   
30  UK 815       UK    B 738    134567  DEL      10:50  BLR                
31  UK 815       UK    B 738         2  DEL      10:50  BLR                
3

In [82]:
bar.head()

Unnamed: 0,flight,operator,aircraft,frequency,from,time_at_to,to,time_at_from,eff_from,eff_to
2,UK 959,UK,A 320,Daily,DEL,8:00,AMD,,2019-10-27 00:00:00,2020-03-28 00:00:00
3,UK 946,UK,A 320,Daily,AMD,,DEL,8:40,2019-10-27 00:00:00,2020-03-28 00:00:00
4,UK 979,UK,A 320,123456,DEL,10:20,AMD,,2019-10-28 00:00:00,2020-03-28 00:00:00
5,UK 976,UK,A 320,123456,AMD,,DEL,10:55,2019-10-28 00:00:00,2020-03-28 00:00:00
6,UK 969,UK,A 320,Daily,DEL,19:40,AMD,,2019-10-27 00:00:00,2020-03-28 00:00:00


In [98]:
foo = dat.copy()
print(foo.shape)
bar = foo.groupby(['flight', 'frequency', 'from', 'to'])
foo.iloc[bar, :]


(9271, 10)


IndexError: .iloc requires numeric indexers, got [[(inf, 'Daily', 'SAG', 'BLR')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
6985    inf      IND   ATR 72     Daily  SAG             BLR        13:25   

                 eff_from               eff_to  
6985  2019-10-27 00:00:00  2020-03-28 00:00:00  ]
 [(inf, 'Daily', 'SAG', 'HYD')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
6983    inf      IND   ATR 72     Daily  SAG             HYD        12:55   
6987    inf      IND   ATR 72     Daily  SAG             HYD        16:40   

                 eff_from               eff_to  
6983  2019-10-27 00:00:00  2020-03-28 00:00:00  
6987  2019-10-27 00:00:00  2020-03-28 00:00:00  ]
 [(inf, 'Daily', 'SAG', 'IDR')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
6981    inf      IND   ATR 72     Daily  SAG             IDR         9:40   

                 eff_from               eff_to  
6981  2019-10-27 00:00:00  2020-03-28 00:00:00  ]
 ...
 [('UK 997', 'Daily', 'ATQ', 'DEL')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
12   UK 997       UK    A 320     Daily  ATQ             DEL        15:25   
196  UK 997       UK    A 320     Daily  ATQ      16:40  DEL                

                eff_from               eff_to  
12   2019-10-27 00:00:00  2020-03-28 00:00:00  
196  2019-10-27 00:00:00  2020-03-28 00:00:00  ]
 [('UK 997', 'Daily', 'DEL', 'PNQ')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
211  UK 997       UK    A 320     Daily  DEL             PNQ        17:35   
446  UK 997       UK    A 320     Daily  DEL      19:40  PNQ                

                eff_from               eff_to  
211  2019-10-27 00:00:00  2020-03-28 00:00:00  
446  2019-10-27 00:00:00  2020-03-28 00:00:00  ]
 [('UK 998', 123457.0, 'PNQ', 'DEL')
       flight operator aircraft frequency from time_at_to   to time_at_from  \
445  UK 998       UK    A 320    123457  PNQ             DEL        16:55   

                eff_from               eff_to  
445  2019-10-27 00:00:00  2020-03-27 00:00:00  ]]

In [61]:
dat.to_excel('./raw-data/clean-excel/dat.xlsx')