## Create DataFrame for the frontend application

In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Load the two input tables: route-level delay statistics and airline ratings.
routes_delays, ratings = (
    pd.read_csv(csv_path) for csv_path in ('routes_delays.csv', 'ratings_df.csv')
)

In [4]:
routes_delays.columns

Index(['Unnamed: 0', 'UNIQUE_CARRIER', 'arr_delay_var', 'carrier_delay%',
       'cancellation%', 'airline_name', 'airport_routes', 'city_coutes',
       'distance', 'average_delay_minutes', 'flight_counts',
       'delay_var_minutes'],
      dtype='object')

In [6]:
# Show missing ratings as 'NR' in the frontend. Any numeric column that had a
# gap becomes a mix of numbers and the string 'NR' from here on.
ratings = ratings.replace(to_replace=np.nan, value='NR')

In [7]:
# Attach each carrier's ratings to every one of its routes. Column names that
# exist in both inputs come out with pandas' default _x / _y suffixes.
merged_df = routes_delays.merge(ratings, on='UNIQUE_CARRIER')

In [9]:
# Sorted, de-duplicated carrier codes that survived the merge.
tmp = pd.Series(np.unique(merged_df['UNIQUE_CARRIER'].values))

## Dictionaries of LDA-extracted topics from airline reviews and URLs of airline logo images

In [65]:
# Carrier code -> comma-separated topic phrases for that airline's reviews.
# Keys are looked up against merged_df['UNIQUE_CARRIER'] further down.
# NOTE(review): the 'AS' text starts with a space and the 'NK' text ends with a
# trailing comma; both strings are used verbatim, so confirm whether that is intended.
tmp_dict = {'AA':'long haul, missed connecting, lie flat seats',
           'AS':' friendly, attentive, good service, good food',
           'B6':'snacks/drinks,legroom, free wifi, direct tv, helpful',
           'DL':'excellent service, missed connection, avoid delta',
           'F9':'low cost, staff friendly, pay extra, bags charge',
           'HA':'seats uncomfortable, friendly attendants, entertainment',
           'NK':'hidden fees, baggage fees, pay extra, worst airline,',
           'OO':'highly recommended, efficient check, friendly crew',
           'UA':'missed connecting flight, flight entertainment, friendly staff',
           'VX':'staff friendly, good quality, food/drinks, entertainment, leather seats',
           'WN':'low cost, non stop, crew/staff friendly, early bird, bags fly free'}

# Carrier code -> URL of a logo image, keyed the same way as tmp_dict.
# NOTE(review): these are hard-coded links to third-party hosts, several over plain http;
# nothing in this notebook checks that they still resolve.
tmp_img = {'AA':'http://crushed.com/wp-content/uploads/2014/04/american_airlines.png',
           'AS':'https://www.nbp.org/nbp/images/business/logo-alaska-airlines.jpg',
           'B6':'http://www.jetblue.com/img/sofly/firstfin.gif',
           'DL':'https://s-media-cache-ak0.pinimg.com/736x/b4/dc/c8/b4dcc8de9a51e157ab177375e74e43b7.jpg',
           'F9':'http://cf.juggle-images.com/matte/white/280x280/frontier-airlines-2-logo-primary.jpg',
           'HA':'http://themolokainews.com/wp-content/uploads/2014/08/hawaiian-logo-304.jpg',
           'NK':'http://sharing.abc15.com/sharewfts/photo/2010/06/23/spirit_airlines_logo_20100623103407_320_240.JPG',
           'OO':'http://www.miniindy.org/assets/Uploads/TeamLogos/skywestLogo.png',
           'UA':'http://vertassets.blob.core.windows.net/image/c9736e07/c9736e07-f283-4a9c-b981-a1a80024c665/mm89155logo.jpg',
           'VX':'https://www.virginamerica.com/images/vx-icon-152.png',
           'WN':'http://www.carryitclearly.com/images/southwest-logo.jpg'}

In [11]:
#list(enumerate(merged_df.columns))

In [12]:
# Both inputs carried an 'Unnamed: 0' column; the merge suffixed them _x / _y.
# Neither is needed downstream, so remove the pair in one call.
merged_df = merged_df.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)

In [13]:
merged_df.columns

Index(['UNIQUE_CARRIER', 'arr_delay_var', 'carrier_delay%', 'cancellation%',
       'airline_name_x', 'airport_routes', 'city_coutes', 'distance',
       'average_delay_minutes', 'flight_counts', 'delay_var_minutes',
       'airline_name_y', 'cabin', 'entertainment', 'food_bev', 'ground',
       'money_value', 'overall', 'recommended', 'review_sentiment', 'wifi'],
      dtype='object')

In [14]:
merged_df.head(2)

Unnamed: 0,UNIQUE_CARRIER,arr_delay_var,carrier_delay%,cancellation%,airline_name_x,airport_routes,city_coutes,distance,average_delay_minutes,flight_counts,...,airline_name_y,cabin,entertainment,food_bev,ground,money_value,overall,recommended,review_sentiment,wifi
0,AA,2226.048309,84.121552,0.850827,American Airlines Inc.,"('ABQ', 'DFW')","('Albuquerque, NM', 'Dallas/Fort Worth, TX')",569.0,6.843891,442.0,...,american-airlines,2.5,2.0,2.3,1.9,2.4,3.7,0.3,-0.0,1.9
1,AA,2226.048309,84.121552,0.850827,American Airlines Inc.,"('ALB', 'CLT')","('Albany, NY', 'Charlotte, NC')",646.0,0.946154,260.0,...,american-airlines,2.5,2.0,2.3,1.9,2.4,3.7,0.3,-0.0,1.9


In [46]:
def topic(x):
    '''Return the topic text stored in tmp_dict for carrier code `x`.

    Parameters
    ----------
    x : hashable
        Carrier code, e.g. 'AA'.

    Returns
    -------
    str or float
        The matching topic string, or NaN when `x` is not a key of tmp_dict.
    '''
    # A direct dictionary lookup replaces the previous scan over every item.
    return tmp_dict.get(x, np.nan)

In [68]:
# Both derived columns are computed straight from the carrier code, so re-running
# this cell gives the same result instead of wrapping the <img> tag a second time.
carrier_codes = merged_df['UNIQUE_CARRIER']

# Series.map with a dict leaves NaN for carriers that have no entry.
merged_df['Topics Extract'] = carrier_codes.map(tmp_dict)

# Build a well-formed <img> tag. The previous markup had no opening quote on
# src and a stray quote before an unquoted style, so the URL attribute ran on
# into the style text. na_action='ignore' keeps NaN for carriers without a
# logo instead of raising on str + float.
merged_df['url'] = carrier_codes.map(tmp_img).map(
    lambda src: '<img src="{}" style="width: 80%; height: 80%;">'.format(src),
    na_action='ignore',
)

In [84]:
# (column in merged_df, label shown in the frontend), in display order.
# The order matters: some helper functions below address columns by position.
frontend_labels = [
    ('airline_name_x', 'Airline Name'),
    ('airport_routes', 'Airports'),
    ('city_coutes', 'Cities'),
    ('distance', 'Distance'),
    ('average_delay_minutes', 'Average Delay (min)'),
    ('delay_var_minutes', 'Variance of Delays (min)'),
    ('overall', 'Overall'),
    ('cabin', 'Cabin'),
    ('entertainment', 'Entertainment'),
    ('wifi', 'Wifi'),
    ('food_bev', 'Food/Beverage'),
    ('money_value', 'Bang for Buck'),
    ('Topics Extract', 'Topics Extract'),
    ('url', 'url'),
]

# .copy() makes production_df an independent frame, so the column assignments
# below do not act on a slice of merged_df (SettingWithCopyWarning).
production_df = merged_df[[source for source, _ in frontend_labels]].copy()
production_df.columns = [label for _, label in frontend_labels]

# One (image html, topic text) tuple per row.
production_df['for_html'] = list(zip(production_df['url'], production_df['Topics Extract']))

# Whole-mile distances; the two delay statistics rounded to one decimal.
# Columns are addressed by name: .ix was removed in pandas 1.0.
production_df['Distance'] = production_df['Distance'].astype(int)
delay_columns = ['Average Delay (min)', 'Variance of Delays (min)']
production_df[delay_columns] = production_df[delay_columns].round(1)

In [85]:
production_df.head(2)

Unnamed: 0,Airline Name,Airports,Cities,Distance,Average Delay (min),Variance of Delays (min),Overall,Cabin,Entertainment,Wifi,Food/Beverage,Bang for Buck,Topics Extract,url
0,American Airlines Inc.,"('ABQ', 'DFW')","('Albuquerque, NM', 'Dallas/Fort Worth, TX')",569.0,6.843891,2874.268092,3.7,2.5,2.0,1.9,2.3,2.4,"long haul, missed connecting, lie flat seats",<img src=http://crushed.com/wp-content/uploads...
1,American Airlines Inc.,"('ALB', 'CLT')","('Albany, NY', 'Charlotte, NC')",646.0,0.946154,26.290526,3.7,2.5,2.0,1.9,2.3,2.4,"long haul, missed connecting, lie flat seats",<img src=http://crushed.com/wp-content/uploads...


## Functions for backend of application

In [80]:
def get_airlines(org, dest, sortby):
    '''Return the airlines serving a route as a display-ready DataFrame.

    Parameters
    ----------
    org, dest : str
        Three-letter origin and destination airport codes (case-insensitive).
    sortby : str
        Name of the production_df column to sort on. The two delay columns are
        sorted ascending (smaller is better); every other column descending.

    Returns
    -------
    pandas.DataFrame
        One row per airline on the route with a fresh 0..n-1 index; empty when
        the route does not appear in production_df.

    Raises
    ------
    AssertionError
        If either airport code is not exactly three characters long.

    NOTE(review): rating columns can contain the string 'NR' next to numbers
    (see the ratings cell), so sorting on such a column may fail -- confirm.
    '''
    # The old `assert len(org), len(dest) == 3` only checked that org was
    # non-empty; the comparison was parsed as the assertion message.
    assert len(org) == 3 and len(dest) == 3, 'airport codes must be 3 characters long'
    org, dest = org.upper(), dest.upper()

    # Selected by name rather than position so the result does not depend on
    # production_df's column order (.ix no longer exists in pandas 1.0+).
    display_columns = ['Airline Name', 'Average Delay (min)', 'Variance of Delays (min)',
                       'Overall', 'Cabin', 'Entertainment', 'Wifi', 'Food/Beverage',
                       'Bang for Buck', 'Topics Extract']
    ascending = sortby in ('Average Delay (min)', 'Variance of Delays (min)')

    # The 'Airports' column stores the repr of an (origin, destination) tuple.
    on_route = production_df.loc[production_df['Airports'] == str((org, dest)), display_columns]
    return on_route.sort_values(str(sortby), ascending=ascending).reset_index(drop=True)
    
def origin_city(org, dest):
    '''Return the origin city name for the route org -> dest.

    Parameters
    ----------
    org, dest : str
        Origin and destination airport codes (case-insensitive).

    Returns
    -------
    The first element of the route's 'Cities' tuple.

    Raises
    ------
    IndexError
        If the route does not appear in production_df.
    '''
    org, dest = org.upper(), dest.upper()
    cities = production_df.loc[production_df['Airports'] == str((org, dest)), 'Cities']
    # 'Cities' holds the repr of a tuple. literal_eval parses it without
    # executing arbitrary code, which eval would do on data read from a file.
    return ast.literal_eval(cities.iloc[0])[0]

def destination_city(org, dest):
    '''Return the destination city name for the route org -> dest.

    Parameters
    ----------
    org, dest : str
        Origin and destination airport codes (case-insensitive).

    Returns
    -------
    The second element of the route's 'Cities' tuple.

    Raises
    ------
    IndexError
        If the route does not appear in production_df.
    '''
    org, dest = org.upper(), dest.upper()
    cities = production_df.loc[production_df['Airports'] == str((org, dest)), 'Cities']
    # 'Cities' holds the repr of a tuple. literal_eval parses it without
    # executing arbitrary code, which eval would do on data read from a file.
    return ast.literal_eval(cities.iloc[0])[1]

def distance_btw_cities(org, dest):
    '''Look up the distance in miles for the route org -> dest.

    The airport codes are upper-cased before matching. Raises IndexError when
    no row of production_df matches the route.
    '''
    org, dest = org.upper(), dest.upper()
    route_key = str((str(org), str(dest)))
    route_rows = production_df.loc[production_df['Airports'] == route_key]
    # First matching row; position 3 is the 'Distance' column of production_df.
    return route_rows.iloc[0, 3]

def display_dist(org, dest):
    '''Build the sentence stating the distance between the route's two cities.'''
    start_city = str(origin_city(org, dest))
    end_city = str(destination_city(org, dest))
    miles = str(distance_btw_cities(org, dest))
    return ''.join(['Distance from ', start_city, ' to ', end_city, ' is ', miles, ' miles.'])

# def display_topics(org, dest, sortby):
#     df = production_df.loc[production_df['Airports'] == str(('JFK','ORD'))]\
#             .ix[:, (0,4,5,6,7,8,9,10,11,12)].sort_values(str(sortby), ascending=False).reset_index().drop('index', axis=1)
#     for i in range(len(df)):
#         print(df.ix[i, 'Airline Name']+': '+df.ix[i, 'Topics Extract'])
        
# def for_extract(org, dest, sortby):
#     org, dest = org.upper(), dest.upper()
#     return production_df.ix[production_df['Airports'] == str((str(org), str(dest)))].ix[:,14].tolist()

def for_extract_img(org, dest, sortby):
    '''Return the logo <img> html of every airline on a route, in sorted order.

    Parameters
    ----------
    org, dest : str
        Origin and destination airport codes (case-insensitive).
    sortby : str
        Column of production_df to sort on. The two delay columns sort
        ascending; every other column sorts descending.

    Returns
    -------
    list
        Values of the 'url' column for the route; empty if the route is absent.
    '''
    org, dest = org.upper(), dest.upper()
    ascending = sortby in ('Average Delay (min)', 'Variance of Delays (min)')
    on_route = production_df.loc[production_df['Airports'] == str((org, dest))]
    # Select 'url' by name. The old positional index 14 pointed at the
    # 'for_html' tuples in production_df as built in this notebook, not at the
    # image html. (.ix was also removed in pandas 1.0.)
    return on_route.sort_values(str(sortby), ascending=ascending)['url'].tolist()

def for_extract_text(org, dest, sortby):
    '''Return the topic text of every airline on a route, in sorted order.

    Parameters
    ----------
    org, dest : str
        Origin and destination airport codes (case-insensitive).
    sortby : str
        Column of production_df to sort on. The two delay columns sort
        ascending; every other column sorts descending.

    Returns
    -------
    list
        Values of the 'Topics Extract' column for the route; empty if the
        route is absent.
    '''
    org, dest = org.upper(), dest.upper()
    ascending = sortby in ('Average Delay (min)', 'Variance of Delays (min)')
    on_route = production_df.loc[production_df['Airports'] == str((org, dest))]
    # Select 'Topics Extract' by name. The old positional index 13 pointed at
    # the 'url' column in production_df as built in this notebook, not at the
    # topics. (.ix was also removed in pandas 1.0.)
    return on_route.sort_values(str(sortby), ascending=ascending)['Topics Extract'].tolist()

def html_dict(imgList, extractsList):
    '''Pair each image html with the text at the same position.

    Returns a list of {'img': ..., 'text': ...} dictionaries, one per entry of
    imgList. extractsList is indexed by position, so it must be at least as
    long as imgList.
    '''
    return [
        {'img': image_html, 'text': extractsList[position]}
        for position, image_html in enumerate(imgList)
    ]

In [181]:
# Testing the function
get_airlines('ewr', 'ord', 'Overall')

Unnamed: 0,Airline Name,Average Delay (min),Variance of Delays (min),Overall,Cabin,Entertainment,Wifi,Food/Beverage,Bang for Buck,Topics Extract
0,American Airlines Inc.,5.8,1072.1,3.7,2.5,2.0,1.9,2.3,2.4,"flight cancelled, long haul, flight delayed, m..."
1,United Air Lines Inc.,5.1,754.4,3.4,2.4,2.1,1.9,2.0,2.2,"flight delayed, missed connecting flight, canc..."


In [193]:
#Testing display topics function
# NOTE(review): display_topics only exists as commented-out code in the
# functions cell, so this call raises NameError on a fresh kernel. The output
# shown below presumably comes from an earlier session -- restore the function
# or remove this cell.
display_topics('jfk', 'ord', 'Cabin')

'JetBlue Airways: snacks, drinks, plenty legroom, free wifi, cancelled, delayed flight, customer service, direct tv, seats comfortable, great, helpful'

In [200]:
#Testing get airlines function
get_airlines('LGA', 'ORD', 'Variance of Delays (min)')

Unnamed: 0,Airline Name,Average Delay (min),Variance of Delays (min),Overall,Cabin,Entertainment,Wifi,Food/Beverage,Bang for Buck,Topics Extract
0,Spirit Air Lines,0.6,14.5,2.9,2.1,1.2,1,1.4,2.0,"hidden fees, baggage fees, pay extra, delayed,..."
1,SkyWest Airlines Inc.,0.5,29.1,7.2,3.2,0.8,NR,0.8,2.6,"efficient, highly recommended, efficient check..."
2,United Air Lines Inc.,1.4,90.7,3.4,2.4,2.1,1.9,2.0,2.2,"flight delayed, missed connecting flight, canc..."
3,American Airlines Inc.,2.1,228.0,3.7,2.5,2.0,1.9,2.3,2.4,"flight cancelled, long haul, flight delayed, m..."


In [98]:
# Persist the frontend table to CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by default,
# so a plain pd.read_csv of this file gets an extra leading 'Unnamed: 0' column and
# every positional column index shifts by one -- confirm how the application reads it.
production_df.to_csv('prod_df_v5.csv')