In [34]:
import matplotlib.pyplot as plt
import requests
import numpy as np
import pandas as pd
import json
import operator
from tqdm import tqdm_notebook as tqdm
from pprint import pprint

## Reading & Cleaning Carriers Data

In [35]:
# Load the raw carriers lookup table from the project's Resources folder.
carriers_data = "Resources/Test_carriers_df.csv"
carriers_data_df = pd.read_csv(carriers_data, encoding="utf-8")

# Build the carrier-id -> carrier-name lookup that is later used to label quotes.
# The two columns are addressed by name instead of by position (row[0] / row[1]),
# so the mapping no longer depends on the column order of the CSV, and zipping
# the columns avoids a slow row-by-row iterrows() loop. As with the original
# comprehension, a CarrierId that appears more than once keeps its last Name.
airlines_dict = dict(zip(carriers_data_df['CarrierId'], carriers_data_df['Name']))


In [36]:
# Give the carrier columns their display names
# (https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas)
# and store the ids with the generic object dtype.
carriers_data_df = (
    carriers_data_df
    .rename(columns={'CarrierId': 'Airline ID', 'Name': 'Airline Name'})
    .astype({'Airline ID': object})
)

# Export the clean data frame to csv. The file is written to the same relative
# Resources folder the inputs are read from, instead of an absolute path that
# only exists on one machine. to_csv returns None when given a path, so its
# result is not kept.
carriers_data_df.to_csv("Resources/carriersCLEAN.csv", index=False)

carriers_data_df.head()

Unnamed: 0,Airline ID,Airline Name
0,838,Air France
1,857,Finnair
2,870,jetBlue
3,981,Condor
4,988,Delta


## Reading & Cleaning Quotes Data

In [37]:
# Load the raw outbound-quotes csv and preview its first rows.
quotes_data = "Resources/Test_merged_quotes_outbound.csv"
quotes_data_df = pd.read_csv(filepath_or_buffer=quotes_data, encoding="utf-8")
quotes_data_df.head(n=5)

Unnamed: 0,Direct,MinPrice,OutboundLeg,QuoteDateTime,QuoteId,CarrierIds,DepartureDate,DestinationId,OriginId
0,False,600.0,"{'CarrierIds': [1713], 'OriginId': 58440, 'Des...",2019-07-22T05:55:00,1,[1713],2019-12-01T00:00:00,44759,58440
1,True,1266.0,"{'CarrierIds': [1324], 'OriginId': 58440, 'Des...",2019-07-22T05:55:00,2,[1324],2019-12-01T00:00:00,44759,58440
2,False,2833.0,"{'CarrierIds': [1368], 'OriginId': 58440, 'Des...",2019-07-22T23:10:00,3,[1368],2019-12-02T00:00:00,44759,58440
3,False,345.0,"{'CarrierIds': [1467], 'OriginId': 58440, 'Des...",2019-07-22T05:56:00,4,[1467],2019-12-04T00:00:00,44759,58440
4,True,1296.0,"{'CarrierIds': [1324], 'OriginId': 58440, 'Des...",2019-07-22T05:56:00,5,[1324],2019-12-04T00:00:00,44759,58440


In [38]:
# Clean the quotes table in one pipeline:
#   1. drop the columns that are not used
#      (https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/)
#   2. strip the list brackets from CarrierIds and the midnight time stamp from
#      DepartureDate; regex=False makes the replacements literal on every pandas
#      version ("[" on its own is not a valid regular expression)
#   3. reorder the columns
#      (https://erikrood.com/Python_References/change_order_dataframe_columns_final.html)
#   4. rename them to the display names shared with the other tables
#      (https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas)
#   5. convert Airline ID to an integer
#      (https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/)
#      NOTE(review): the int cast only succeeds while every quote lists exactly
#      one carrier id - confirm against the full data set.
quotes_data_df = (
    quotes_data_df
    .drop(columns=['OutboundLeg', 'QuoteDateTime'])
    .assign(
        CarrierIds=lambda frame: (
            frame['CarrierIds']
            .str.replace("[", "", regex=False)
            .str.replace("]", "", regex=False)
        ),
        DepartureDate=lambda frame: (
            frame['DepartureDate'].str.replace("T00:00:00", "", regex=False)
        ),
    )
    .loc[:, ['OriginId', 'DestinationId', 'MinPrice', 'CarrierIds',
             'Direct', 'DepartureDate', 'QuoteId']]
    .rename(columns={
        'OriginId': 'City ID Departure',
        'DestinationId': 'City ID',
        'MinPrice': 'Min Price',
        'CarrierIds': 'Airline ID',
        'Direct': 'Non-Stop Flight',
        'DepartureDate': 'Departure Date',
        'QuoteId': 'Quote ID',
    })
    .astype({'Airline ID': int})
)

# Export the clean data frame to csv. The file is written to the same relative
# Resources folder the inputs are read from, instead of an absolute path that
# only exists on one machine.
quotes_data_df.to_csv("Resources/quotesCLEAN.csv", index=False)



## Reading & Cleaning Places data

In [43]:
# Load the raw places csv and preview its first rows.
places_data = "Resources/Testplacescsv.csv"
places_data_df = pd.read_csv(filepath_or_buffer=places_data, encoding="utf-8")
places_data_df.head(n=5)

Unnamed: 0,CityId,CityName,CountryName,IataCode,Name,PlaceId,SkyscannerCode,Type
0,PARI,Paris,France,CDG,Paris Charles de Gaulle,44759,CDG,Station
1,HOUA,Houston,United States,IAH,Houston George Bush Intercntl.,58440,IAH,Station


In [44]:
# Discard the place columns that are not used: CityId, Type and SkyscannerCode.
# https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
places_data_df = places_data_df.drop(columns=['CityId', 'Type', 'SkyscannerCode'])

In [45]:
# Reorder the remaining place columns
# (https://erikrood.com/Python_References/change_order_dataframe_columns_final.html)
# and rename them to the display names shared with the other tables
# (https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas).
# PlaceId becomes 'City ID', the key the quotes table is merged on.
places_data_df = (
    places_data_df
    .loc[:, ['PlaceId', 'CityName', 'Name', 'IataCode', 'CountryName']]
    .rename(columns={
        'CityName': 'City Name',
        'CountryName': 'Country Name',
        'IataCode': 'Iata Code',
        'Name': 'Airport Name',
        'PlaceId': 'City ID',
    })
)

# Export the clean data frame to csv. The file is written to the same relative
# Resources folder the inputs are read from, instead of an absolute path that
# only exists on one machine.
places_data_df.to_csv("Resources/placesCLEAN.csv", index=False)

In [46]:
# Preview the cleaned places table.
places_data_df.head(n=5)

Unnamed: 0,City ID,City Name,Airport Name,Iata Code,Country Name
0,44759,Paris,Paris Charles de Gaulle,CDG,France
1,58440,Houston,Houston George Bush Intercntl.,IAH,United States


## Merging 

In [47]:
# Join each place to the quotes that share its 'City ID' (the quotes' destination id).
# The outer join keeps rows from either table that have no match, leaving the
# other table's columns empty for those rows.
merge_1 = places_data_df.merge(quotes_data_df, on='City ID', how='outer')
merge_1.head(n=1000)

Unnamed: 0,City ID,City Name,Airport Name,Iata Code,Country Name,City ID Departure,Min Price,Airline ID,Non-Stop Flight,Departure Date,Quote ID
0,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,600.0,1713.0,False,2019-12-01,1.0
1,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1266.0,1324.0,True,2019-12-01,2.0
2,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2833.0,1368.0,False,2019-12-02,3.0
3,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,345.0,1467.0,False,2019-12-04,4.0
4,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1296.0,1324.0,True,2019-12-04,5.0
5,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,290.0,1467.0,False,2019-12-05,6.0
6,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2961.0,838.0,True,2019-12-05,7.0
7,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,886.0,1755.0,False,2019-12-08,8.0
8,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,388.0,1065.0,False,2019-12-09,9.0
9,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2965.0,838.0,True,2019-12-09,10.0


In [48]:
# Remove every row that has at least one missing value.
# http://www.datasciencemadesimple.com/drop-rows-with-nan-na-drop-missing-value-in-pandas-python-2/
merge_1 = merge_1.dropna(axis=0, how='any')
merge_1



Unnamed: 0,City ID,City Name,Airport Name,Iata Code,Country Name,City ID Departure,Min Price,Airline ID,Non-Stop Flight,Departure Date,Quote ID
0,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,600.0,1713.0,False,2019-12-01,1.0
1,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1266.0,1324.0,True,2019-12-01,2.0
2,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2833.0,1368.0,False,2019-12-02,3.0
3,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,345.0,1467.0,False,2019-12-04,4.0
4,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1296.0,1324.0,True,2019-12-04,5.0
5,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,290.0,1467.0,False,2019-12-05,6.0
6,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2961.0,838.0,True,2019-12-05,7.0
7,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,886.0,1755.0,False,2019-12-08,8.0
8,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,388.0,1065.0,False,2019-12-09,9.0
9,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2965.0,838.0,True,2019-12-09,10.0


In [49]:
# Add the airline name for each quote. The names come from airlines_dict, the
# carrier-id -> carrier-name dictionary built from the carriers data frame, so
# no second merge is needed.
# https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict
#
# assign() returns a new frame with the extra 'Airline Name' column instead of
# writing into the frame returned by dropna(), which avoids pandas'
# SettingWithCopyWarning for that pattern.
merge_1 = merge_1.assign(**{'Airline Name': merge_1['Airline ID'].map(airlines_dict)})
merge_1

Unnamed: 0,City ID,City Name,Airport Name,Iata Code,Country Name,City ID Departure,Min Price,Airline ID,Non-Stop Flight,Departure Date,Quote ID,Airline Name
0,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,600.0,1713.0,False,2019-12-01,1.0,Singapore Airlines
1,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1266.0,1324.0,True,2019-12-01,2.0,KLM
2,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2833.0,1368.0,False,2019-12-02,3.0,Lufthansa
3,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,345.0,1467.0,False,2019-12-04,4.0,Spirit Airlines
4,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1296.0,1324.0,True,2019-12-04,5.0,KLM
5,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,290.0,1467.0,False,2019-12-05,6.0,Spirit Airlines
6,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2961.0,838.0,True,2019-12-05,7.0,Air France
7,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,886.0,1755.0,False,2019-12-08,8.0,Turkish Airlines
8,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,388.0,1065.0,False,2019-12-09,9.0,Frontier Airlines
9,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2965.0,838.0,True,2019-12-09,10.0,Air France


In [50]:
# Export the final merged table to csv. The file is written to the same relative
# Resources folder the inputs are read from, instead of an absolute path that
# only exists on one machine.
merge_1.to_csv("Resources/MergeCleanFINAL.csv", index=False)

merge_1


Unnamed: 0,City ID,City Name,Airport Name,Iata Code,Country Name,City ID Departure,Min Price,Airline ID,Non-Stop Flight,Departure Date,Quote ID,Airline Name
0,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,600.0,1713.0,False,2019-12-01,1.0,Singapore Airlines
1,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1266.0,1324.0,True,2019-12-01,2.0,KLM
2,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2833.0,1368.0,False,2019-12-02,3.0,Lufthansa
3,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,345.0,1467.0,False,2019-12-04,4.0,Spirit Airlines
4,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,1296.0,1324.0,True,2019-12-04,5.0,KLM
5,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,290.0,1467.0,False,2019-12-05,6.0,Spirit Airlines
6,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2961.0,838.0,True,2019-12-05,7.0,Air France
7,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,886.0,1755.0,False,2019-12-08,8.0,Turkish Airlines
8,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,388.0,1065.0,False,2019-12-09,9.0,Frontier Airlines
9,44759,Paris,Paris Charles de Gaulle,CDG,France,58440.0,2965.0,838.0,True,2019-12-09,10.0,Air France
