In [None]:
import numpy as np 
import pandas as pd
import networkx as nx
import collections

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
#from mpl_toolkits.basemap import Basemap as Basemap

Load the US cities data set and extract a clean city name for each metropolitan area.

In [None]:
# Source: https://en.wikipedia.org/wiki/List_of_United_States_metropolitan_areas_by_per_capita_income
# The file's own header row is skipped and replaced with the short names below.
cities_cols = ['rank', 'name', 'population', 'income']
cities_df = pd.read_csv(
    "../input/uscities/uscities.csv",
    names=cities_cols,
    skiprows=1,
)
cities_df.head()

In [None]:
# Only the metro-area name and its income are used by the later cells, so the
# rank and population columns are dropped here.
# `.assign` builds a new frame instead of writing into the column subset,
# which avoids pandas' SettingWithCopyWarning on the numeric conversion.
cities_df = cities_df[['name', 'income']].assign(
    income=lambda frame: pd.to_numeric(frame['income'])
)
cities_df.head()

In [None]:
# Break each "name" entry at ", " into separate columns; column 0 holds the
# text before the first ", ".
new = cities_df["name"].str.split(pat=", ", expand=True)
new.head()

In [None]:
# Split column 0 again, this time on the en dash ("–", not an ASCII hyphen),
# so column 0 of the result is the text before the first dash.
new2 = new[0].str.split(pat="–", expand=True)
new2.head()

In [None]:
# Attach the extracted city name as the join key and discard the raw "name"
# column. `columns=` is used because the positional axis argument of
# DataFrame.drop (`drop('name', 1)`) was removed in pandas 2.0.
# `.assign` returns a new frame, so no column is written into a slice.
cities_df = cities_df.assign(city=new2[0]).drop(columns='name')
cities_df.head()

In [None]:
# Histogram of the income column; title and axis labels are set so the figure
# can be read on its own when the notebook is skimmed.
income_ax = cities_df['income'].hist(bins=50)
income_ax.set(
    title='Income distribution of US metropolitan areas',
    xlabel='income',
    ylabel='number of metropolitan areas',
);

In [None]:
#cities_df['income_log'] = np.log(cities_df['income'])
#cities_df['income_log'].hist(bins=10)

In [None]:
# Load the route table; the file's header row is skipped in favour of the
# column names below.
route_cols = [
    'Airline', 'Airline ID',
    'Source', 'Source Airport ID',
    'Dest', 'Dest Airport ID',
    'Codeshare', 'Stops', 'equipment',
]
routes_df = pd.read_csv(
    "../input/flight-route-database/routes.csv",
    names=route_cols,
    skiprows=1,
)

# Airport IDs are parsed as numbers; entries that cannot be parsed become NaN.
routes_df['Source Airport ID'] = pd.to_numeric(
    routes_df['Source Airport ID'].astype(str), errors='coerce'
)
routes_df['Dest Airport ID'] = pd.to_numeric(
    routes_df['Dest Airport ID'].astype(str), errors='coerce'
)

print(routes_df.shape)
routes_df.head()

In [None]:
# Load the airport table, using the header row found in the file itself.
airport_df = pd.read_csv(
    "../input/openflights-airports-database-2017/airports.csv"
)
print(airport_df.shape)
airport_df.tail()

In [None]:
# Remove airports whose IATA field holds the literal placeholder "\N".
airport_df = airport_df.loc[airport_df['IATA'] != '\\N']
print(airport_df.shape)
airport_df.tail()

In [None]:
# One row per directed (Source, Dest) pair; "counts" is the number of rows in
# routes_df that share that pair. reset_index already returns a DataFrame, so
# no extra pd.DataFrame(...) wrapper is needed.
routes_all = routes_df.groupby(['Source', 'Dest']).size().reset_index(name='counts')

# Airport attributes used by the later filtering and join cells.
# `.copy()` makes each selection an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
airport_all = airport_df[['Name', 'City', 'Country', 'Latitude', 'Longitude', 'IATA']].copy()
IATA_array = airport_all["IATA"].tolist()

# US airports only, restricted to the columns needed for the economy analysis.
airport_us = airport_df.loc[
    airport_df['Country'] == "United States",
    ['Airport ID', 'Name', 'City', 'IATA'],
].copy()

In [None]:
# Show the US airports ordered by city name (display only; nothing is reassigned).
airport_us.sort_values(by='City')

In [None]:
# Size and first rows of the per-pair route table.
print(routes_all.shape)
routes_all.head()

In [None]:
# Keep a route only when both of its endpoints appear in the airport IATA list.
routes_all = routes_all[
    routes_all['Source'].isin(IATA_array)
    & routes_all['Dest'].isin(IATA_array)
]

In [None]:
# Preparation for linking airports that share a city: two copies of the
# (City, Country, IATA) table, one with the code named "Source" and one with
# it named "Dest". Rows with any missing value are discarded.
local_source_ap = (
    airport_all[['City', 'Country', 'IATA']]
    .rename(columns={'IATA': 'Source'})
    .dropna()
)

local_dest_ap = (
    airport_all[['City', 'Country', 'IATA']]
    .rename(columns={'IATA': 'Dest'})
    .dropna()
)

In [None]:
# Collect every airport code that occurs in at least one route.
# ap_set1 starts as the set of sources and ends up as sources ∪ destinations.
ap_set1 = set(routes_all["Source"])
ap_set2 = set(routes_all["Dest"])
print(len(ap_set1))
print(len(ap_set2))
ap_set1 |= ap_set2
print(len(ap_set1))

In [None]:
# Restrict both helper tables to airports that occur in at least one route.
local_source_ap2 = local_source_ap.loc[local_source_ap['Source'].isin(ap_set1)]
local_dest_ap2 = local_dest_ap.loc[local_dest_ap['Dest'].isin(ap_set1)]

In [None]:
# Pair up airports that share the same (City, Country), then drop the pairs
# that join an airport to itself.
local_route = (
    local_source_ap2
    .merge(local_dest_ap2, how='inner', on=['City', 'Country'])
    .query("Source != Dest")
)
local_route.shape

In [None]:
# Stack the same-city pairs underneath the real routes.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps the original index labels and the union of the columns.
routes_all_n_local = pd.concat([routes_all, local_route])
print(routes_all_n_local.shape)

In [None]:
# Drop the helper City/Country columns and give rows without a count
# (the same-city pairs) a count of 1.
routes_all_n_local = (
    routes_all_n_local
    .drop(columns=['City', 'Country'])
    .assign(counts=lambda frame: frame['counts'].fillna(1))
)
routes_all_n_local.head()

In [None]:
# Shape of routes_all (the table without the same-city pairs), for comparison.
print(routes_all.shape)

In [None]:
# Count, for each airport, the rows of routes_all in which it appears as
# Source or as Dest. A row whose Source equals its Dest contributes only once.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
airport_occurrences = pd.concat([
    routes_all['Source'],
    routes_all.loc[routes_all['Source'] != routes_all['Dest'], 'Dest'],
]).value_counts()

# Table of IATA code and occurrence count (indexed by IATA code), joined with
# the airport attributes (name, city, country, coordinates).
counts = pd.DataFrame({
    'IATA': airport_occurrences.index,
    'total_flight': airport_occurrences,
})
pos_data = counts.merge(airport_all, on='IATA')

In [None]:
# Number of airports with at least one route (rows) and number of columns.
print(counts.shape)

In [None]:
# First rows of the per-airport occurrence table.
counts.head()

In [None]:
# First rows of the occurrence table joined with the airport attributes.
pos_data.head()

In [None]:
# Build a directed graph with one edge per (Source, Dest) row of routes_all;
# the "counts" column is stored as an edge attribute.
# NOTE(review): the graph uses routes_all, not routes_all_n_local, so the
# same-city links built earlier are not part of the network — confirm intent.
graph = nx.from_pandas_edgelist(
    routes_all,
    source='Source',
    target='Dest',
    edge_attr='counts',
    create_using=nx.DiGraph(),
)

# nx.info() was removed in networkx 3.0; print the summary explicitly so the
# cell runs on both old and new networkx versions.
print(
    f"{type(graph).__name__} with {graph.number_of_nodes()} nodes "
    f"and {graph.number_of_edges()} edges"
)

# Network Analysis

In [None]:
# How many strongly connected components does the flight network have?
print(nx.number_strongly_connected_components(graph))

# Largest strongly connected component, taken as a subgraph of the network
largest_scc_nodes = max(nx.strongly_connected_components(graph), key=len)
largest_scc = graph.subgraph(largest_scc_nodes)

# Number of airports in that component
print(largest_scc.number_of_nodes())

# Number of airports in the whole network, then the component's share of them
print(graph.number_of_nodes())
print(largest_scc.number_of_nodes() / graph.number_of_nodes())

In [None]:
# Find number of strongly connected components in flight network
# NOTE(review): this repeats the first statement of the previous cell; one of
# the two can probably be removed.
print(nx.number_strongly_connected_components(graph))

In [None]:
# NOTE(review): this cell recomputes what the first cell of this section
# already computed (largest_scc_nodes, largest_scc and the three printouts).

# Largest strongly connected component of the flight network
largest_scc_nodes = max(nx.strongly_connected_components(graph), key=len)
largest_scc = graph.subgraph(largest_scc_nodes)

# Number of airports in the largest strongly connected component
print(len(largest_scc))

# Size of the entire network, then the component's share of its airports
print(len(graph))
print(len(largest_scc) / len(graph))

In [None]:
# Count the weakly connected components of the flight network
print(nx.number_weakly_connected_components(graph))

# Node set of the largest weakly connected component, and its size
largest_wcc = max(nx.weakly_connected_components(graph), key=len)
print(len(largest_wcc))

# Network size, then the share of airports inside that component
print(graph.number_of_nodes())
print(len(largest_wcc) / graph.number_of_nodes())

Centrality

In [None]:
# Degree centrality per airport; print the 50 highest-scoring airports.
deg = nx.degree_centrality(graph)
sort = sorted(deg.items(), key=lambda item: item[1], reverse=True)
print(sort[:50])

In [None]:
# Closeness centrality per airport; print the 30 highest-scoring airports.
clo_cen = nx.closeness_centrality(graph)
sort = sorted(clo_cen.items(), key=lambda item: item[1], reverse=True)
print(sort[:30])

In [None]:
# Betweenness centrality per airport; print the 30 highest-scoring airports.
btw_cen = nx.betweenness_centrality(graph)
sort = sorted(btw_cen.items(), key=lambda item: item[1], reverse=True)
print(sort[:30])

In [None]:
# PageRank per airport; print the 30 highest-scoring airports.
pagerank = nx.pagerank(graph)
sort = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
print(sort[:30])

In [None]:
# Number of rows in the raw route table per destination code, as a Series
# and as a plain dict.
# NOTE(review): no active code in the visible notebook reads DestCnt or
# DestCntDict — confirm whether this cell is still needed.
DestCnt = routes_df['Dest'].value_counts()
DestCntDict = DestCnt.to_dict()

In [None]:
# Size of the US airport table before the centrality columns are added.
airport_us.shape

In [None]:
# Attach each centrality score to the US airports by looking up the IATA code.
# Airports that are not nodes of the graph get NaN.
# `.assign` returns a new frame rather than writing columns into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
centrality_by_column = {
    "deg_cen": deg,
    "clo_cen": clo_cen,
    "btw_cen": btw_cen,
    "pagerank": pagerank,
}
airport_us = airport_us.assign(**{
    column: airport_us["IATA"].map(scores)
    for column, scores in centrality_by_column.items()
})
#airport_us["destCnt"] = airport_us["IATA"].map(DestCntDict)

In [None]:
# Keep only the airports that received a degree-centrality score.
airport_us = airport_us.dropna(subset=['deg_cen'])

In [None]:
# One airport per city for the economy analysis: after sorting by city and
# degree centrality (both descending), the first row of each city is the
# airport with the highest degree centrality.
airport_us = (
    airport_us
    .sort_values(by=['City', 'deg_cen'], ascending=False)
    .drop_duplicates(subset=['City'], keep='first')
)

In [None]:
# Display the per-city airport table with its centrality columns.
airport_us

In [None]:
# Inner join of airports and income data on the city name
# ("City" in the airport table, "city" in the income table).
airport_us2 = pd.merge(
    airport_us,
    cities_df,
    left_on='City',
    right_on='city',
)
airport_us2.head()

In [None]:
# Number of cities that matched between the airport and income tables.
airport_us2.shape

In [None]:
# Columns compared in the correlation analysis: four centralities plus income.
final_df = airport_us2.loc[
    :, ['deg_cen', 'clo_cen', 'btw_cen', 'pagerank', 'income']
]
final_df.head()

In [None]:
# Seaborn pair plot over all columns of final_df (centralities and income).
g = sns.pairplot(final_df)

In [None]:
# Annotated heatmap of the pairwise correlations between the final_df columns.
corrMatrix = final_df.corr()
f = plt.figure(figsize=(10, 8))
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

In [None]:
# Same correlation matrix, shown as a table with a coolwarm background gradient.
corr = final_df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Same correlation matrix, shown as a plain table.
corr = final_df.corr()
corr

# Trial 2: use US cities GDP data

In [None]:
# Source: https://en.wikipedia.org/wiki/List_of_U.S._metropolitan_areas_by_GDP
# NOTE: this rebinds cities_cols, which earlier held the income table's columns.
cities_cols = ['City', 'Area', '2018', '2017', '2016', '2015', '2014', '2013', '2012']
gdp_df = pd.read_csv(
    "../input/uscities/us_gdp.csv",
    names=cities_cols,
    skiprows=1,
)

# The 2014 column is the one used as the GDP figure; only City and GDP are kept.
gdp_df = gdp_df.assign(GDP=pd.to_numeric(gdp_df["2014"]))[['City', 'GDP']]

gdp_df.head()

In [None]:
#airport_us.to_csv("us.csv")
#airport_us3.to_csv("us3.csv")

In [None]:
# Inner join of the per-city airports with the GDP table on the city name.
airport_us3 = pd.merge(
    airport_us,
    gdp_df,
    left_on='City',
    right_on='City',
)
airport_us3.head()

In [None]:
# Columns compared in the second trial: four centralities plus GDP.
# This rebinds final_df, replacing the income-based frame from the first trial.
final_df = airport_us3.loc[
    :, ['deg_cen', 'clo_cen', 'btw_cen', 'pagerank', 'GDP']
]
final_df.head()

In [None]:
# Seaborn pair plot over all columns of final_df (centralities and GDP).
g = sns.pairplot(final_df)

In [None]:
# Annotated heatmap of the pairwise correlations, with cell labels formatted
# using the ".4" format spec.
corrMatrix = final_df.corr()
f = plt.figure(figsize=(10, 8))
sns.heatmap(data=corrMatrix, annot=True, fmt=".4")
plt.show()

In [None]:
# Same correlation matrix, shown as a table with a coolwarm background gradient.
corr = final_df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Same correlation matrix, shown as a plain table.
corr = final_df.corr()
corr

# Notes for report

**(suggested section title for report: Relationship between a city's economy and its airport's importance)**

* Reference for GDP data source: https://en.wikipedia.org/wiki/List_of_U.S._metropolitan_areas_by_GDP

Our hypothesis:
*  	Airline companies develop flight routes based on business supply and demand. 
*  	Cities with more business activities should have busier airports, as they have more passengers visiting for both business and tourism purposes. 
*  	Therefore, we speculate that cities with higher GDP output should have airports that are more connected to the global flight network.
*  	This means the GDP of a city should correlate with its airport’s network centrality values.

Observations:
* 	The correlation matrix supports our hypothesis: we observe strong correlations between cities’ GDP and their airports’ centrality values.
* 	Again, closeness centrality is the network centrality measure that correlates most strongly with GDP (0.6967).
* 	This further confirms our earlier conclusion that closeness centrality is the most useful measure for predicting the importance of airports, as cities with a higher volume of business activity should have more important airports.