In [None]:
%%capture
# Install a pinned networkx release so the notebook runs against a known API;
# the %%capture cell magic above hides pip's console output.
!pip install networkx==2.3

In [None]:
import pandas as pd
import networkx as nx
#%matplotlib notebook
import matplotlib.pyplot as plt
import os
import operator
import warnings
# Silence every warning for the rest of the session (this also hides library
# deprecation notices).
warnings.filterwarnings('ignore')
# Show what is available under the input directory.
print(os.listdir("../input"))

In [None]:
# Only these airport columns are needed; IATA is the key matched against graph nodes
# in the lookup cells further down.
cols_list = ["City", "Country", "IATA"]

# Route table: its 'Source airport' / 'Destination airport' columns become graph edges.
G_df = pd.read_csv('../input/openflights-route-database-2014/routes.csv')

# Airport reference table, restricted to the columns listed above.
airport_df = pd.read_csv(
    '../input/openflights-airports-database-2017/airports.csv',
    usecols=cols_list,
)

In [None]:
# Peek at the first two rows of the route table.
G_df.head(2)

In [None]:
#Count_df.head(2)

In [None]:
# Peek at the first two rows of the airport table.
airport_df.head(2)

In [None]:
# Directed graph built from only the first 1000 route rows; used for the drawing below.
sample_routes = G_df.head(1000)
G_draw = nx.from_pandas_edgelist(
    sample_routes,
    source='Source airport',
    target='Destination airport',
    create_using=nx.DiGraph(),
)

In [None]:
# Draw the sampled route graph with a force-directed layout.
# spring_layout starts from random positions, so pass a fixed seed to get the same
# picture on every Restart & Run All.
plt.figure(figsize=(12,8))
layout = nx.spring_layout(G_draw, seed=42)
nx.draw(G_draw, pos=layout, with_labels=False)

In [None]:
# Full directed route graph: one node per airport code, one edge per distinct
# (source, destination) pair in the route table.
G = nx.from_pandas_edgelist(
    G_df,
    source='Source airport',
    target='Destination airport',
    create_using=nx.DiGraph(),
)

In [None]:
# Print networkx's text summary of the full graph.
print(nx.info(G))

In [None]:
# Two questions about the full graph:
#   1. strong connectivity - can every airport reach every other one along directed routes?
#   2. weak connectivity   - is the network in one piece once edge direction is ignored?
nx.is_strongly_connected(G), nx.is_weakly_connected(G)

## Weakly and strongly connected components

In [None]:
# Size of the largest weakly connected component, followed by its share of all nodes.
largest_wcc = max(nx.weakly_connected_components(G), key=len)
x = len(largest_wcc)
print(x)
print(x / len(G.nodes()))

# Author's observation: about 99% of the graph is weakly connected.

In [None]:
# Size of the largest strongly connected component, followed by its share of all nodes.
largest_scc = max(nx.strongly_connected_components(G), key=len)
x = len(largest_scc)
print(x)
print(x / len(G.nodes()))
# Author's observation: about 97% of the nodes are strongly connected.

## What is the Average shortest path length?
* The average shortest path length is only defined for graphs that are at least weakly connected (otherwise some distances are infinite).
* So we compute the average path length for both the largest weakly connected and the largest strongly connected subgraph.

In [None]:
# Largest strongly connected component as a standalone graph, plus its average
# shortest path length.
# strongly_connected_component_subgraphs() is deprecated and missing from newer
# networkx releases, so take the largest node set and copy its induced subgraph;
# the resulting graph is the same one the old helper produced.
largest_scc_nodes = max(nx.strongly_connected_components(G), key=len)
G_sc = G.subgraph(largest_scc_nodes).copy() #the largest strongly connected subgraph
shortest_sc = nx.average_shortest_path_length(G_sc)
shortest_sc

In [None]:
# Largest weakly connected component as a standalone graph, plus its average
# shortest path length.
# weakly_connected_component_subgraphs() is deprecated and missing from newer
# networkx releases, so take the largest node set and copy its induced subgraph;
# the resulting graph is the same one the old helper produced.
largest_wcc_nodes = max(nx.weakly_connected_components(G), key=len)
G_wc = G.subgraph(largest_wcc_nodes).copy() #the largest weakly connected subgraph
shortest_wc = nx.average_shortest_path_length(G_wc)
shortest_wc

In [None]:
# Average number of edges per node in the strongly connected subgraph.
G_sc.number_of_edges() / G_sc.number_of_nodes()

In [None]:
# Average number of edges per node in the weakly connected subgraph.
G_wc.number_of_edges() / G_wc.number_of_nodes()

Why is the average shortest path length smaller for the **weakly connected** subgraph, even though the number of edges per node is higher in the strongly connected one?

## Density of a network

In [None]:
# Edge density of the full graph next to that of the strongly connected subgraph.
nx.density(G),nx.density(G_sc)

## Degrees

In [None]:
# Degree distribution of the full graph G: for every distinct degree value, the
# fraction of G's nodes that have it.
degrees = dict(G.degree())
degree_list = list(degrees.values())  # built once instead of once per distinct degree
degree_values = sorted(set(degree_list))
# Normalise by G's own node count (the graph the degrees come from) so the
# fractions sum to 1.
node_total = float(nx.number_of_nodes(G))
histogram = [degree_list.count(i) / node_total for i in degree_values]

In [None]:
# Plot the fraction of nodes against the actual degree value; plotting `histogram`
# alone would put the list position on the x axis, not the degree.
plt.plot(degree_values, histogram)
plt.xlabel('degree')
plt.ylabel('fraction of nodes');

## Diameter and radius
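* The weakly connected component contains airport pairs with no directed route between them, so its diameter and radius are infinite; both are therefore computed on the strongly connected component
* The diameter is the greatest number of flights needed on a shortest route between any two airports
* The radius is the smallest eccentricity: the distance from the most centrally placed airport to the airport farthest from it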


In [None]:
# Diameter of G_sc (the largest eccentricity among its nodes); reused by later cells.
diameter=nx.diameter(G_sc)
diameter

In [None]:
# Radius of G_sc (the smallest eccentricity among its nodes); reused by later cells.
radius=nx.radius(G_sc)
radius

## Periphery and Center: which airports are these?
* Periphery: the set of nodes whose eccentricity equals the diameter
* Center: the set of nodes whose eccentricity equals the radius

In [None]:
# Peripheral nodes of G_sc: those whose eccentricity equals the diameter.
per=nx.periphery(G_sc)
per

In [None]:
# Airport details for the peripheral nodes, matched on IATA code.
is_peripheral = airport_df['IATA'].isin(per)
airport_df.loc[is_peripheral]

In [None]:
# Central nodes of G_sc: those whose eccentricity equals the radius.
cen=nx.center(G_sc)
cen

In [None]:
# Airport details for the central nodes, matched on IATA code.
is_central = airport_df['IATA'].isin(cen)
airport_df.loc[is_central]

### Which node in G_sc is connected to the most other nodes by a shortest path of length equal to the
* diameter of G_sc?
* radius of G_sc?

How many nodes are connected to this node?

In [None]:
# Among the peripheral nodes, find the one with the most nodes sitting exactly
# `diameter` hops away from it. The first node reaching the best count wins ties.
max_count = -1
result_node = None
for candidate in per:
    distances = nx.shortest_path_length(G_sc, candidate)
    count = sum(1 for hops in distances.values() if hops == diameter)
    if count > max_count:
        result_node, max_count = candidate, count

result_node, max_count

In [None]:
# Airport details for the node selected in the previous cell.
matches_result = airport_df['IATA'].eq(result_node)
airport_df.loc[matches_result]

In [None]:
# Among the central nodes, find the one with the most nodes sitting exactly
# `radius` hops away from it. The first node reaching the best count wins ties.
d = radius  # kept from the original cell; not read by the loop below
max_count = -1
result_node = None
for candidate in cen:
    distances = nx.shortest_path_length(G_sc, candidate)
    count = sum(1 for hops in distances.values() if hops == radius)
    if count > max_count:
        result_node, max_count = candidate, count

result_node, max_count

In [None]:
# Airport details for the node selected in the previous cell.
matches_result = airport_df['IATA'].eq(result_node)
airport_df.loc[matches_result]

### How many and which nodes need to be removed to disconnect result_node?

In [None]:
# Smallest set of nodes whose removal separates cen[0] from result_node in G_sc,
# shown together with its size.
# The cut is computed once and reused: each call to minimum_node_cut is expensive.
# NOTE(review): result_node was itself picked from `cen`, so it may be the same
# node as cen[0] - confirm the intended source/target pair.
node = result_node
cut_nodes = nx.minimum_node_cut(G_sc, cen[0], node)
list(cut_nodes), len(cut_nodes)

## Transitivity and Average Clustering Coefficient of

* Strongly connected

In [None]:
# Transitivity and mean per-node clustering coefficient of the strongly connected subgraph.
nx.transitivity(G_sc), nx.average_clustering(G_sc)

* Weakly connected

In [None]:
# Transitivity and mean per-node clustering coefficient of the weakly connected subgraph.
nx.transitivity(G_wc), nx.average_clustering(G_wc)

## Indegree and Outdegree
### What are the top and bottom 5 airports with most incoming flights?

In [None]:
# In-degree centrality for every node of G_sc (node -> score); ranked in the next cells.
in_deg=nx.in_degree_centrality(G_sc)

* Top 5

In [None]:
# Five highest in-degree-centrality nodes, then their airport rows.
top5 = sorted(in_deg.items(), key=lambda pair: pair[1], reverse=True)[:5]
l = [code for code, _ in top5]
airport_df.loc[airport_df['IATA'].isin(l)]

* Bottom 5

In [None]:
# Five lowest in-degree-centrality nodes, then their airport rows.
bot5 = sorted(in_deg.items(), key=lambda pair: pair[1])[:5]
l = [code for code, _ in bot5]
airport_df.loc[airport_df['IATA'].isin(l)]

### What are the top and bottom 5 airports with most outgoing flights?

In [None]:
# Out-degree centrality for every node of G_sc (node -> score); ranked in the next cells.
out_deg=nx.out_degree_centrality(G_sc)

* Top 5

In [None]:
# Five (node, score) pairs with the highest out-degree centrality.
top5 = sorted(out_deg.items(), key=lambda pair: pair[1], reverse=True)[:5]
top5

In [None]:
# Airport rows for the top-5 out-degree nodes computed in the previous cell.
l = [code for code, _ in top5]
airport_df.loc[airport_df['IATA'].isin(l)]

* Bottom 5

In [None]:
# Five (node, score) pairs with the lowest out-degree centrality.
bot5 = sorted(out_deg.items(), key=lambda pair: pair[1])[:5]
bot5

In [None]:
# Airport rows for the bottom-5 out-degree nodes computed in the previous cell.
l = [code for code, _ in bot5]
airport_df.loc[airport_df['IATA'].isin(l)]

## Closeness Centrality
### Which airports will allow you to reach all other airports with the lowest average number of airports in between?

In [None]:
# Closeness centrality for every node of G_sc (node -> score), requesting the
# wf_improved variant explicitly.
closeness = nx.closeness_centrality(G_sc, wf_improved=True)

In [None]:
# Five nodes with the highest closeness centrality, then their airport rows.
close = sorted(closeness.items(), key=lambda pair: pair[1], reverse=True)[:5]
l = [code for code, _ in close]
airport_df.loc[airport_df['IATA'].isin(l)]

### Which airports will make you reach all other airports with the highest average number of airports in between?

In [None]:
# Nodes with the lowest closeness centrality, then their airport rows.
# NOTE(review): this cell keeps 18 entries while the sibling cells keep 5 - confirm
# that the different cut-off is intentional.
close = sorted(closeness.items(), key=lambda pair: pair[1])[:18]
l = [code for code, _ in close]
airport_df.loc[airport_df['IATA'].isin(l)]

## Betweenness Centrality
### Which airports often act as bridges between other pairs of airports?

In [None]:
# Normalised betweenness centrality for every node of G_sc (node -> score).
betweeness = nx.betweenness_centrality(G_sc, normalized=True)

In [None]:
# Five nodes with the highest betweenness centrality, then their airport rows.
close = sorted(betweeness.items(), key=lambda pair: pair[1], reverse=True)[:5]
l = [code for code, _ in close]
airport_df.loc[airport_df['IATA'].isin(l)]

### Which airports least often act as bridges between other pairs of airports?

In [None]:
# Five nodes with the lowest betweenness centrality, then their airport rows.
close = sorted(betweeness.items(), key=lambda pair: pair[1])[:5]
l = [code for code, _ in close]
airport_df.loc[airport_df['IATA'].isin(l)]

## Articulation point, Bridges
* A node is considered an articulation point if the removal of that node (along with all its incident edges) increases the number of connected components of a network
* A bridge in a graph is an edge whose removal causes the number of connected components of the graph to increase.

In [None]:
# Articulation points are defined on undirected graphs, so drop edge direction
# from G_sc first; then count how many there are.
undirected_sc = G_sc.to_undirected()
arti = list(nx.articulation_points(undirected_sc))
len(arti)

In [None]:
# Does the undirected version of the graph contain any bridge edge?
# NOTE(review): this runs on the full graph G, whereas the articulation-point cell
# uses G_sc - confirm the difference is intended.
nx.has_bridges(G.to_undirected())

In [None]:
# Number of bridge edges in the undirected version of the graph.
# NOTE(review): computed on the full graph G rather than G_sc - confirm intent.
len(list(nx.bridges(G.to_undirected())))

## PageRank
### 5 airports with highest and lowest pagerank?

In [None]:
# PageRank score for every node of G_sc (node -> score) with damping factor 0.85.
pr = nx.pagerank(G_sc, alpha=0.85)

* Highest 5

In [None]:
# Five nodes with the highest PageRank, then their airport rows.
pager = sorted(pr.items(), key=lambda pair: pair[1], reverse=True)[:5]
l = [code for code, _ in pager]
airport_df.loc[airport_df['IATA'].isin(l)]

* Lowest 5

In [None]:
# Five nodes with the lowest PageRank, then their airport rows.
pager = sorted(pr.items(), key=lambda pair: pair[1])[:5]
l = [code for code, _ in pager]
airport_df.loc[airport_df['IATA'].isin(l)]

## HITS
### Identify hubs and authorities?
Airports with high authority scores have many incoming routes, especially from good hubs.
Good hubs are airports that point to many good authorities.

In [None]:
# HITS scores on G_sc; the result is a pair of dicts, hub scores at index 0 and
# authority scores at index 1.
hits = nx.hits(G_sc)

* Hubs

In [None]:
# Five airports with the highest hub scores (hits[0] holds the hub scores).
# Sort in descending order: an ascending sort would list the five weakest hubs instead.
hubs = sorted(hits[0].items(), key=operator.itemgetter(1), reverse=True)[:5]
l = [code for code, _ in hubs]
airport_df.loc[airport_df['IATA'].isin(l)]

* Authority

In [None]:
# Five airports with the highest authority scores (hits[1] holds the authority scores).
# Sort in descending order: an ascending sort would list the five weakest authorities instead.
auth = sorted(hits[1].items(), key=operator.itemgetter(1), reverse=True)[:5]
l = [code for code, _ in auth]
airport_df.loc[airport_df['IATA'].isin(l)]

## Degrees

In [None]:
# Degree distribution of G_sc: for every distinct degree value, the fraction of
# G_sc's nodes that have it.
degrees = dict(G_sc.degree())
all_degrees = list(degrees.values())
degree_values = sorted(set(all_degrees))
node_total = float(nx.number_of_nodes(G_sc))
histogram = [all_degrees.count(i) / node_total for i in degree_values]

In [None]:
# Plot the fraction of nodes against the actual degree value; plotting `histogram`
# alone would put the list position on the x axis, not the degree.
plt.plot(degree_values, histogram)
plt.xlabel('degree')
plt.ylabel('fraction of nodes');

### This follows a power law distribution

In [None]:
# Summary table: one row per node of G_sc, one column per metric computed earlier.
df = pd.DataFrame(index=G_sc.nodes())
df['clustering'] = pd.Series(nx.clustering(G_sc))
df['in_degree'] = pd.Series(dict(in_deg))
df['out_degree'] = pd.Series(dict(out_deg))
# Compute degree centrality on G_sc like every other column: the score is normalised
# by the node count of the graph it is computed on, so using the full graph G would
# put this column on a different scale from its neighbours.
df['degree_centrality'] = pd.Series(nx.degree_centrality(G_sc))
df['closeness'] = pd.Series(closeness)

In [None]:
# Append the remaining per-node metrics to the summary table, in this column order.
extra_columns = (
    ('betweeness', betweeness),
    ('pr', pr),
    ('hits_hubs', hits[0]),
    ('hits_auth', hits[1]),
)
for column_name, scores in extra_columns:
    df[column_name] = pd.Series(scores)

In [None]:
# Show the first rows of the per-node metrics table.
df.head()