In [1]:
import networkx as nx
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sys
import math
import pickle

In [5]:
stops_with_ids = pd.read_csv("../qgis/stops_with_tile_ids.csv", usecols = ['ID','stop_id'])
stops_with_ids = stops_with_ids.set_index('stop_id')
print(stops_with_ids.shape)
stops_with_ids.head()

(24499, 1)


Unnamed: 0_level_0,ID
stop_id,Unnamed: 1_level_1
132,553
133,949
134,892
135,603
136,572


# Computing centers of hexagons

## Graph 
For complete set of bus and train stops

In [7]:
G = nx.read_gpickle('../graph/complete.graph')

In [8]:
stops_with_ids['degree'] = 0

For the hexagons where no affluence data is available from official **SBB/CFF** sources, we use the max node-degree in the cell as affluence.

### Strategy - choose node with min-average time in tile
For each tile get node with minimum average time to other nodes. 

`for tile:
    min_time = inf.
    for nodes in tile:
        matrix = distance(all_nodes)
        min_time = min(avg_time(matrix), min_time)`

In [9]:
nodes = G.nodes()

errors = 0
for n in nodes:
    try:
        stops_with_ids.loc[n].degree = G.degree(n) 
    except KeyError:
        errors += 1
        
print("Nodes missing in graph: {}/{}, when matched with `stops_with_ids`".format(errors, len(nodes)))

Nodes missing in graph: 276/22056, when matched with `stops_with_ids`


Groupby the hexagon ID and aggregate by max affluence.

In [10]:
grouped_by_degree = stops_with_ids.reset_index().groupby('ID')['stop_id', 'degree'].agg({'degree': max})
grouped_by_degree.columns = grouped_by_degree.columns.droplevel()
# grouped_by_degree = grouped_by_degree[grouped_by_degree.degree != 0]
grouped_by_degree.head()

Unnamed: 0_level_0,stop_id,degree
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2,8595012,0
3,8595953,8
4,8595951,0
5,8595553,0
6,8595952,8


In [11]:
grouped_by_degree[grouped_by_degree.degree != 0].shape[0]

1230

In [12]:
grouped_by_degree[grouped_by_degree.stop_id == 8530043]

Unnamed: 0_level_0,stop_id,degree
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
255,8530043,4


In [13]:
missing = grouped_by_degree[grouped_by_degree.degree == 0].shape[0]
total = grouped_by_degree.shape[0]
print("After grouping, missing affluence data for {}/{} cells".format(missing, total))

After grouping, missing affluence data for 29/1259 cells


## Affluence for available nodes

In [14]:
affluence = pd.read_csv("../gtfs/passagierfrequenz.csv", sep=';')
affluence['x'], affluence['y'] = affluence.geopos.str.split(',', 1).str
affluence = affluence.drop(['Bahnhof_Haltestelle', 'DWV', 'Bemerkungen', 'lod', 'geopos', 'Eigner', 'Bezugsjahr'], axis=1)
affluence.columns = ['Code', 'Affluence', 'x', 'y']
affluence['stop_id'] = 0

In [15]:
affluence.head()

Unnamed: 0,Code,Affluence,x,y,stop_id
0,AAT,770,47.3359563788,8.76561022548,0
1,AE,2100,47.4677359724,7.60305476436,0
2,ALL,3000,46.4757395273,6.39970400408,0
3,AW,3000,47.5504471564,9.30221759023,0
4,ARN,210,47.4420020412,9.25200601938,0


In [16]:
affluence.to_csv('../gtfs/affluence_code.csv')

In [17]:
affluence_with_ids = pd.read_csv("../qgis/affluence_with_tile_ids.csv")

In [18]:
joined_aff = affluence.join(affluence_with_ids, rsuffix='_r').drop(
        ['x', 'y', 'Code_r', 'stop_id', 'Code'], axis=1)
grouped_by_affluence = joined_aff.groupby('ID').agg(max).reset_index()

print("Centers with affluence data: {}".format(grouped_by_affluence.shape[0]))

Centers with affluence data: 371


## Combine results
Is there relation between degree of node and affluence?


In [19]:
affluence = affluence.join(affluence_with_ids, rsuffix='_r').drop(['Code_r'], axis=1)
affluence.head()

Unnamed: 0,Code,Affluence,x,y,stop_id,ID
0,AAT,770,47.3359563788,8.76561022548,0,1039.0
1,AE,2100,47.4677359724,7.60305476436,0,525.0
2,ALL,3000,46.4757395273,6.39970400408,0,50.0
3,AW,3000,47.5504471564,9.30221759023,0,1337.0
4,ARN,210,47.4420020412,9.25200601938,0,1309.0


In [21]:
stops = pd.read_csv('../gtfs/stops.txt').drop(['Unnamed: 0', 'platform_code'], axis=1)

In [22]:
stops.head()

Unnamed: 0,stop_id,stop_lon,stop_lat
0,132,7.68936,47.196374
1,133,8.603653,46.154371
2,134,8.435913,46.538322
3,135,7.773846,46.356888
4,136,7.717215,46.433756


### Get closest stop match by euclidean distance between coordinates -> heuristic 
Result stored in `gtfs/affluence_with_stopid.csv`

**WARNING**: takes some time to compute

In [None]:
MAX_SIZE = sys.maxsize
for i in range(affluence.shape[0]):
    min_ = MAX_SIZE
    min_id = None
    x1 = float(affluence.loc[i].x)
    y1 = float(affluence.loc[i].y)
    for j in range(stops.shape[0]):
        x2 = float(stops.loc[j].stop_lat)
        y2 = float(stops.loc[j].stop_lon)
        dist = math.sqrt(pow(abs(x1-x2),2) + pow(abs(y1 - y2),2))
        if dist < min_:
            min_ = dist
            min_id = stops.loc[j].stop_id
    affluence.set_value(i, 'stop_id', min_id)
    if (i%25 == 0):
        print("{}/{}".format(i, affluence.shape[0]))
        
affluence.to_csv('gtfs/affluence_with_stopid.csv')    

In [23]:
affluence = pd.read_csv('../gtfs/affluence_with_stopid.csv')
affluence.drop(['Unnamed: 0'], axis=1, inplace=True)
affluence.head()

Unnamed: 0,Code,Affluence,x,y,stop_id,ID
0,AAT,770,47.335956,8.76561,8503124,1039.0
1,AE,2100,47.467736,7.603055,8500117,525.0
2,ALL,3000,46.47574,6.399704,8501035,50.0
3,AW,3000,47.550447,9.302218,8506109,1337.0
4,ARN,210,47.442002,9.252006,8506211,1309.0


Merge max affluence nodes for each cell with corresponding stop_id (gtfs format).

In [24]:
affluence_stop_ids = grouped_by_affluence.merge(affluence, left_on=['ID', 'Affluence'],
                          right_on=['ID', 'Affluence'])[['ID', 'stop_id']]

In [25]:
affluence_stop_ids.head()

Unnamed: 0,ID,stop_id
0,3.0,8501001
1,6.0,8501003
2,11.0,8516155
3,12.0,8501008
4,22.0,8501022


Combine the results from affluence and degree to one collection

In [26]:
cell_centers = grouped_by_degree.drop(['degree'], axis=1)
for i in range(affluence_stop_ids.shape[0]):
    cell_id = affluence_stop_ids.loc[i].ID
    stop_id = affluence_stop_ids.loc[i].stop_id
    cell_centers.set_value(cell_id, 'stop_id', stop_id)
    

In [27]:
cell_centers.head()

Unnamed: 0_level_0,stop_id
ID,Unnamed: 1_level_1
2,8595012.0
3,8501001.0
4,8595951.0
5,8595553.0
6,8501003.0


Get list of all centers

In [28]:
lst_centers = list(map(lambda x: int(x), cell_centers.stop_id.sort(inplace=False)))

In [29]:
nodes = G.nodes()
filt_centers = []
for c in lst_centers:
    if c in nodes:
        filt_centers.append(c)
    else: 
        pass

In [30]:
with open('../res/center_nodes', 'wb') as f:
    pickle.dump(filt_centers, f)