In [1]:
import re
import time
import yaml
import holidays
import numpy as np
import pandas as pd

from datetime import time
from datetime import date
from datetime import datetime
from datetime import timedelta

from scraper.Parser import Gesucht

from bson.objectid import ObjectId
from scraper.mongo_handler import Mongo

# Show full cell contents (the listing URLs are long). None means "no limit";
# the older -1 sentinel is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [2]:
city = 'berlin'

# Scraper settings are read from scraper/config/<city>.yaml.
with open('scraper/config/{city}.yaml'.format(city=city)) as f:
    # safe_load instead of yaml.load(f): calling yaml.load without an explicit
    # Loader is rejected by PyYAML >= 6 and, on older versions, can construct
    # arbitrary Python objects from the file.
    # NOTE(review): assumes the config only uses plain YAML types — confirm.
    config = yaml.safe_load(f)


# Project helpers, both built from the loaded config: the listing parser and
# the MongoDB wrapper (which receives only the 'mongo' section).
gesucht = Gesucht(config)
mongo = Mongo(config['mongo'])

# Coordinate pair per city — presumably (lat, lng) of the city centre; TODO confirm.
center = {'hamburg': (53.553077, 10.006572), 'berlin': (52.525789, 13.369574)}

In [3]:
# Projection: keep name, coordinates and user_id; drop Mongo's _id.
fields = {'_id': 0, 'name': 1, 'lat': 1, 'lng': 1, 'user_id': 1}
# Prefix destination columns so they are distinguishable from origin columns later on.
destination_name_map = {'lat': 'dest_lat', 'lng': 'dest_lng', 'name': 'dest_name'}

# Load every destination whose user_id is None and rename its columns in one pass.
destinations = (
    pd.DataFrame(list(mongo.db['destinations'].find({"user_id": None}, fields)))
    .rename(columns=destination_name_map)
)

destinations

Unnamed: 0,dest_lat,dest_lng,dest_name,user_id
0,52.511794,13.291066,charlottenburg,
1,52.457048,13.688608,friedrichshagen,
2,52.513812,13.458979,friedrichshain,
3,52.531368,13.588181,hellersdorf,
4,52.580827,13.511707,hohenschoenhausen,
5,52.436953,13.564079,koepenick,
6,52.497732,13.41266,kreuzberg,
7,52.512448,13.49632,lichtenberg,
8,52.518113,13.330475,moabit,
9,52.46791,13.445895,neukoelln,


In [4]:
# Cutoff for "recent" postings. ObjectId.from_datetime() interprets a naive
# datetime as UTC, so the cutoff is built from UTC "now"; using local time
# (datetime.today()) would shift the window by the local UTC offset.
gen_time = datetime.utcnow() - timedelta(days=1, hours=13, minutes=7)

# Only the listing URL and its coordinates are needed as travel origins.
fields = {'_id': 0, 'url': 1, 'lat': 1, 'lng': 1}
# Select documents whose ObjectId is at or after the cutoff-derived ObjectId.
origins = list(mongo.db['berlin'].find({"_id": {"$gte": ObjectId.from_datetime(gen_time)}}, fields))
origins = pd.DataFrame(origins)

origins

Unnamed: 0,lat,lng,url
0,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html
1,52.498954,13.337615,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Wilmersdorf.7128980.html
2,52.462190,13.343817,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Steglitz.7096857.html
3,52.534401,13.401835,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Mitte.4766130.html
4,52.592187,13.351148,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Reinickendorf.7127748.html
5,52.632888,13.265985,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Frohnau.6921710.html
6,52.495718,13.415601,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Kreuzberg.7129065.html
7,52.575339,13.544984,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Kreuzberg.7129086.html
8,52.575339,13.544984,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7129094.html
9,52.435660,13.309730,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Steglitz.7129088.html


In [5]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    ``seq`` must support ``len()`` and slice indexing. The last slice is
    shorter when ``len(seq)`` is not a multiple of ``size``.

    Returns a generator yielding ``seq[start:start + size]`` for
    ``start = 0, size, 2 * size, ...``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Public holidays for the federal state named in the config.
bundestland_feiertage = holidays.Germany(prov=config['bundesland'])

# Candidate travel days: today and the following six days.
week = [date.today() + timedelta(days=n) for n in range(7)]

# Flag days to avoid so that every posting is compared under the same
# ordinary-weekday conditions.
weekend = np.array([day.weekday() >= 5 for day in week])
holiday = np.array([day in bundestland_feiertage for day in week])

# Position of the first day that is neither weekend nor holiday.
# (np.argmax returns 0 if no such day exists within the week.)
offset = np.argmax(~(weekend | holiday))

travel_day = date.today() + timedelta(days=int(offset))
# Fixed departure time for a fair comparison: 10:30 on that working day.
# NOTE(review): when today is the chosen day and it is already past 10:30,
# this timestamp lies in the past — confirm the API accepts that.
travel_time = datetime.combine(travel_day, time(10, 30))

# Chunk sizes give at most 4 x 20 = 80 origin/destination pairs per request.
# NOTE(review): presumably chosen to respect a per-request element limit — confirm.
ORIGINS_PER_REQUEST = 4
DESTINATIONS_PER_REQUEST = 20

documents = []

for origin_chunk in chunker(origins, ORIGINS_PER_REQUEST):
    for destination_chunk in chunker(destinations, DESTINATIONS_PER_REQUEST):
        # Row slices keep their original index labels. Reset them to 0..k-1 so
        # the positional merge below lines up with the API elements for every
        # chunk, not only the first one (later chunks would otherwise merge to
        # zero rows).
        destination_chunk = destination_chunk.reset_index(drop=True)
        print('processing {0} origins, and {1} destinations'.format(len(origin_chunk), len(destination_chunk)))

        dist_matrix = gesucht.gmaps.distance_matrix(
            origin_chunk[['lat', 'lng']].values,
            destination_chunk[['dest_lat', 'dest_lng']].values,
            departure_time=travel_time,
            mode='transit',
        )

        # Pairs each origin with its response row by position; this relies on
        # rows (and the elements inside them) coming back in request order.
        for origin, dist_matrix_row in zip(origin_chunk.itertuples(), dist_matrix['rows']):

            # One record per destination with the raw 'value' of distance and duration.
            # NOTE(review): an element without these keys (e.g. no route found)
            # raises KeyError here — confirm whether that can occur.
            doc = [{k: elem[k]['value'] for k in ['distance', 'duration']} for elem in dist_matrix_row['elements']]

            doc = pd.DataFrame(doc)
            # Attach destination metadata by position, then tag with the origin.
            doc = doc.merge(destination_chunk, left_index=True, right_index=True)
            doc['origin_lat'] = origin.lat
            doc['origin_lng'] = origin.lng
            doc['url'] = origin.url

            documents.append(doc)

processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 4 origins, and 20 destinations
processing 2 origins, and 20 destinations


In [7]:
# Stack the per-origin frames into one table. ignore_index=True gives a fresh
# 0..n-1 index; without it every origin's rows repeat the same labels.
# Note: this rebinds `documents` from a list to a DataFrame, so the cell can
# only be run once per run of the loop cell.
documents = pd.concat(documents, ignore_index=True)

In [9]:
# Write one Mongo document per row of `documents` into the travel_info collection.
travel_records = documents.to_dict(orient='records')
mongo.db['travel_info'].insert_many(travel_records)

<pymongo.results.InsertManyResult at 0x114984188>

In [13]:
# Read back the whole travel_info collection (empty filter) to inspect what was stored.
travel_info_cursor = mongo.db['travel_info'].find({})
pd.DataFrame(list(travel_info_cursor))

Unnamed: 0,_id,dest_lat,dest_lng,dest_name,distance,duration,origin_lat,origin_lng,url,user_id
0,5c558889b451ea3ab3087a73,52.511794,13.291066,charlottenburg,12087,3225,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
1,5c558889b451ea3ab3087a74,52.457048,13.688608,friedrichshagen,21325,3243,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
2,5c558889b451ea3ab3087a75,52.513812,13.458979,friedrichshain,2477,878,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
3,5c558889b451ea3ab3087a76,52.531368,13.588181,hellersdorf,14090,2165,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
4,5c558889b451ea3ab3087a77,52.580827,13.511707,hohenschoenhausen,12234,3948,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
5,5c558889b451ea3ab3087a78,52.436953,13.564079,koepenick,14501,3341,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
6,5c558889b451ea3ab3087a79,52.497732,13.412660,kreuzberg,4472,1580,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
7,5c558889b451ea3ab3087a7a,52.512448,13.496320,lichtenberg,4877,1283,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
8,5c558889b451ea3ab3087a7b,52.518113,13.330475,moabit,8582,2383,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
9,5c558889b451ea3ab3087a7c,52.467910,13.445895,neukoelln,9652,1743,52.519259,13.426941,https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Friedrichshain.7128945.html,
