In [1]:
import math
import re
import time
from datetime import datetime

import gmaps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from bs4 import BeautifulSoup
from scipy import stats

from scraper.mongo_handler import Mongo
from scraper.Parser import Gesucht

# Show full cell contents (e.g. long URLs). None disables truncation; the
# former -1 sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
city = 'berlin'

# The per-city YAML file supplies the Google API key and the `mongo` section read below.
with open('scraper/config/{city}.yaml'.format(city=city)) as f:
    # safe_load builds only plain Python types and, unlike a bare yaml.load(f),
    # also works on PyYAML >= 6 where the Loader argument is mandatory.
    # NOTE(review): assumes the config uses no Python-specific YAML tags.
    config = yaml.safe_load(f)

gmaps.configure(api_key=config['google_api_key'])

gesucht = Gesucht(config)
mongo = Mongo(config['mongo'])

# Reference coordinates per city, keyed by the same names `city` can take.
# Presumably (lat, lng) of each city centre.
center = {'hamburg': (53.553077, 10.006572), 'berlin': (52.525789, 13.369574)}

In [None]:
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler

fields = ['url', '_id', 'lat', 'lng']


def _generation_time(object_id):
    """Return the `generation_time` attribute of a document `_id`."""
    return object_id.generation_time


# Fetch every document, highest `_id` first, and keep those carrying all of `fields`.
results = mongo.collection.find({}).sort('_id', -1)
complete_docs = [doc for doc in results if all(key in doc for key in fields)]

apartments = pd.DataFrame({key: [doc[key] for doc in complete_docs] for key in fields})
apartments['timestamp'] = apartments['_id'].apply(_generation_time)

# Distance of each listing from the reference point of the selected city.
city_center = center[city]
apartments['remoteness'] = [
    gesucht.coordinate_distance(row, city_center)
    for _label, row in apartments.iterrows()
]

# Keep only listings whose remoteness z-score has absolute value <= 1.
remoteness_z = stats.zscore(apartments['remoteness'])
apartments = apartments[np.abs(remoteness_z) <= 1]

# `_id` was only needed to derive the timestamp.
del apartments['_id']

In [None]:
def extract_stadtteil(s, city_name=None):
    """Extract the lower-cased district part from a listing URL.

    Everything up to and including the last ``-in-<city>-`` is removed from
    the lower-cased URL, and the remainder is cut at its first ``.``.

    Parameters
    ----------
    s : str
        Listing URL, e.g. ``.../wg-zimmer-in-Berlin-Kreuzberg.6179151.html``.
    city_name : str, optional
        City to strip from the URL. Defaults to the notebook-level ``city``.

    Returns
    -------
    str
        The district part, e.g. ``'kreuzberg'``. If the ``-in-<city>-`` marker
        is absent, nothing is stripped and the result is the URL text before
        its first ``.``.
    """
    if city_name is None:
        city_name = city  # fall back to the city selected in the config cell
    # The URL is lower-cased, so the city must be too; escape it so that
    # regex metacharacters in a city name are matched literally.
    pattern = '.*-in-{city}-'.format(city=re.escape(city_name.lower()))
    return re.sub(pattern, '', s.lower()).split('.')[0]

# Derive the Stadtteil (district) from each listing URL and store it as a categorical column.
stadtteil = apartments['url'].apply(extract_stadtteil)
apartments['Stadtteil'] = stadtteil.astype('category')
# Optional exclusion of selected districts, currently disabled:
#apartments = apartments[~apartments.Stadtteil.isin(['friedrichshagen', 'koepenick', 'mueggelheim'])]

# Eyeball a random handful of rows.
apartments.sample(n=15)

In [None]:
# Number of listings per calendar day, ordered chronologically.
listing_dates = apartments['timestamp'].apply(lambda stamp: stamp.date())
counts = listing_dates.value_counts().sort_index()

plt.plot(counts, '*');

In [None]:
# Summary statistics of the per-day listing counts.
counts.describe()

In [None]:
n_clusters = 25
random_state = 42  # fixed seed so cluster assignments are reproducible across runs

# Partition the listings into spatial clusters on their raw coordinates.
cluster = KMeans(n_clusters=n_clusters, random_state=random_state)
cluster.fit(apartments[['lat', 'lng']])
apartments['cluster'] = cluster.predict(apartments[['lat', 'lng']])

# One record per cluster: its centre, its most frequent Stadtteil and its size.
centroid_records = []
for label, group in apartments.groupby('cluster'):
    lat, lng = cluster.cluster_centers_[label]
    centroid_records.append({
        'lat': lat,
        'lng': lng,
        'name': group.Stadtteil.value_counts().idxmax(),
        'n': len(group),
    })

# Build the frame once from the records instead of concatenating one-row frames.
centroids = pd.DataFrame(centroid_records)


def weighted_average(group):
    """Mean lat/lng of `group`, weighting each row by its listing count `n`."""
    return group[['lat', 'lng']].apply(lambda x: np.average(x, weights=group.n))


def average(group):
    """Unweighted mean lat/lng of `group` (not used in this cell)."""
    return group[['lat', 'lng']].apply(lambda x: np.average(x))


# Several clusters can share the same dominant Stadtteil; merge them into a
# single centroid per name, weighted by cluster size.
centroids = centroids.groupby('name').apply(weighted_average)
centroids = centroids.reset_index()

centroids['user_id'] = None
# NOTE(review): naive local time; confirm whether consumers expect UTC.
centroids['model_timestamp'] = datetime.now()

In [None]:
fig = gmaps.figure()

# One random RGB triple per centroid row, shared by marker fill and stroke.
n_centroids = len(centroids)
colors = [
    tuple(np.random.randint(0, 256) for _channel in range(3))
    for _marker in range(n_centroids)
]

centroid_layer = gmaps.symbol_layer(
    centroids[['lat', 'lng']],
    fill_color=colors,
    stroke_color=colors,
    scale=5,
    info_box_content=centroids.name,
)
fig.add_layer(centroid_layer)
fig

In [None]:
# Persist one document per centroid row into the `destinations` collection.
destination_records = centroids.to_dict('records')
mongo.db['destinations'].insert_many(destination_records)

In [52]:
# NOTE(review): `origin_chunk` is not assigned in any visible cell; the saved
# output below shows a list of dicts with 'url', 'lat' and 'lng' keys.
origin_chunk

[{'url': 'https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Kreuzberg.6179151.html',
  'lat': 52.504184,
  'lng': 13.4278601},
 {'lat': 52.4649012,
  'url': 'https://www.wg-gesucht.de/wg-zimmer-in-Berlin-Treptow--Neukoelln.7130404.html',
  'lng': 13.4800165}]

In [None]:
# Transit distance matrix from every origin to every KMeans cluster centre.
# NOTE(review): `origins` and `travel_time` are not defined in the visible cells.
origin_coords = origins[['lat', 'lng']].values
destination_coords = cluster.cluster_centers_

res = gesucht.gmaps.distance_matrix(
    origin_coords,
    destination_coords,
    mode='transit',
    departure_time=travel_time,
)
res

In [None]:
# Re-shape the distance-matrix response into
# {origin url: {destination name: {'distance': ..., 'time': ...}}}.
# NOTE(review): `origins` and `cluster_names` are not defined in the visible cells;
# `cluster_names` is assumed to be ordered like the destinations of the request.
travel_info = {}
for url, row in zip(origins.url, res['rows']):
    # Elements whose status is not OK carry no distance/duration fields;
    # store None for them instead of failing with a KeyError.
    metrics = [
        {'distance': elem['distance']['value'], 'time': elem['duration']['value']}
        if elem.get('status', 'OK') == 'OK' else None
        for elem in row['elements']
    ]
    travel_info[url] = dict(zip(cluster_names, metrics))

In [None]:
# Display the nested travel-info mapping.
travel_info