In [15]:
import xmltodict
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import requests
import json
import zipfile
import io
from math import sin, cos, sqrt, atan2, radians

In [3]:
def get_xml_for_period(start_date, end_date, timeout=None):
    """Download the accident-card XML export from stat.gibdd.ru for a period.

    The exchange has two steps: a POST to ``getCardsXML`` returns the id of a
    generated archive, and a GET to ``getPDFbyId`` returns that archive as a
    ZIP file from which ``Карточки ДТП.xml`` is extracted.

    Parameters
    ----------
    start_date, end_date : str
        Period boundaries, placed as-is into the query (the notebook passes
        ``'DD.MM.YYYY'`` strings).
    timeout : float or tuple, optional
        Forwarded to both ``requests`` calls. ``None`` (default) means no
        timeout, which matches the previous behaviour.

    Returns
    -------
    bytes
        Raw content of ``Карточки ДТП.xml`` from the downloaded archive.

    Raises
    ------
    requests.HTTPError
        If either request comes back with an error status.
    KeyError
        If the archive does not contain ``Карточки ДТП.xml``.
    """
    # The query travels as a JSON *string* nested under the "data" key.
    # json.dumps escapes the values properly; the compact separators keep the
    # text identical to the previously hand-concatenated payload.
    # NOTE(review): "ParReg"/"reg" look like region codes — confirm with the API.
    query = json.dumps(
        {
            "date_st": str(start_date),
            "date_end": str(end_date),
            "ParReg": "877",
            "order": {"type": 1, "fieldName": "dat"},
            "reg": ["40"],
            "ind": "1",
            "exportType": 1,
        },
        separators=(',', ':'),
    )

    # Step 1: ask the service to build the export and read back its id.
    export_response = requests.post(
        'http://stat.gibdd.ru/getCardsXML',
        json={"data": query},
        timeout=timeout,
    )
    export_response.raise_for_status()
    res_id = int(json.loads(export_response.content)['data'])

    # Step 2: download the generated archive.
    archive_response = requests.get(
        f'http://stat.gibdd.ru/getPDFbyId?data={res_id}',
        timeout=timeout,
    )
    archive_response.raise_for_status()

    # The context manager closes the archive even if the member is missing.
    with zipfile.ZipFile(io.BytesIO(archive_response.content)) as archive:
        return archive.read('Карточки ДТП.xml')

def save_xml_data_to_file(data, start_date, end_date):
    """Write ``data`` to ``../datasets/<start_date>-<end_date>.xml``.

    The file is opened in binary mode, so ``data`` must be bytes-like; an
    existing file with the same name is overwritten.
    """
    target_path = f'../datasets/{start_date}-{end_date}.xml'
    with open(target_path, 'wb') as xml_file:
        xml_file.write(data)

In [4]:
def parse_xml_file(filename):
    """Load accident cards from a saved XML export into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to an XML file whose root is ``dtpCardList`` with one ``tab``
        element per accident card.

    Returns
    -------
    pandas.DataFrame
        One row per card with columns ``type`` (``DTPV``), ``dtp_num``
        (``EMTP_NUMBER``), ``lon`` (``infoDtp/COORD_L``) and ``lat``
        (``infoDtp/COORD_W``); coordinates are converted to ``float``.
        An export without cards yields an empty frame with these columns.

    Raises
    ------
    KeyError
        If the root element or one of the expected card fields is missing.
    ValueError
        If a coordinate cannot be converted to ``float``.
    """
    # Read raw bytes so the XML parser honours the encoding declared in the
    # document instead of the platform's default text encoding.
    with open(filename, 'rb') as fd:
        data = xmltodict.parse(fd.read())

    card_list = data['dtpCardList'] or {}
    cards = card_list.get('tab') or []
    # xmltodict returns a single dict (not a list) when the element occurs
    # only once; normalise so a one-card export is handled like any other.
    if isinstance(cards, dict):
        cards = [cards]

    coords = {
        'type': [],
        'dtp_num': [],
        'lon': [],
        'lat': [],
    }

    # Each 'tab' element is one accident card. A filter on dtp['district']
    # (e.g. keeping only 'Приморский') could be applied here; none is active.
    for dtp in cards:
        info = dtp['infoDtp']
        coords['type'].append(dtp['DTPV'])
        coords['dtp_num'].append(dtp['EMTP_NUMBER'])
        coords['lon'].append(float(info['COORD_L']))
        coords['lat'].append(float(info['COORD_W']))

    return pd.DataFrame(coords)

In [76]:
def get_map(df, center_lon=30.259566, center_lat=60.007568, zoom=12):
    """Show the points of ``df`` as markers on an interactive 800x800 map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lon`` and ``lat`` columns.
    center_lon, center_lat : float, optional
        Initial map centre; the defaults are the coordinates this notebook
        has always used.
    zoom : int or float, optional
        Initial zoom level (default 12).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    OSError
        If ``../mapbox/.mapbox_token`` cannot be read.
    """
    # Close the token file deterministically and drop surrounding whitespace
    # (such as a trailing newline), which is not part of the token.
    with open("../mapbox/.mapbox_token") as token_file:
        mapbox_access_token = token_file.read().strip()

    # Hover text is left unset: not every frame passed here has a 'dtp_num'
    # column (the cluster summary built by get_clusters does not).
    markers = go.Scattermapbox(
        lon=df['lon'],
        lat=df['lat'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=14
        ),
    )
    fig = go.Figure(markers)

    fig.update_layout(
        width=800,
        height=800,
        hovermode='closest',
        mapbox=dict(
            accesstoken=mapbox_access_token,
            bearing=0,
            style='open-street-map',
            center=go.layout.mapbox.Center(
                lon=center_lon,
                lat=center_lat
            ),
            pitch=0,
            zoom=zoom
        )
    )

    fig.show()

In [72]:
# Downloading is disabled; uncomment the next two lines to fetch the export
# again and refresh the local file before it is parsed.
# xml_data = get_xml_for_period('01.08.2020', '15.08.2020')
# save_xml_data_to_file(xml_data, '01.08.2020', '15.08.2020')
df = parse_xml_file(filename='../datasets/01.08.2020-15.08.2020.xml')
get_map(df=df)

In [70]:
def start_new_cluster(df):
    """Open a new cluster seeded with the first unassigned row of ``df``.

    A row is unassigned while its ``cluster_id`` is 0. The first such row (in
    frame order) is given the id ``max(cluster_id) + 1``; ``df`` is modified
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cluster_id``, ``lon`` and ``lat`` columns.
        NOTE(review): row labels are assumed to be unique (as with the
        default RangeIndex) — confirm if frames with other indexes are used.

    Returns
    -------
    tuple or None
        ``(lon, lat, new_cluster_id)`` of the seed row, or ``None`` when no
        unassigned row is left.
    """
    # Vectorised lookup instead of a Python-level iterrows() scan.
    is_unassigned = df['cluster_id'] == 0
    if not is_unassigned.any():
        return None

    # idxmax on a boolean Series gives the label of the first True.
    seed_label = is_unassigned.idxmax()
    new_cluster_id = df['cluster_id'].max() + 1
    df.at[seed_label, 'cluster_id'] = new_cluster_id
    return df.at[seed_label, 'lon'], df.at[seed_label, 'lat'], new_cluster_id

def get_distance_between(lon1, lat1, lon2, lat2):
    """Return the haversine distance in metres between two points.

    All four arguments are angles in degrees: longitude and latitude of the
    first point, then of the second. The Earth is treated as a sphere with
    an approximate radius of 6373 km.
    """
    earth_radius_km = 6373.0

    # Convert every angle to radians in one pass.
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    half_dlon = (lam2 - lam1) / 2
    half_dlat = (phi2 - phi1) / 2

    # Haversine term and the central angle it corresponds to.
    hav = sin(half_dlat)**2 + cos(phi1) * cos(phi2) * sin(half_dlon)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    # Kilometres to metres.
    return earth_radius_km * central_angle * 1000
        
def add_points_to_cluster(df, lon, lat, cluster_id, distance):
    """Assign nearby unassigned rows of ``df`` to ``cluster_id`` in place.

    A row is taken when its ``cluster_id`` is 0 and its distance from
    (``lon``, ``lat``), as computed by ``get_distance_between``, is not
    greater than ``distance``. Rows that already belong to a cluster are
    left untouched. Returns ``None``.
    """
    for label, point in df.iterrows():
        is_free = point['cluster_id'] == 0
        # "not (d > distance)" rather than "d <= distance" keeps the
        # original outcome for any non-comparable (NaN) distance.
        if is_free and not (
            get_distance_between(lon, lat, point['lon'], point['lat']) > distance
        ):
            df.at[label, 'cluster_id'] = cluster_id

def get_center(df):
    """Return the mean longitude and latitude of the rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs numeric ``lon`` and ``lat`` columns.

    Returns
    -------
    tuple
        ``(lon, lat)`` — the arithmetic mean of each column. Missing values
        are left out of both the sum and the count, so a NaN coordinate no
        longer drags the centre towards zero; a column with no valid values
        gives NaN.
    """
    # Series.mean() skips NaN consistently, unlike sum() / len(), where
    # sum() skips NaN but len() still counts those rows.
    return df['lon'].mean(), df['lat'].mean()
    
def get_clusters(df):
    """Summarise the clustered points of ``df``, one row per cluster.

    Clusters are listed in order of first appearance of their id in
    ``df['cluster_id']``. The result has the columns ``cluster_id``, ``lon``
    and ``lat`` (the centre returned by ``get_center`` for the cluster's
    rows) and ``count`` (the number of rows in the cluster).
    """
    ids, center_lons, center_lats, sizes = [], [], [], []

    for current_id in df['cluster_id'].unique():
        members = df[df['cluster_id'] == current_id]
        center_lon, center_lat = get_center(members)
        ids.append(current_id)
        center_lons.append(center_lon)
        center_lats.append(center_lat)
        sizes.append(len(members))

    return pd.DataFrame({
        'cluster_id': ids,
        'lon': center_lons,
        'lat': center_lats,
        'count': sizes,
    })

In [77]:
# Add the cluster column; 0 marks a point that is not in any cluster yet, so
# re-running this cell resets every assignment.
df['cluster_id'] = 0

# Open clusters until no free point is left: each pass seeds a cluster at the
# first free point and absorbs the free points within 1000 of that seed (the
# distance is in metres, as returned by get_distance_between).
new_cluster = start_new_cluster(df)
while new_cluster is not None:
    lon, lat, cluster_id = new_cluster
    add_points_to_cluster(df, lon, lat, cluster_id, 1000)
    new_cluster = start_new_cluster(df)

cluster_df = get_clusters(df)
get_map(cluster_df)