In [1]:
import xmltodict
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import requests
import json
import zipfile
import io
import os
from math import sin, cos, sqrt, atan2, radians
import datetime
import calendar
import itertools

**Download functions**

In [2]:
def get_xml_for_period(start_date, end_date, timeout=60):
    """Download the accident-card XML for one date range from stat.gibdd.ru.

    The service is queried in two steps: a POST to ``getCardsXML`` whose
    JSON answer carries an export id in its ``data`` field, then a GET to
    ``getPDFbyId`` that returns a ZIP archive holding ``Карточки ДТП.xml``.

    Args:
        start_date: First day of the period as a string (the caller in this
            notebook formats it as ``dd.mm.YYYY``).
        end_date: Last day of the period, same format.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        bytes: Raw content of the XML member of the downloaded archive.

    Raises:
        requests.HTTPError: If either request answers with an error status.
    """
    # The endpoint expects a JSON body whose "data" value is itself a
    # JSON-encoded string. json.dumps with compact separators produces the
    # same text as manual concatenation, without hand-written quoting.
    query = json.dumps(
        {
            'date_st': start_date,
            'date_end': end_date,
            'ParReg': '877',
            'order': {'type': 1, 'fieldName': 'dat'},
            'reg': ['40'],
            'ind': '1',
            'exportType': 1,
        },
        separators=(',', ':'),
    )

    # Step 1: request the export and read back its id.
    response = requests.post('http://stat.gibdd.ru/getCardsXML',
                             json={'data': query}, timeout=timeout)
    response.raise_for_status()
    res_id = int(json.loads(response.content)['data'])

    # Step 2: fetch the archive produced for that id.
    response = requests.get(f'http://stat.gibdd.ru/getPDFbyId?data={res_id}',
                            timeout=timeout)
    response.raise_for_status()

    # The context manager closes the archive even if the member is missing.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        return archive.read('Карточки ДТП.xml')

def save_xml_data_to_file(data, start_date, end_date, dataset_dir='../datasets'):
    """Write downloaded XML bytes to ``<dataset_dir>/<start_date>-<end_date>.xml``.

    Args:
        data: Bytes to store.
        start_date: Period start as a string; becomes the first part of the
            file name.
        end_date: Period end as a string; becomes the second part of the
            file name.
        dataset_dir: Target directory. It is created when missing, so the
            download also works on a fresh checkout.
    """
    os.makedirs(dataset_dir, exist_ok=True)
    filename = f'{dataset_dir}/{start_date}-{end_date}.xml'
    with open(filename, 'wb') as fd:
        fd.write(data)
        
def download_dtp_info(year, month):
    """Download and store all accident cards of one month in 14-day chunks.

    The month is split into consecutive windows of at most 14 days; each
    window is fetched with ``get_xml_for_period`` and written with
    ``save_xml_data_to_file``.

    Args:
        year: Calendar year (int).
        month: Calendar month, 1-12 (int).
    """
    step = 14
    date_format = '%d.%m.%Y'
    days_in_month = calendar.monthrange(year, month)[1]

    # Iterate over window start days up to and including the last day of the
    # month, so a trailing one-day window (29 February) is not skipped.
    for day in range(1, days_in_month + 1, step):
        end_day = min(day + step - 1, days_in_month)
        start_string = datetime.date(year, month, day).strftime(date_format)
        end_string = datetime.date(year, month, end_day).strftime(date_format)
        xml_string = get_xml_for_period(start_string, end_string)
        save_xml_data_to_file(xml_string, start_string, end_string)

**Download data**

In [60]:
# Download step, left disabled. As written it would fetch only December 2019
# (range(12, 13) yields just 12); widen the range and uncomment to re-download.
# for month in range(12, 13):
#     download_dtp_info(2019, month)
#     print(f'Downloaded for 2019-{month}')

**Parse functions**

In [35]:
def parse_xml_file(filename):
    """Parse one downloaded accident-card XML file into a DataFrame.

    Args:
        filename: Path of the XML file to read.

    Returns:
        pandas.DataFrame: One row per ``<tab>`` element with the columns
        ``type`` (``DTPV``), ``dtp_num`` (``EMTP_NUMBER``), ``lon``
        (``infoDtp/COORD_L``), ``lat`` (``infoDtp/COORD_W``), ``district``,
        ``meta_info`` (``'dtp_num=<number>'``) and ``color`` (always 0).
        The frame has these columns but no rows when the file holds no cards.
    """
    # Read raw bytes so the XML parser honours the encoding declared in the
    # document instead of the platform's default text encoding.
    with open(filename, 'rb') as fd:
        data = xmltodict.parse(fd.read())

    cards = data['dtpCardList'] or {}
    tabs = cards.get('tab') or []
    if isinstance(tabs, dict):
        # xmltodict returns a single mapping rather than a list when the
        # document contains exactly one <tab> element.
        tabs = [tabs]

    dtp_dict = {
        'type': [],
        'dtp_num': [],
        'lon': [],
        'lat': [],
        'district': [],
    }

    # Each <tab> element describes one accident.
    for dtp in tabs:
        info = dtp['infoDtp']
        dtp_dict['type'].append(dtp['DTPV'])
        dtp_dict['district'].append(dtp['district'])
        dtp_dict['dtp_num'].append(dtp['EMTP_NUMBER'])
        dtp_dict['lon'].append(float(info['COORD_L']))
        dtp_dict['lat'].append(float(info['COORD_W']))

    df = pd.DataFrame(dtp_dict)
    df['meta_info'] = [f'dtp_num={i}' for i in df['dtp_num']]
    df['color'] = 0
    return df

def parse_xml_files(dataset_dir, years, months):
    """Parse every stored XML file whose period starts in a requested month.

    File names are expected to look like ``dd.mm.YYYY-dd.mm.YYYY.xml``; the
    part before the first ``-`` is read as the period start date. Entries
    whose name does not start with such a date are ignored.

    Args:
        dataset_dir: Directory holding the downloaded XML files.
        years: Iterable of years to include.
        months: Iterable of month numbers (1-12) to include.

    Returns:
        pandas.DataFrame: Concatenation of ``parse_xml_file`` results for all
        matching files, or an empty ``DataFrame`` when nothing matches.
    """
    wanted = {(int(year), int(month))
              for year, month in itertools.product(years, months)}

    frames = []
    # Sorted so the row order does not depend on the directory listing order.
    for file_name in sorted(os.listdir(dataset_dir)):
        try:
            start = datetime.datetime.strptime(file_name.split('-')[0], '%d.%m.%Y')
        except ValueError:
            continue
        # Compare parsed numbers rather than substrings: the text '1.2020-'
        # also occurs inside '11.2020-', which mixed November into January.
        if (start.year, start.month) in wanted:
            frames.append(parse_xml_file(f'{dataset_dir}/{file_name}'))

    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame for every file.
    return pd.concat(frames, ignore_index=True)

**Parse data**

In [43]:
# Directory the download step writes its XML files to.
dataset_dir = '../datasets'
# Load every accident card stored for 2020 (all twelve months).
data_df = parse_xml_files(dataset_dir, years=[2020], months=range(1, 13))

**Map functions**

In [24]:
def get_colorscale():
    """Return the marker colorscale as a list of ``[stop, 'rgb(...)']`` pairs.

    Stops run from 0 to 1 in steps of 0.1; the first two stops share the same
    colour.
    """
    stops = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    colors = (
        "rgb(255, 186, 8)",
        "rgb(255, 186, 8)",
        "rgb(250, 163, 7)",
        "rgb(244, 140, 6)",
        "rgb(232, 93, 4)",
        "rgb(220, 47, 2)",
        "rgb(208, 0, 0)",
        "rgb(157, 2, 8)",
        "rgb(106, 4, 15)",
        "rgb(55, 6, 23)",
        "rgb(3, 7, 30)",
    )
    return [[stop, color] for stop, color in zip(stops, colors)]

def get_map(df, style='open-street-map', colorscale=False,
            token_path="../mapbox/.mapbox_token"):
    """Show the rows of ``df`` as markers on an 800x800 Mapbox figure.

    Args:
        df: DataFrame with the columns ``lon``, ``lat``, ``color`` (marker
            colour values) and ``meta_info`` (marker text).
        style: Mapbox style name passed to the layout.
        colorscale: When true, the marker colour bar is given an (empty)
            title entry; otherwise an empty colour-bar dict is passed.
        token_path: File containing the Mapbox access token.
    """
    # Close the token file deterministically and drop surrounding whitespace
    # (a trailing newline would otherwise become part of the token).
    with open(token_path) as token_file:
        mapbox_access_token = token_file.read().strip()

    colorbar = {}
    if colorscale:
        colorbar['title'] = ''

    fig = go.Figure(go.Scattermapbox(
            lon=df['lon'],
            lat=df['lat'],
            mode='markers',
            marker=go.scattermapbox.Marker(
                size=10,
                color=df['color'],
                colorscale=get_colorscale(),
                colorbar=colorbar,
            ),
            text=df['meta_info'],
        ))

    fig.update_layout(
        width=800,
        height=800,
        hovermode='closest',
        mapbox=dict(
            accesstoken=mapbox_access_token,
            bearing=0,
            style=style,
            # Fixed initial view used by every map in this notebook.
            center=go.layout.mapbox.Center(
                lon=30.259566,
                lat=60.007568
            ),
            pitch=0,
            zoom=12
        )
    )

    fig.show()

**Display data on map**

In [44]:
# Plot every parsed accident with the default map style; parse_xml_file sets
# `color` to 0 for all rows, so all markers share one colour.
get_map(data_df)

In [138]:
def get_stat_district(years=range(2016, 2021), months=range(1, 13), data_dir=None):
    """Count accidents per district for every requested month.

    Args:
        years: Years to scan (defaults to 2016-2020).
        months: Month numbers to scan (defaults to 1-12).
        data_dir: Directory with the XML files. When omitted, the
            notebook-level ``dataset_dir`` variable is used.

    Returns:
        pandas.DataFrame: Columns ``district``, ``month``, ``count`` and
        ``year``; one row per district and month that has data. Months for
        which ``parse_xml_files`` returns no ``district`` column are skipped.
    """
    if data_dir is None:
        data_dir = dataset_dir

    data_dict = {
        'district': [],
        'month': [],
        'count': [],
        'year': [],
    }
    for year in years:
        for month in months:
            df = parse_xml_files(data_dir, [year], [month])
            if 'district' not in df:
                continue
            # One grouped pass instead of a boolean filter per district;
            # sort=False keeps districts in order of first appearance.
            per_district = df.groupby('district', sort=False).size()
            for district, cnt in per_district.items():
                data_dict['district'].append(district)
                data_dict['month'].append(month)
                data_dict['count'].append(int(cnt))
                data_dict['year'].append(year)
    return pd.DataFrame(data_dict)

def show_stat_district_month(df, sort_year=2019):
    """Show grouped box plots of monthly accident counts per district.

    One box trace is drawn per year between the smallest and largest year
    present. Districts are ordered by their mean monthly count in
    ``sort_year`` (descending); districts without rows in that year are left
    out.

    Args:
        df: DataFrame with the columns ``district``, ``month``, ``count``
            and ``year``.
        sort_year: Year whose averages define the district order. When ``df``
            has no rows for it, all rows are used for ranking instead.
    """
    reference = df[df['year'] == sort_year]
    if reference.empty:
        # Without this fallback an absent reference year left nothing to plot
        # and the year lookup below failed.
        reference = df
    names = (reference.groupby('district')['count'].mean()
             .sort_values(ascending=False).index)
    rank = {name: position for position, name in enumerate(names)}

    # Keep ranked districts only and order the rows by rank in one pass;
    # mergesort is stable, so rows of one district keep their relative order.
    ordered = df[df['district'].isin(names)]
    ordered = (ordered.assign(_rank=ordered['district'].map(rank))
               .sort_values('_rank', kind='mergesort')
               .drop(columns='_rank'))

    fig = go.Figure()
    if not ordered.empty:
        min_year = int(ordered['year'].min())
        max_year = int(ordered['year'].max())
        for year in range(min_year, max_year + 1):
            filtered = ordered[ordered['year'] == year]
            fig.add_trace(go.Box(x=filtered['district'],
                                 y=filtered['count'], name=f'{year}'))
    fig.update_layout(
        title='Month avg',
        xaxis_title="District",
        yaxis_title="Accidents number",
        boxmode='group'
    )
    fig.show()
    
def show_stat_district_year(df, sort_year=2019):
    """Show a bar chart of yearly accident totals per district.

    One bar trace is drawn per year between the smallest and largest year
    present. Districts are ordered by their total count in ``sort_year``
    (descending); districts without rows in that year are left out.

    Args:
        df: DataFrame with the columns ``district``, ``month``, ``count``
            and ``year``.
        sort_year: Year whose totals define the district order. When ``df``
            has no rows for it, all rows are used for ranking instead.
    """
    reference = df[df['year'] == sort_year]
    if reference.empty:
        # Without this fallback an absent reference year left nothing to plot
        # and the year lookup below failed.
        reference = df
    names = list(reference.groupby('district')['count'].sum()
                 .sort_values(ascending=False).index)

    ranked = df[df['district'].isin(names)]

    fig = go.Figure()
    if not ranked.empty:
        min_year = int(ranked['year'].min())
        max_year = int(ranked['year'].max())
        for year in range(min_year, max_year + 1):
            yearly = ranked[ranked['year'] == year]
            # One grouped sum per year; districts without rows that year
            # get 0, matching the sum of an empty selection.
            totals = (yearly.groupby('district')['count'].sum()
                      .reindex(names, fill_value=0))
            fig.add_trace(go.Bar(x=names,
                                 y=totals.tolist(), name=f'{year}'))
    fig.update_layout(
        title='Year sum',
        xaxis_title="District",
        yaxis_title="Accidents number",
    )
    fig.show()

In [139]:
# Build the per-district statistics in this cell so that it also runs after
# "Restart Kernel -> Run All"; the plot below needs `df_districts`.
df_districts = get_stat_district()
# show_stat_district_month(df_districts)
show_stat_district_year(df_districts)

Index(['Выборгский район', 'Калининский район', 'Невский район',
       'Приморский район', 'Московский район', 'Центральный район',
       'Красногвардейский район', 'Кировский район', 'Красносельский район',
       'Фрунзенский район', 'Адмиралтейский район', 'Пушкинский район',
       'Петроградский район', 'Василеостровский район', 'Колпинский район',
       'Курортный район', 'Петродворцовый район', 'Кронштадтский район'],
      dtype='object', name='district')


**Clusterization functions**

In [39]:
def start_new_cluster(df, cluster_id):
    """Seed a new cluster with the first row that is still unassigned.

    A row counts as unassigned while its ``cluster_id`` equals 0. The first
    such row (in frame order) is marked with ``cluster_id`` in place.

    Args:
        df: DataFrame with ``cluster_id``, ``lon`` and ``lat`` columns;
            modified in place.
        cluster_id: Id to write into the seed row.

    Returns:
        ``(lon, lat, cluster_id)`` of the seed row, or ``None`` when every
        row already belongs to a cluster.
    """
    unassigned = (df['cluster_id'] == 0).values
    if not unassigned.any():
        return None

    # argmax on a boolean array gives the position of the first True.
    position = int(unassigned.argmax())
    df.at[df.index[position], 'cluster_id'] = cluster_id
    return df['lon'].iloc[position], df['lat'].iloc[position], cluster_id

def get_distance_between(lon1, lat1, lon2, lat2):
    """Return the haversine distance in metres between two points.

    Coordinates are given in degrees. An earth radius of 6373 km is used.
    """
    earth_radius_km = 6373.0

    lam1, phi1 = radians(lon1), radians(lat1)
    lam2, phi2 = radians(lon2), radians(lat2)

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    hav = sin(delta_phi / 2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    # Kilometres -> metres.
    return earth_radius_km * central_angle * 1000
        
def add_points_to_cluster(df, lon, lat, cluster_id, distance):
    """Grow a cluster from a seed point by chaining nearby unassigned rows.

    Every row whose ``cluster_id`` is 0 and that lies within ``distance`` of
    the seed, or of any row already added by this call, is marked with
    ``cluster_id`` in place. Distances come from ``get_distance_between``.

    The search keeps an explicit stack of points to expand instead of
    recursing once per added point, so long chains cannot exceed Python's
    recursion limit.

    Args:
        df: DataFrame with ``lon``, ``lat`` and ``cluster_id`` columns;
            modified in place.
        lon: Longitude of the seed point.
        lat: Latitude of the seed point.
        cluster_id: Id to assign to reachable rows.
        distance: Maximum distance between chained points, in the unit
            returned by ``get_distance_between``.
    """
    labels = list(df.index)
    lons = df['lon'].tolist()
    lats = df['lat'].tolist()
    # Positions of rows that do not belong to any cluster yet.
    free = [pos for pos, current in enumerate(df['cluster_id'].tolist())
            if current == 0]

    frontier = [(lon, lat)]
    while frontier and free:
        cur_lon, cur_lat = frontier.pop()
        still_free = []
        for pos in free:
            if get_distance_between(cur_lon, cur_lat, lons[pos], lats[pos]) > distance:
                still_free.append(pos)
                continue
            df.at[labels[pos], 'cluster_id'] = cluster_id
            # Newly added points are expanded later from the stack.
            frontier.append((lons[pos], lats[pos]))
        free = still_free

def get_center(df):
    """Return ``(lon, lat)`` computed as column sum divided by row count."""
    lon, lat = (df[axis].sum() / len(df[axis]) for axis in ('lon', 'lat'))
    return lon, lat
    
def get_clusters(df):
    """Summarise clustered points into one row per cluster.

    Args:
        df: DataFrame with ``cluster_id``, ``lon`` and ``lat`` columns.

    Returns:
        pandas.DataFrame: Columns ``cluster_id``, ``lon`` and ``lat`` (the
        centre from ``get_center``), ``count`` (rows in the cluster),
        ``meta_info`` (``'count=<n>'``) and ``color`` (same as ``count``).
        Clusters appear in order of first occurrence in ``df``.
    """
    ids = list(df['cluster_id'].unique())
    members = [df[df['cluster_id'] == current_id] for current_id in ids]
    centers = [get_center(group) for group in members]

    summary = pd.DataFrame({
        'cluster_id': ids,
        'lon': [center[0] for center in centers],
        'lat': [center[1] for center in centers],
        'count': [len(group) for group in members],
    })
    summary['meta_info'] = [f'count={size}' for size in summary['count']]
    summary['color'] = summary['count']
    return summary

# def clusterize_dtp(dtp_df, distance):
#     df = dtp_df.copy()
#     df['cluster_id'] = 0
#     while True:
#         new_cluster = start_new_cluster(df)
#         if not new_cluster:
#             break
#         lon, lat, cluster_id = new_cluster
#         add_points_to_cluster(df, lon, lat, cluster_id, distance)
#     cluster_df = get_clusters(df)
#     return cluster_df

def clusterize_dtp(dtp_df, distance):
    """Cluster accidents district by district and summarise the clusters.

    Within each district, rows are chained into clusters by
    ``start_new_cluster`` and ``add_points_to_cluster``; cluster ids are
    numbered from 1 and are unique across districts. The input frame is not
    modified.

    Args:
        dtp_df: DataFrame with ``district``, ``lon`` and ``lat`` columns.
            An empty frame (for example a month without files) is accepted.
        distance: Chaining threshold passed on to ``add_points_to_cluster``.

    Returns:
        pandas.DataFrame: Result of ``get_clusters`` for the clustered rows;
        for empty input, the result of ``get_clusters`` on an empty frame.
    """
    if dtp_df.empty or 'district' not in dtp_df:
        # Nothing to cluster: hand an empty, correctly shaped frame to the
        # summary step instead of failing on the missing columns.
        return get_clusters(pd.DataFrame(columns=['cluster_id', 'lon', 'lat']))

    df = dtp_df.copy()
    df['cluster_id'] = 0

    clustered = []
    new_cluster_id = 1
    for district in df['district'].unique():
        # Explicit copy: the helpers write cluster ids into this frame.
        df_distr = df[df['district'] == district].copy()
        while True:
            new_cluster = start_new_cluster(df_distr, new_cluster_id)
            if not new_cluster:
                break
            new_cluster_id += 1
            lon, lat, cluster_id = new_cluster
            add_points_to_cluster(df_distr, lon, lat, cluster_id, distance)
        clustered.append(df_distr)

    # A single concat avoids re-copying the accumulated frame per district.
    return get_clusters(pd.concat(clustered, ignore_index=True))

**Cluster data**

In [45]:
# Chain accidents into clusters using a threshold of 50; the threshold is
# compared with get_distance_between(), which returns metres.
cluster_df = clusterize_dtp(data_df, 50)

**Display clusters on map**

In [47]:
# Show only clusters holding more than 4 accidents; get_clusters copies the
# count into `color`, so marker colour encodes the cluster size.
get_map(cluster_df[cluster_df['count']>4], style='light', colorscale=True)

In [16]:
def get_stat_for_points(points, years, months, cluster_distance=50, match_radius=100):
    """Count accidents near each named point for every requested month.

    For each month the stored accidents are parsed and clustered once; a
    point's monthly count is the sum of the ``count`` of all clusters whose
    centre lies closer than ``match_radius`` to the point. Files are read
    from the notebook-level ``dataset_dir``.

    Args:
        points: Iterable of ``(lon, lat, name)`` tuples.
        years: Iterable of years.
        months: Iterable of month numbers.
        cluster_distance: Threshold passed to ``clusterize_dtp``.
        match_radius: Maximum distance between a point and a cluster centre,
            in the unit returned by ``get_distance_between``.

    Returns:
        pandas.DataFrame: Columns ``point_name``, ``count``, ``year`` and
        ``month``; one row per point, year and month, in that nesting order.
    """
    # Materialise the inputs: they are iterated once per point.
    points = list(points)
    years = list(years)
    months = list(months)

    clusters_by_period = {}

    def clusters_for(year, month):
        """Return the cluster frame of one month, computing it only once."""
        key = (year, month)
        if key not in clusters_by_period:
            month_df = parse_xml_files(dataset_dir, [year], [month])
            # A month without data has no clusters; skip clustering.
            clusters_by_period[key] = (
                None if month_df.empty else clusterize_dtp(month_df, cluster_distance))
        return clusters_by_period[key]

    data_dict = {
        'point_name': [],
        'count': [],
        'year': [],
        'month': [],
    }

    for lon, lat, name in points:
        for year in years:
            for month in months:
                cnt = 0
                clusters = clusters_for(year, month)
                if clusters is not None:
                    centers = zip(clusters['lon'], clusters['lat'], clusters['count'])
                    for c_lon, c_lat, c_count in centers:
                        if get_distance_between(lon, lat, c_lon, c_lat) < match_radius:
                            cnt += c_count
                data_dict['point_name'].append(name)
                data_dict['count'].append(cnt)
                data_dict['year'].append(year)
                data_dict['month'].append(month)

    df = pd.DataFrame(data_dict)
    return df

def show_stat_month(df):
    """Show one stacked bar chart per point with monthly counts by year.

    Args:
        df: DataFrame with the columns ``point_name``, ``year``, ``month``
            and ``count``.
    """
    for name in df['point_name'].unique():
        by_name = df[df['point_name']==name]
        fig = go.Figure()
        for year in sorted(by_name['year'].unique()):
            # sort_values returns a new frame; keep the result so the bars
            # really are ordered by month.
            filtered = by_name[by_name['year']==year].sort_values('month')
            fig.add_trace(go.Bar(x=filtered['month'], y=filtered['count'], name=f'{year}',
                                text=filtered['count'], textposition='auto'))
        fig.update_layout(
            title=name,
            xaxis_title="Month",
            yaxis_title="Accidents number",
            barmode='stack',
        )
        fig.show()

def show_stat_month_avg(df):
    """Show grouped box plots of monthly counts per point, one trace per year.

    Args:
        df: DataFrame with the columns ``point_name``, ``year`` and
            ``count``. All sample points are drawn next to the boxes.
    """
    figure = go.Figure()
    year_column = df['year']
    for current_year in sorted(year_column.unique()):
        rows = df[year_column == current_year]
        box = go.Box(x=rows['point_name'], y=rows['count'], name=f'{current_year}')
        figure.add_trace(box)

    layout = {
        'title': 'Year stat',
        'xaxis_title': "Point name",
        'yaxis_title': "Accidents number",
        'boxmode': 'group',
    }
    figure.update_layout(**layout)
    figure.update_traces(boxpoints='all')
    figure.show()
    
def show_stat_year(df):
    """Show a bar chart of yearly accident totals per point, one trace per year.

    Args:
        df: DataFrame with the columns ``point_name``, ``year`` and
            ``count``.
    """
    figure = go.Figure()
    for current_year in sorted(df['year'].unique()):
        rows = df[df['year'] == current_year]
        names = rows['point_name'].unique()
        # Total of the monthly counts for each point in this year.
        totals = [rows[rows['point_name'] == point]['count'].sum() for point in names]
        figure.add_trace(go.Bar(x=names, y=totals, name=f'{current_year}'))

    layout = {
        'title': 'Year stat',
        'xaxis_title': "Point name",
        'yaxis_title': "Accidents number",
    }
    figure.update_layout(**layout)
    figure.show()

In [21]:
# Locations to analyse, each given as (lon, lat, name).
points = [
    (30.297945, 60.002919, 'м. Пионерская'),
    (30.263875, 60.020973, 'Парашютная/Долгоозерная'),
    (30.275906, 60.011824, 'Сизова/Королева'),
    (30.272827, 60.006367, 'Сизова/Испытателей'),
]

dataset_dir = '../datasets'
# Monthly statistics for 2016-2020, then one chart per point and a yearly summary.
df = get_stat_for_points(points, range(2016,2021), range(1,13))
show_stat_month(df)
show_stat_year(df)