# A quick scan of the data from the fire brigade department

In [None]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt,atan2
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
import os
from pyproj import Proj
from scipy import stats

import sys
sys.path.append("../")
from FDAASimulationEngine import SimulationEngine
from utils import pre_process_station_name

# Function Utils

## Call the data sets

In [None]:
# global station_locations, response_time_parameters 
# response_time_parameters = {}

In [None]:
# Load the 2017 incident and deployment logs and the station address sheet.
# The paths are built with os.path.join instead of backslash literals:
# "\D", "\i" and "\k" are invalid string escapes, and a hard-coded backslash
# separator only resolves on Windows.
incidents = pd.read_csv(os.path.join('..', 'Data', 'incidenten_2017.csv'), sep=';', decimal=',')
deployments = pd.read_csv(os.path.join('..', 'Data', 'inzetten_2017.csv'), sep=';', decimal=',')
station_locations = pd.read_excel(os.path.join('..', 'Data', 'kazernepositie en voertuigen.xlsx'),
                                  sheet_name='adressen')
# Normalise the station names with the project helper (imported from utils).
station_locations['kazerne'] = station_locations['kazerne'].apply(pre_process_station_name)

# 1. General assessment of the datasets' quality

There are many incidents that appear only in the deployment dataset. These incidents are demonstrations, so it is probably better not to take them into account.

In [None]:
# Report the shape of both raw datasets and the smallest/largest value of
# their date columns.
print("the shape of the incidents dataset is {}".format(incidents.shape))
print("the shape of the deployment dataset is {}".format(deployments.shape))
print("the time span of the data set incidents is: {}".format((min(incidents['dim_datum_datum']),
                                                               max(incidents['dim_datum_datum']))))

# Single print call: the original wrapped this in print(print(...)), which
# emitted an extra "None" line after the message.
print("the time span of the data set deploy is: {}".format((min(deployments['inzet_gealarmeerd_datumtijd']),
                                                            max(deployments['inzet_gealarmeerd_datumtijd']))))

# Incident ids that occur in only one of the two datasets.
incidents_df_in = list(set(incidents['dim_incident_id'].unique()) - set(deployments['hub_incident_id'].unique()))
incidents_df_dep = list(set(deployments['hub_incident_id'].unique()) - set(incidents['dim_incident_id'].unique()))

print('the following id are only at incidents {}'.format(incidents_df_in))
# Only the number of deployment-only ids is printed here, not the ids themselves.
print('the following id are only at deploy {}'.format(len(incidents_df_dep)))

# 2. EDA of the response time components

## 2.1 Relevant time slots for the response time

In order to calculate the different components of the response time, we considered the following information:


(in) incident starts = dim_incident_start_datumtijd

(in) incident finish = dim_incident_eind_datumtijd

(dep) alarm is activated = inzet_gealarmeerd_datumtijd

(dep) leave = inzet_uitgerukt_datumtijd

(dep) arrive = inzet_terplaatse_datumtijd

In [None]:
# Create the project's simulation engine with verbose output switched off.
sim = SimulationEngine(verbose=False)
# Fit on the three raw datasets; presumably this also builds the merged log
# read on the next line -- TODO confirm against FDAASimulationEngine.
sim.fit_deployment_parameters(incidents, deployments, station_locations)
# Short alias for the merged log; the plotting cells below take it as input.
M = sim.merged_log

In [None]:
# Summary statistics of the merged log, transposed so that each described
# column becomes a row.
M.describe().T

data issues:

(a) Not all stations have a dispatch time.

(b) Some stations appear with different numbering, e.g. "aalsmeer vrijwillig 1" and "aalsmeer vrijwillig 2". We take only the first word of the station name in the deployment dataset.

(c) Negative time differences, e.g. the alarm was activated before the call was made.

(d) The station DRIEMOND does not have a dispatch time.

(e) Is the end of the incident defined as the moment the incident is resolved on scene, or the moment the trucks are back at the fire station?

(f) The on-scene deployment time depends on every deployed vehicle. Is it reasonable to calculate the on-scene deployment time using only the TS vehicles?



In [None]:
# These are the most frequently deployed vehicle groups
vehicles_type = ['TS', 'RV', 'WO', 'HV']
# Columns of the merged log that the distribution plots below iterate over.
# NOTE(review): the chained assignment also binds the module-level name
# `variable` to this same list -- confirm whether that alias is intended.
variables_to_plot = variable = ['turn out time (min)', 'travel time (min)', 
                                'response time (min)', 'Average Speed (Km/h)']

## 2.2 Distributions at level_1

Most combinations of car_type, fire_station, and priority_level do not have enough observations (n>20) to be statistically significant. We address this problem later with the level concept, where level 1 is the least aggregated and level 3 is the most aggregated. At level 3 we guarantee that n>20 for the most frequently used vehicles.

In [None]:
def PDF_grid_station_level_one(M, vehicles_type, variable):
    """
    Plot the kernel approximations and histograms of a variable at level 1.

    Only rows whose vehicle group is in ``vehicles_type`` and whose priority
    is below 3 are plotted. The grid has one row per station ('kazerne'),
    one column per vehicle group ('voertuig_groep') and one colour per
    priority ('dim_prioriteit_prio').

    Parameters
    ------------------------------
    M: dataframe with the merged information of the incidents, deployment, and locations datasets.
    vehicles_type: list with the vehicles to be considered
    variable: string with the variable (column of M) to be plotted

    Return
    -----------------------------
    None. The figure is saved in the '../figures' folder; any "/" in the
    variable name is removed from the file name.

    """
    # Work on a filtered copy under a new name instead of rebinding the parameter.
    selected = M[M['voertuig_groep'].isin(vehicles_type) & (M['dim_prioriteit_prio'] < 3)]
    g = sns.FacetGrid(selected, row="kazerne", col="voertuig_groep", hue="dim_prioriteit_prio")
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # so the figures stay identical.
    g = g.map(sns.distplot, variable, bins=100).add_legend()
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("../figures", exist_ok=True)
    g.savefig("../figures/PDF_detail_station (level_1)_{}.png".format(variable.replace("/", "")))



# for variable in variables_to_plot:
#     PDF_grid_station_level_one(M, vehicles_type, variable)
    

## 2.3 Distributions at level_2

Better distributions. There is a difference in the average speed when there are different priorities.


In [None]:
def PDF_grid_station_level_two(M, vehicles_type, variable):
    """
    Plot the kernel approximations and histograms of a variable at level 2.

    Only rows whose vehicle group is in ``vehicles_type`` and whose priority
    is below 3 are plotted. The grid has one column per vehicle group
    ('voertuig_groep') and one colour per priority ('dim_prioriteit_prio');
    stations are pooled together.

    Parameters
    ------------------------------
    M: dataframe with the merged information of the incidents, deployment, and locations datasets.
    vehicles_type: list with the vehicles to be considered
    variable: string with the variable (column of M) to be plotted

    Return
    -----------------------------
    None. The figure is saved in the '../figures' folder; any "/" in the
    variable name is removed from the file name.

    """
    # Work on a filtered copy under a new name instead of rebinding the parameter.
    selected = M[M['voertuig_groep'].isin(vehicles_type) & (M['dim_prioriteit_prio'] < 3)]
    g = sns.FacetGrid(selected, col="voertuig_groep", hue="dim_prioriteit_prio")
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # so the figures stay identical.
    g = g.map(sns.distplot, variable, bins=100).add_legend()
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("../figures", exist_ok=True)
    g.savefig("../figures/PDF_vehicles_type (level_2)_{}.png".format(variable.replace("/", "")))
    

# Save one level-2 figure for every entry of variables_to_plot.
# The loop variable `variable` remains bound at module level after the loop.
for variable in variables_to_plot:
    PDF_grid_station_level_two(M, vehicles_type, variable)

## 2.4 Distributions at level_3

Most aggregated distribution level. The turnout time takes the same values quite often, which is unusual for a continuous variable. It is probably worth investigating the cause of this behavior.

In [None]:
def PDF_grid_station_level_three(M, vehicles_type, variable):
    """
    Plot the kernel approximations and histograms of a variable at level 3.

    Only rows whose vehicle group is in ``vehicles_type`` and whose priority
    is below 3 are plotted. The grid has one column per vehicle group
    ('voertuig_groep'); stations and priorities are pooled together.

    Parameters
    ------------------------------
    M: dataframe with the merged information of the incidents, deployment, and locations datasets.
    vehicles_type: list with the vehicles to be considered
    variable: string with the variable (column of M) to be plotted

    Return
    -----------------------------
    None. The figure is saved in the '../figures' folder; any "/" in the
    variable name is removed from the file name.

    """
    # Work on a filtered copy under a new name instead of rebinding the parameter.
    selected = M[M['voertuig_groep'].isin(vehicles_type) & (M['dim_prioriteit_prio'] < 3)]
    g = sns.FacetGrid(selected, col="voertuig_groep")
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # so the figures stay identical.
    g = g.map(sns.distplot, variable, bins=100)
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("../figures", exist_ok=True)
    g.savefig("../figures/PDF_vehicles_type (level_3)_{}.png".format(variable.replace("/", "")))
    

# Save one level-3 figure for every entry of variables_to_plot.
# The loop variable `variable` remains bound at module level after the loop.
for variable in variables_to_plot:
    PDF_grid_station_level_three(M, vehicles_type, variable)


## 2.5 On scene distribution 

This variable has its own section because it is a property of the incident itself.

In [None]:
def plot_dist_on_scene_duration(M):
    """
    Plot the distribution of the on-scene duration per incident type.

    Draws ``sns.distplot`` of the 'on scene duration (min)' column on a grid
    with one panel per 'dim_incident_incident_type' value, wrapped after
    three columns.

    Parameters
    ------------------------------
    M: dataframe with the merged information of the incidents, deployment, and locations datasets.

    Return
    -----------------------------
    None. The figure is saved as '../figures/PDF_on_scene_duration.png'.

    """
    # Alternative kept from the original notebook: colour by priority with
    # hue="dim_prioriteit_prio" and col_wrap=4.
    g = sns.FacetGrid(M, col="dim_incident_incident_type", col_wrap=3)
    g = g.map(sns.distplot, 'on scene duration (min)', bins=100).add_legend()
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("../figures", exist_ok=True)
    # The file name is fixed. The original called .format() on it with the
    # global `variable`, which filled no placeholder and raised an error
    # whenever that global was not a string.
    g.savefig("../figures/PDF_on_scene_duration.png")

# Draw and save the on-scene duration figure for the merged log.
plot_dist_on_scene_duration(M)