# A quick scan of the data from the fire brigade department

In [None]:
import pandas as pd
import numpy as np
from pyproj import Proj, transform
from math import radians, cos, sin, asin, sqrt,atan2
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os

# Function Utils

In [None]:
def projections(x, y, inProj, outProj):
    """
    Re-project coordinates from one coordinate system to another.

    Thin wrapper around ``pyproj.transform``: ``x`` and ``y`` are expressed in
    ``inProj`` and the result is expressed in ``outProj``. In this notebook it
    is used to turn the incident coordinates into GPS coordinates.

    Returns
    -------
    The pair of transformed coordinates, in the order ``transform`` returns
    them (the callers treat them as longitude, latitude).
    """
    converted = transform(inProj, outProj, x, y)
    lon_out, lat_out = converted
    return lon_out, lat_out

def haversine(lon1, lat1, lon2, lat2, r=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    Params
    -------------------------------------------
    lon1, lat1: longitude and latitude of the first point, in decimal degrees
    lon2, lat2: longitude and latitude of the second point, in decimal degrees
    r: radius of the sphere; the result is in the same unit.
       Default 6371 (kilometers). Use 3956 for miles.

    Returns:
    the distance along the great circle, in the unit of r
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt(1 - a) raise a ValueError.
    # Plain comparisons are used so that a NaN input still yields NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return c * r

def pre_process_station_name(x):
    """
    Standardize a station name: lower-case it and keep only its first
    whitespace-separated word.

    This gives the different data sets a common station key so that they
    can be merged later.
    """
    first_word = x.lower().split()[0]
    return first_word

## Call the data sets

In [None]:
# Names shared by the functions below: station_locations (assigned when the
# data is loaded) and response_time_parameters (filled by
# create_global_time_parameters).
# NOTE(review): a `global` statement at module level has no effect; the names
# are module-level simply because they are assigned here.
global station_locations, response_time_parameters 
# Maps an aggregation level name ('level_1', ...) to its table of parameters.
response_time_parameters = {}

In [None]:
# Build the paths with os.path.join: the original literals relied on
# backslashes inside non-raw strings ("\J", "\i", "\k" are invalid escape
# sequences) and only resolved on Windows.
data_dir = os.path.join('data', 'JADS')

# Incidents and deployments are ';'-separated files with ',' as decimal mark.
df_in = pd.read_csv(os.path.join(data_dir, 'incidenten_2017.csv'), sep=';', decimal=',')
df_dep = pd.read_csv(os.path.join(data_dir, 'inzetten_2017.csv'), sep=';', decimal=',')

# Station data comes from the 'adressen' sheet; the station name is
# standardized so that it can be matched against the deployment data.
station_locations = pd.read_excel(os.path.join(data_dir, 'kazernepositie en voertuigen.xlsx'), sheet_name='adressen')
station_locations['kazerne'] = station_locations['kazerne'].apply(pre_process_station_name)

# 1. General assessment of the data sets' quality

Many incidents appear only in the deployment dataset. These incidents are demonstrations, so it is probably better not to take them into account.

In [None]:
# Size and time span of both data sets.
print("the shape of the incidents dataset is {}".format(df_in.shape))
print("the shape of the deployment dataset is {}".format(df_dep.shape))
print("the time span of the data set incidents is: {}".format((min(df_in['dim_datum_datum']), max(df_in['dim_datum_datum']))))
# A single print: the original wrapped this call in a second print(), which
# also wrote a stray "None" line.
print("the time span of the data set deploy is: {}".format((min(df_dep['inzet_gealarmeerd_datumtijd']),
                                                            max(df_dep['inzet_gealarmeerd_datumtijd']))))

# Incident ids that appear in only one of the two data sets.
ids_in = set(df_in['dim_incident_id'].unique())
ids_dep = set(df_dep['hub_incident_id'].unique())
incidents_df_in = list(ids_in - ids_dep)
incidents_df_dep = list(ids_dep - ids_in)

# The ids missing from the deployments are listed; for the other direction
# only the number of ids is reported.
print('the following id are only at incidents {}'.format(incidents_df_in))
print('the following id are only at deploy {}'.format(len(incidents_df_dep)))

In [None]:
# print('General statistics of the incident dataset')
# print(df_in.describe().T.dropna()[['mean', 'std', 'min', 'max']])
# print("\n generak statistics of the deployment dataset: \n")
# print(df_dep.describe().T.dropna()[['mean', 'std', 'min', 'max']])

In [None]:
# Show the deployment data transposed, i.e. one row per column of df_dep.
df_dep.T

# 2. EDA of the response time components

## 2.1 Relevant time slots for the response time

In order to calculate the different components of the response time, we considered the following information:


(in) incident starts = dim_incident_start_datumtijd

(in) incident finish = dim_incident_eind_datumtijd

(dep) alarm is activated = inzet_gealarmeerd_datumtijd

(dep) leave = inzet_uitgerukt_datumtijd

(dep) arrive = inzet_terplaatse_datumtijd

In [None]:
def pre_process_data(df_in, df_dep, station_locations):
    """
    In this function we do all pre-processing necessary to calculate the different components of the response time.

    Note: the travel time depends on the speed and the incident location. Because the incident location is a
    random variable, we only consider the speed parameters.

    Params
    -------------------------------------------
    df_in: data frame with the incident data
    df_dep: data frame with the deployment data
    station_locations: data frame with the fire station data. It must provide the columns 'kazerne', 'lon', 'lat'
    and 'dispatch (seconds)'.
    NOTE(review): 'dispatch (seconds)' is read but never computed in this function, so it is presumably a
    per-station value of the station sheet -- confirm.

    Returns:
    M: dataframe where every row includes all the relevant information of every deployment needed to calculate the response
    time. Rows with a non-positive turn out time, with an average speed outside (0, 150) Km/h or with any missing
    value are removed.

    Raises:
    KeyError when no 'dispatch (seconds)' column is available after merging the station data.
    """

    # Incident coordinates (st_x, st_y) are in EPSG:28992; station coordinates are compared as lon/lat (EPSG:4326)
    inProj  = Proj("+init=EPSG:28992", preserve_units=True)
    outProj = Proj("+init=EPSG:4326")

    keep_in = ['dim_incident_id','st_x', 'st_y', 'dim_incident_incident_type', 'inc_dim_object_naam',
               'dim_incident_start_datumtijd', 'dim_incident_eind_datumtijd', 'dim_prioriteit_prio']

    time_stamps = ['dim_incident_start_datumtijd', 'dim_incident_eind_datumtijd', 'inzet_gealarmeerd_datumtijd',
                  'inzet_uitgerukt_datumtijd', 'inzet_terplaatse_datumtijd']

    keep_dep = ['hub_incident_id', 'inzet_uitgerukt_datumtijd', 'inzet_gealarmeerd_datumtijd','inzet_terplaatse_datumtijd', 'voertuig_groep',
            'kazerne_groep', 'inzet_kazerne_naam']

    df_in = df_in[keep_in]
    df_dep = df_dep[keep_dep]

    # One row per deployment, enriched with the data of its incident
    M = df_dep.merge(df_in, left_on='hub_incident_id', right_on='dim_incident_id', how = 'inner')
    M['inzet_kazerne_naam'] = M['inzet_kazerne_naam'].apply(pre_process_station_name)

    # print(set(station_locations['kazerne'].unique()) - (set(station_locations['kazerne'].unique()) & set(M['inzet_kazerne_naam'].unique())))
    M = station_locations.merge(M, left_on='kazerne', right_on='inzet_kazerne_naam', how = 'inner')

    # Fail before the (slow, row by row) projection when the dispatch time is not available
    if 'dispatch (seconds)' not in M.columns:
        raise KeyError("'dispatch (seconds)' is required to compute the response time "
                       "but is missing from the merged data")

    M['lon_in'], M['lat_in'] = np.vectorize(projections)(M['st_x'], M['st_y'], inProj, outProj)
    M['haversine_distance (Km)'] = np.vectorize(haversine)(M['lon'], M['lat'], M['lon_in'], M['lat_in'])

    for date in time_stamps:
        M[date] = pd.to_datetime(M[date])

    # .dt.total_seconds() always yields float seconds; .astype('timedelta64[s]') keeps a timedelta dtype on
    # pandas >= 2, which breaks the arithmetic below
    M['turn out time (seconds)'] = (M['inzet_uitgerukt_datumtijd'] - M['inzet_gealarmeerd_datumtijd']).dt.total_seconds()
    M['travel time (seconds)'] = (M['inzet_terplaatse_datumtijd'] - M['inzet_uitgerukt_datumtijd']).dt.total_seconds()
    M['response time (seconds)'] = M['turn out time (seconds)'] + M['travel time (seconds)'] + M['dispatch (seconds)']
    # Straight-line distance over travel time (converted to hours)
    M['Average Speed (Km/h)'] = M['haversine_distance (Km)']/(M['travel time (seconds)']/(60*60))

    #Some filters to remove outliers
    M = M[(M['Average Speed (Km/h)']>0) & (M['Average Speed (Km/h)']<150)]
    M = M[M['turn out time (seconds)']>0]

    return M.replace([np.inf, -np.inf], np.nan).dropna()


# Build the merged per-deployment table from the three raw inputs.
M = pre_process_data(df_in, df_dep, station_locations)
# Show it transposed, i.e. one row per column of M.
M.T

Data issues:

(a) Not all the stations have a dispatch time.

(b) Some stations appear with different numbering, e.g. aalsmeer vrijwillig 1, aalsmeer vrijwillig 2. We take only the first word of the station name in the deployment dataset.

(c) Negative time differences, e.g. the alarm was activated before the call was made.

(d) The station DRIEMOND does not have a dispatch time.



In [None]:
# Summary statistics of M, transposed (one row per column of M).
# Observation from the exploration: there are a couple of cases where the
# x and y coordinates are 0.
M.describe().T

In [None]:
def plot_dist(df, variable, vehicle_type, kazerne, norm_val, alpha):
    """
    Funtion to plot the time and speed distributions per station and vehicle type.

    For priority 1 and priority 2 incidents the distribution of `variable` is drawn and
    tested for normality with scipy.stats.normaltest. The figure is saved under
    figures/<vehicle_type>/.

    Note: a later definition in this file reuses the name plot_dist with a different
    signature and replaces this one once it has been executed.

    Parameters:
    ------------------------------------------------
    df: dataFrame
    variable: the variables to be plotted from M
    vehicle_type: the vehicle type of interest
    kazerne: name of the station
    norm_val: dataframe to store the results
    alpha: threshold to define normality

    Return
    ---------------------------------------------------
    norm_val with one extra row per priority: [variable, kazerne, priority, p, verdict], where verdict is
    'N' (normality not rejected), 'N-N' (rejected, p < alpha) or 'Null' (the test could not be computed)
    """

    df = df[(df['kazerne']==kazerne) & (df['voertuig_groep'] == vehicle_type)]
    priority_color = {1:'#d73027', 2:'#91bfdb'}

    fig, ax = plt.subplots(figsize=(14,10))

    for dim_prioriteit_prio in [1,2]:
        df_temp = df[df['dim_prioriteit_prio']==dim_prioriteit_prio]

        # The verdict is reset for every priority so that the result of priority 1
        # does not leak into priority 2
        norm = 'N'

        # From https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html
        try:
            _, p = stats.normaltest(df_temp[variable])
        except ValueError:
            # raised when there are too few observations for the test
            p = np.nan

        if np.isnan(p):
            # no usable p-value: neither verdict can be supported
            norm = 'Null'
        elif p < alpha:
            norm = 'N-N'

        # DataFrame.append was removed in pandas 2.0; pd.concat gives the same positional columns 0..4
        row = pd.DataFrame([[variable, kazerne, dim_prioriteit_prio, p, norm]])
        norm_val = pd.concat([norm_val, row], ignore_index=True)
        ax = sns.distplot(df_temp[variable], bins=100, color=priority_color[dim_prioriteit_prio] ,
                          label = 'Priority {} ({})'.format(dim_prioriteit_prio, norm))

    ax.set(xlabel=variable, ylabel='',
           title = 'Histogram and GKD of {} in the station: {} ({})'.format(variable, kazerne, vehicle_type))
    fig.legend(loc='right')

    # The original tested an undefined name (`directory`) here, which raised a NameError
    path = 'figures/' + vehicle_type + '/'
    os.makedirs(path, exist_ok=True)

    # "/" is removed from the variable name because it is not valid inside a file name
    fig.savefig(path + "hist_ {}_{}_{}.png".format(variable.replace("/", ""), kazerne, vehicle_type))

    return norm_val
    
# var = ['response time (seconds)', 'turn out time (seconds)', 'Average Speed (Km/h)']

# norm_val = pd.DataFrame()
# alpha = 1e-3
# for kazerne in M['kazerne'].unique():
#     for v in var:
#         for v_t in M['voertuig_groep'].unique():
#             norm_val = plot_dist(M, v, v_t, kazerne, norm_val, alpha)
        
# norm_val.columns = ['var', 'kazerne', 'dim_prioriteit_prio', 'p', 'Normal']


In [None]:
def plot_dist(df, variable, fire_station, norm_val, alpha):
    """
    Funtion to plot the time and speed distributions per station

    For priority 1 and priority 2 incidents the distribution of `variable` is drawn and
    tested for normality with scipy.stats.normaltest. The figure is saved under figures/.

    Parameters:
    ------------------------------------------------
    df: dataFrame
    variable: the variable to be plotted from M
    fire_station: name of the station
    norm_val: dataframe to store the results
    alpha: threshold to define normality

    Return
    ---------------------------------------------------
    norm_val with one extra row per priority: [variable, fire_station, priority, p, verdict], where verdict is
    'N' (normality not rejected), 'N-N' (rejected, p < alpha) or 'Null' (the test could not be computed)
    """

    df = df[(df['kazerne'] == fire_station)]
    priority_color = {1:'#d73027', 2:'#91bfdb'}

    fig, ax = plt.subplots(figsize=(14,10))

    for priority in [1,2]:
        df_temp = df[df['dim_prioriteit_prio']==priority]

        # The verdict is reset for every priority so that the result of priority 1
        # does not leak into priority 2
        norm = 'N'

        try:
            _, p = stats.normaltest(df_temp[variable])
        except ValueError:
            # raised when there are too few observations for the test
            p = np.nan

        if np.isnan(p):
            # no usable p-value: neither verdict can be supported
            norm = 'Null'
        elif p < alpha:
            norm = 'N-N'

        # DataFrame.append was removed in pandas 2.0; pd.concat gives the same positional columns 0..4
        row = pd.DataFrame([[variable, fire_station, priority, p, norm]])
        norm_val = pd.concat([norm_val, row], ignore_index=True)
        ax = sns.distplot(df_temp[variable], bins=100, color=priority_color[priority] ,
                          label = 'Priority {} ({})'.format(priority, norm))

    # The label is the plotted variable (the original read an undefined global `var`)
    ax.set(xlabel=variable, ylabel='',
           title = 'Histogram and GKD of {} in the station: {}'.format(variable, fire_station))
    fig.legend(loc='right')

    # Make sure the output folder exists; the file is named after the station argument
    # (the original used the undefined name `kazerne`)
    os.makedirs('figures', exist_ok=True)
    fig.savefig("figures/hist_ {}_{}.png".format(variable.replace("/", ""), fire_station))

    return norm_val
    
# Variables whose distribution is plotted and tested for every station
variable = ['haversine_distance (Km)', 'turn out time (seconds)',
       'response time (seconds)', 'Average Speed (Km/h)']

norm_val = pd.DataFrame()
alpha = 1e-3
for fire_station in M['kazerne'].unique():
    # Iterate over the list defined above; the original looped over `var`,
    # a name that is not defined anywhere in the executed code
    for v in variable:
        norm_val = plot_dist(M, v, fire_station, norm_val, alpha)

# plot_dist adds rows with positional columns; give them meaningful names
norm_val.columns = ['var', 'kazerne', 'dim_prioriteit_prio', 'p', 'Normal']

# 3. Simulation functions

These functions will later be included in the main class.

In [None]:
# General statistics; this information can be used as an input to the simulation model

def create_global_time_parameters(M):
    """
    Fill the global dictionary response_time_parameters with the mean, the standard
    deviation and the number of observations of every time dependent variable.

    Params:
    ------------------------------------------
    M: see funtion pre_process_data

    Return
    -----------------------------------------
    Nothing. The global dictionary response_time_parameters receives three tables:
    'level_1' (per station, priority and vehicle type; the least aggregated one),
    'level_2' (per priority and vehicle type) and 'level_3' (per vehicle type).
    Groups with a missing statistic are dropped from every table.
    """
    statistics = ['mean', 'std', 'count']
    measures = ['dispatch (seconds)', 'turn out time (seconds)',
                'response time (seconds)', 'Average Speed (Km/h)']
    group_keys = {
        'level_1': ['kazerne', 'dim_prioriteit_prio', 'voertuig_groep'],
        'level_2': ['dim_prioriteit_prio', 'voertuig_groep'],
        'level_3': ['voertuig_groep'],
    }

    # Only priority one and two incidents are taken into account
    selected = M[M['dim_prioriteit_prio'] < 3]

    for level, keys in group_keys.items():
        aggregation = {measure: list(statistics) for measure in measures}
        summary = selected.groupby(keys, as_index=False).agg(aggregation)
        response_time_parameters[level] = summary.dropna()
    

# Fill the module-level response_time_parameters dictionary from M.
create_global_time_parameters(M)

In [None]:
def check_statistical_significance(df, n=20):
    """
    check if the filtered subset is statistical significant

    The decision is based on the number of observations ('count') of the
    'turn out time (seconds)' variable in the first row of df.

    Parameters
    ----------------------------------------
    df: dataFrame with the parameter of the filter made at the funtion get_parameters
    n: minimun number of observations to consider the parameters as significant

    Return
    -----------------------------------------
    True if the parameters are statistically significant, False otherwise
    (including when df has no rows)
    """
    counts = df['turn out time (seconds)']['count'].values

    if counts.size == 0:
        print('Empty dataFrame')
        return False

    if counts[0] < n:
        # Only reported when the subset really is too small; the original
        # printed this message for every non-empty subset
        print('No enougth observations, the minumin is n={}'.format(n))
        return False

    return True
            
def get_parameters(station, vehicle, incident_priority):
    """
    Look up the mean and standard deviation of the time distributions for the
    requested station, vehicle type and incident priority.

    Parameters
    ----------------------------------------------
    See the response_time_simulation parameters

    Return
    --------------------------------------------
    The rows of the least aggregated table of response_time_parameters that
    pass check_statistical_significance. The level_3 rows (vehicle type only)
    are returned without that check when neither level_1 nor level_2 passes.
    """

    # Level one: station + priority + vehicle type
    per_station = response_time_parameters['level_1']
    selection = per_station[(per_station['kazerne'] == station) &
                            (per_station['dim_prioriteit_prio'] == incident_priority) &
                            (per_station['voertuig_groep'] == vehicle)]
    if check_statistical_significance(selection):
        print('getting the parameters from level_1 of the vehicle {}'.format(vehicle))
        return selection

    # Level two: priority + vehicle type
    per_priority = response_time_parameters['level_2']
    selection = per_priority[(per_priority['dim_prioriteit_prio'] == incident_priority) &
                             (per_priority['voertuig_groep'] == vehicle)]
    if check_statistical_significance(selection):
        print('getting the parameters from level_2 of the vehicle {}'.format(vehicle))
        return selection

    # Level three: vehicle type only, used as the last resort
    per_vehicle = response_time_parameters['level_3']
    selection = per_vehicle[per_vehicle['voertuig_groep'] == vehicle]
    print('getting the parameters from level_3 of the vehicle {}'.format(vehicle))

    return selection
    


def get_simulated_values(station, vehicle, incident_priority, incident_location):
    """
    Function to calculate the simulated value

    Every variable of response_time_parameters is drawn from a normal
    distribution (floored at 0.001). The travel time is then derived from the
    simulated speed and the straight-line distance between the station and the
    incident, and the response time is rebuilt as
    dispatch + turn out + travel time.

    Paramters:
    ------------------------------------------
    see response_time_simulation

    Return
    ----------------------------------------
    x: dict of plain floats keyed by variable name, assuming normal ditsribution.
       None when no parameters are available for the request.

    Raises
    ----------------------------------------
    ValueError when the station is not present in station_locations
    """
    parameters = get_parameters(station, vehicle, incident_priority)
    # An empty selection (e.g. unknown vehicle type) cannot be sampled from
    if parameters is None or len(parameters) == 0:
        return None

    x = {}
    # The first three top-level columns are the grouping keys; the rest are the variables
    for var in list(response_time_parameters['level_1'].columns.get_level_values(level=0).unique()[3::]):
        temp_par = parameters[var]
        mean = float(temp_par['mean'].iloc[0])
        std = float(temp_par['std'].iloc[0])
        # Scalars in, scalar out: the original stored a 1-element array or a float
        # depending on which side of the floor the sample fell
        x[var] = max(float(np.random.normal(mean, std)), 0.001)

    station_row = station_locations.loc[station_locations['kazerne'] == station, ['lon', 'lat']]
    if station_row.empty:
        raise ValueError('unknown station: {}'.format(station))
    # The first match is used if the standardized name is present more than once
    station_lon = float(station_row['lon'].iloc[0])
    station_lat = float(station_row['lat'].iloc[0])

    # incident_location is [latitude, longitude]; haversine expects longitude first
    distance_km = haversine(incident_location[1], incident_location[0], station_lon, station_lat)

    # hours = km / (km/h); written this way a zero distance gives a zero travel time
    x['travel time (sec)'] = distance_km / x['Average Speed (Km/h)'] * 60 * 60
    # The sampled response time is replaced by the sum of the simulated components
    x['response time (seconds)'] = x['dispatch (seconds)'] + x['turn out time (seconds)'] + x['travel time (sec)']
    return x
    
    
def response_time_simulation(station, incident_priority, incident_location, vehicle_type):
    """
    Simulate the response time of every vehicle that is deployed from a station.
    The time distributions are assumed to be normal.

    Parameters
    --------------------------------------------
    station: The station that is going to be used to deploy the vehicles
    incident_priority: is the priority of the incident (dim_prioriteit_prio)
    incident_location: [latitud, longitud] latitud and longitud of the incident
    vehicle_type: [] list with the vehicles that will to be deployed

    Return
    --------------------------------------------
    response_time = (dict) one entry per deployed vehicle, keyed '<vehicle type>_<index>', holding
    the result of get_simulated_values for that vehicle
    """
    # Unique vehicle types (sorted by np.unique) and how many of each are deployed
    kinds, amounts = np.unique(vehicle_type, return_counts=True)

    return {
        kind + '_' + str(number): get_simulated_values(station, kind, incident_priority, incident_location)
        for kind, amount in zip(kinds, amounts)
        for number in range(amount)
    }


In [None]:
# Smoke test of the response time simulation
station = 'aalsmeer'
incident_priority = 2
# [latitude, longitude], as documented in response_time_simulation
incident_location = [52.2538, 4.76889]
# One simulation is run per list entry, so 'TS' is simulated twice
vehicle_type = ['TS', 'Middelen', 'TS', 'Overig']
response_time = response_time_simulation(station, incident_priority, incident_location, vehicle_type)