# A quick scan of the data from the fire brigade department

In [None]:
import pandas as pd
import numpy as np
from pyproj import Proj, transform
from math import radians, cos, sin, asin, sqrt,atan2
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

## Load the data sets

In [None]:
# Load the incidents, the deployments and the station locations.
# Raw strings keep the backslashes literal: sequences such as "\J", "\i" and "\k" are
# invalid escapes that recent Python versions warn about. The resulting paths are unchanged
# (they still use the Windows path separator).
df_in = pd.read_csv(r'data\JADS\incidenten_2017.csv', sep=';', decimal=',')
df_dep = pd.read_csv(r'data\JADS\inzetten_2017.csv', sep=';', decimal=',')
locations = pd.read_excel(r'data\JADS\kazernepositie en voertuigen.xlsx', sheet_name='adressen')

In [None]:
#(in) incident starts = dim_incident_start_datumtijd
#(in) incident finish = dim_incident_eind_datumtijd
#(dep) alarm is activated = inzet_gealarmeerd_datumtijd
#(dep) leave = inzet_uitgerukt_datumtijd
#(dep) arrive = inzet_terplaatse_datumtijd


In [None]:
# Display the station locations table that was read from the 'adressen' sheet.
locations

In [None]:
# Report the size and the covered time span of both data sets.
print("the shape of the incidents dataset is {}".format(df_in.shape))
print("the shape of the deployment dataset is {}".format(df_dep.shape))
print("the time span of the data set incidents is: {}".format((min(df_in['dim_datum_datum']), max(df_in['dim_datum_datum']))))
# Printed exactly once: wrapping this call in a second print() would also echo "None".
print("the time span of the data set deploy is: {}".format((min(df_dep['inzet_gealarmeerd_datumtijd']),
                                                            max(df_dep['inzet_gealarmeerd_datumtijd']))))

# Incident ids that occur in only one of the two data sets.
incidents_df_in = list(set(df_in['dim_incident_id'].unique()) - set(df_dep['hub_incident_id'].unique()))
incidents_df_dep = list(set(df_dep['hub_incident_id'].unique()) - set(df_in['dim_incident_id'].unique()))

print('the following id are only at incidents {}'.format(incidents_df_in))
# NOTE(review): for the deployments only the NUMBER of unmatched ids is printed, not the ids themselves.
print('the following id are only at deploy {}'.format(len(incidents_df_dep)))

In [None]:
# Summary of the incidents data: one row per described column, keeping only the
# mean / std / min / max statistics; rows with a missing statistic are dropped.
(
    df_in
    .describe()
    .transpose()
    .dropna()
    .loc[:, ['mean', 'std', 'min', 'max']]
)

In [None]:
# Summary of the deployments data: one row per described column, keeping only the
# mean / std / min / max statistics; rows with a missing statistic are dropped.
(
    df_dep
    .describe()
    .transpose()
    .dropna()
    .loc[:, ['mean', 'std', 'min', 'max']]
)


# EDA on the components of the response time

In [None]:
#(in) incident starts = dim_incident_start_datumtijd
#(in) incident finish = dim_incident_eind_datumtijd
#(dep) alarm is activated = inzet_gealarmeerd_datumtijd
#(dep) leave = inzet_uitgerukt_datumtijd
#(dep) arrive = inzet_terplaatse_datumtijd

In [None]:
def pre_process_data(df_in, df_dep, locations):
    """
    Merge incidents, deployments and station locations into one table and
    derive the distance, time and speed features of every deployment.

    Parameters:
    df_in: incidents dataFrame (needs the columns listed in keep_in)
    df_dep: deployments dataFrame (needs the columns listed in keep_dep)
    locations: stations dataFrame with at least 'kazerne', 'lon' and 'lat';
               it is not modified by this function

    Returns:
    dataFrame with one row per deployment that matched both an incident and
    a station, restricted to rows with 0 < average speed < 150 Km/h and a
    positive turn out time, and without missing or infinite values
    """

    inProj  = Proj("+init=EPSG:28992", preserve_units=True)
    outProj = Proj("+init=EPSG:4326")

    def projections(x, y, inProj, outProj):
        """
        Transform whole coordinate columns from the inProj coordinate system
        to the outProj one (gps); returns (longitude, latitude) arrays
        """
        # transform() accepts arrays, so one call converts every row at once
        longitd, latitud = transform(inProj, outProj,
                                     np.asarray(x, dtype=float), np.asarray(y, dtype=float))
        return longitd, latitud

    def haversine(lon1, lat1, lon2, lat2):
        """
        Calculate the great circle distance in kilometers between two points
        on the earth (specified in decimal degrees); works element-wise on arrays
        """
        # convert decimal degrees to radians
        lon1, lat1, lon2, lat2 = (np.radians(np.asarray(v, dtype=float))
                                  for v in (lon1, lat1, lon2, lat2))

        # haversine formula
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # rounding can push a marginally outside [0, 1], which would break sqrt(1 - a)
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        r = 6371 # Radius of earth in kilometers. Use 3956 for miles
        return c * r

    keep_in = ['dim_incident_id','st_x', 'st_y', 'dim_incident_incident_type', 'inc_dim_object_naam',
               'dim_incident_start_datumtijd', 'dim_incident_eind_datumtijd', 'dim_prioriteit_prio']

    time_stamps = ['dim_incident_start_datumtijd', 'dim_incident_eind_datumtijd', 'inzet_gealarmeerd_datumtijd',
                  'inzet_uitgerukt_datumtijd', 'inzet_terplaatse_datumtijd']

    keep_dep = ['hub_incident_id', 'inzet_uitgerukt_datumtijd', 'inzet_gealarmeerd_datumtijd','inzet_terplaatse_datumtijd', 'voertuig_groep',
            'kazerne_groep', 'inzet_kazerne_naam']

    df_in = df_in[keep_in]
    df_dep = df_dep[keep_dep]

    M = df_dep.merge(df_in, left_on='hub_incident_id', right_on='dim_incident_id', how = 'inner')
    # TODO: double check this merging process
    # Station names are matched case-insensitively. `.str.lower()` leaves missing names
    # as NaN instead of raising, and `assign` works on a copy so the caller's
    # `locations` frame keeps its original spelling.
    M['inzet_kazerne_naam'] = M['inzet_kazerne_naam'].str.lower()
    locations = locations.assign(kazerne=locations['kazerne'].str.lower())
    M = locations.merge(M, left_on='kazerne', right_on='inzet_kazerne_naam', how = 'inner')

    # Incident coordinates -> gps, then station-to-incident distance, both computed on
    # whole columns (this also works when the merge produced no rows).
    M['lon_in'], M['lat_in'] = projections(M['st_x'], M['st_y'], inProj, outProj)
    M['haversine_distance (Km)'] = haversine(M['lon'], M['lat'], M['lon_in'], M['lat_in'])

    for date in time_stamps:
        M[date] = pd.to_datetime(M[date])

    def seconds_between(start, end):
        # .dt.total_seconds() yields plain float seconds on every pandas version, whereas
        # .astype('timedelta64[s]') keeps a timedelta dtype on pandas >= 2 and then
        # breaks the numeric comparisons and the speed division below.
        return (M[end] - M[start]).dt.total_seconds()

    M['turn out time (seconds)'] = seconds_between('inzet_gealarmeerd_datumtijd', 'inzet_uitgerukt_datumtijd')
    M['travel time (seconds)'] = seconds_between('inzet_uitgerukt_datumtijd', 'inzet_terplaatse_datumtijd')
    if 'dispatch (seconds)' not in M.columns:
        # The response time needs a dispatch component that no input is required to provide.
        # NOTE(review): assumed to be the delay between the start of the incident and the
        # alarm of the vehicle -- confirm this definition with the data owner.
        M['dispatch (seconds)'] = seconds_between('dim_incident_start_datumtijd', 'inzet_gealarmeerd_datumtijd')
    M['response time (seconds)'] = M['turn out time (seconds)'] + M['travel time (seconds)'] + M['dispatch (seconds)']
    M['Average Speed (Km/h)'] = M['haversine_distance (Km)']/(M['travel time (seconds)']/(60*60))

    # Discard implausible records: non-positive or extreme speeds and non-positive turn out times
    M = M[(M['Average Speed (Km/h)']>0) & (M['Average Speed (Km/h)']<150)]
    M = M[M['turn out time (seconds)']>0]

    return M.replace([np.inf, -np.inf], np.nan).dropna()

# Build the merged and cleaned table, then show it transposed.
M = pre_process_data(df_in=df_in, df_dep=df_dep, locations=locations)
M.transpose()

data issues:
(a) not all the stations have a dispatch time
(b) not all the stations (kazerne) are in the deploy dataset (inzet_kazerne_naam): ['anton', 'dirk', 'hendrik', 'ijsbrand', 'nico', 'osdorp', 'pieter', 'teunis', 'victor', 'willem', 'zebra']
(c) negative time differences, e.g. the alarm was activated before the call was made



In [None]:
# Summary statistics of the merged table, one row per described column.
# Here, there are a couple of cases where the x and y coordinates are 0
M.describe().transpose()

In [None]:
# General statistics per station and priority (mean, std and count of every measure);
# this information can be used as an input to the simulation model
M.groupby(['kazerne', 'dim_prioriteit_prio'], as_index=False).agg(
    {
        measure: ['mean', 'std', 'count']
        for measure in (
            'haversine_distance (Km)',
            'turn out time (seconds)',
            'travel time (seconds)',
            'response time (seconds)',
            'Average Speed (Km/h)',
        )
    }
)

In [None]:
def plot_dist(df, var, kazerne, norm_val, alpha):
    """
    Function to plot the time and speed distributions per station

    One figure is drawn for the given station, with a histogram and density
    estimate of `var` for priority 1 and for priority 2. Each priority is
    tested for normality with scipy.stats.normaltest, the outcome is added
    to `norm_val`, and the figure is saved in the "figures" folder.

    Parameters:
    df: dataFrame with at least the columns 'kazerne', 'dim_prioriteit_prio' and `var`
    var: name of the column to be plotted
    kazerne: station name; only rows with this value of 'kazerne' are used
    norm_val: dataFrame collecting the normality test results of earlier calls
    alpha: significance level; p < alpha is labelled 'N-N', otherwise 'N'

    Returns:
    norm_val extended with one row [var, kazerne, priority, p, label] per
    priority; priorities with too few rows to be tested get p = NaN and
    the label 'NA' and are not plotted
    """
    import os  # local import: only used to make sure the output folder exists

    # stats.normaltest relies on the skewness test, which needs at least 8 observations
    min_samples = 8

    df = df[(df['kazerne']==kazerne)]
    priority_color = {1:'#d73027', 2:'#91bfdb'}

    fig, ax = plt.subplots(figsize=(14,10))
    results = []

    for dim_prioriteit_prio in [1,2]:
        df_temp = df[df['dim_prioriteit_prio']==dim_prioriteit_prio]
        if len(df_temp) < min_samples:
            # too little data for this station/priority: record it and move on instead of failing
            results.append([var, kazerne, dim_prioriteit_prio, np.nan, 'NA'])
            continue
        # From https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html
        k2, p = stats.normaltest(df_temp[var])
        # decided per priority, so the verdict of priority 1 never carries over to priority 2
        norm = 'N-N' if p < alpha else 'N'
        results.append([var, kazerne, dim_prioriteit_prio, p, norm])
        sns.distplot(df_temp[var], bins=100, color=priority_color[dim_prioriteit_prio],
                     label = 'Priority {} ({})'.format(dim_prioriteit_prio, norm), ax=ax)

    # DataFrame.append no longer exists in pandas >= 2; pd.concat works on old and new versions.
    # The new rows use the integer column labels 0..4, so the caller can name the columns afterwards.
    new_rows = pd.DataFrame(results)
    if norm_val.empty and len(norm_val.columns) == 0:
        norm_val = new_rows
    else:
        norm_val = pd.concat([norm_val, new_rows], ignore_index=True)

    ax.set(xlabel=var, ylabel='',
           title = 'Histogram and GKD of {} in the station: {}'.format(var, kazerne))
    fig.legend(loc='right')

    os.makedirs("figures", exist_ok=True)
    # "/" is stripped from the variable name because it is not valid inside a file name
    fig.savefig("figures/hist_ {}_{}.png".format(var.replace("/", ""), kazerne))

    return norm_val
    
# Columns of M that are plotted and tested for normality for every station.
var = [
    'haversine_distance (Km)',
    'turn out time (seconds)',
    'response time (seconds)',
    'Average Speed (Km/h)',
]

# Significance level handed to plot_dist, and the table collecting its results.
alpha = 1e-3
norm_val = pd.DataFrame()

# Every station is combined with every variable, stations in the outer position.
for kazerne, v in ((station, column) for station in M['kazerne'].unique() for column in var):
    norm_val = plot_dist(M, v, kazerne, norm_val, alpha)

norm_val.columns = ['var', 'kazerne', 'dim_prioriteit_prio', 'p', 'Normal']
