# Imports

In [1]:
import sys
import pandas as pd
import warnings

sys.path.append('../src')
warnings.filterwarnings('ignore')

In [2]:
from data_loading import YearLoader

# Weather conditions

Get the number of accident and the gravity of each accident given the atmospheric conditions

## Helpers

In [3]:
def atm_code_to_str(atm_code):
    """
    Function that convert an atmospheric code to its string value
    
    Parameters
    ----------
    atm_code : int
        The code of the atmospheric conditions
    
    Returns
    -------
    String
        The string value
    """
    
    dict_atm_code = {
        1 : "Normal",
        2 : "Light rainfall",
        3 : "Heavy rainfall",
        4 : "Snow - hail",
        5 : "Fog - smoke",
        6 : "Strong wind - storm",
        7 : "Dazzling weather",
        8 : "Covered weather",
        9 : "Other",
    }
    
    return dict_atm_code[atm_code]

In [4]:
def gravity_code_to_string(grav_code):
    """
    Function that convert a code of gravity to its string value
    
    Parameters
    ----------
    grav_code : int
        The code of the accident's gravity
    
    Returns
    -------
    String
        The string value
    """
    
    dict_gravity_code = {
        1 : "Unharmed",
        2 : "Killed",
        3 : "Injured and hospitalized",
        4 : "Slightly injured",
    }
    
    return dict_gravity_code[grav_code]

In [5]:
loader = YearLoader(2018)

In [6]:
charac = loader.get_dataframe('characteristics')
charac.head()

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep
0,201800000001,18,1,24,1505,1,1,4,1.0,1.0,5,route des Ansereuilles,M,5055737.0,294992.0,590
1,201800000002,18,2,12,1015,1,2,7,7.0,7.0,11,Place du général de Gaul,M,5052936.0,293151.0,590
2,201800000003,18,3,4,1135,1,2,3,1.0,7.0,477,Rue nationale,M,5051243.0,291714.0,590
3,201800000004,18,5,5,1735,1,2,1,7.0,3.0,52,30 rue Jules Guesde,M,5051974.0,289123.0,590
4,201800000005,18,6,26,1605,1,2,1,1.0,3.0,477,72 rue Victor Hugo,M,5051607.0,290605.0,590


In [7]:
charac["Num_Acc"].is_unique

True

## Number of accidents given the weather

In [8]:
def get_accident_per_weather(year):
    """
    Function that returns a Dataframe containing for each type 
    of weather the count and the percentage of accidents
    
    Parameters
    ----------
    year : int
        The year
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    # Load the dataset
    loader = YearLoader(year)
    charac = loader.get_dataframe('characteristics')
    charac = charac.dropna(subset=['atm'])
    charac['atm'] = charac['atm'].astype(int)
    
    # Extract the count of accidents
    df_accidents = charac["atm"].value_counts().sort_index().rename_axis('weather').reset_index(name='accidents')
    df_accidents["weather"] = df_accidents["weather"].apply(lambda x : atm_code_to_str(x))
    # Adding percentage
    count = df_accidents["accidents"].sum()
    df_accidents["perc"] = df_accidents["accidents"] / count * 100
    
    return df_accidents.set_index('weather')
    
def get_accident_per_weather_years(years):
    """
    Function that returns a Dataframe containing for each type 
    of weather the count and the percentage of accidents given
    multiple years
    
    Parameters
    ----------
    years : list(int)
        The list of the years
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    df_full = get_accident_per_weather(years[0])
    
    for year in years[1:]:
        df_year = get_accident_per_weather(year)
        df_full += df_year
    
    # Fixing percentages
    count = df_full["accidents"].sum()
    df_full["perc"] = df_full["accidents"]/count * 100
        
    return df_full

### For the year 2008

In [9]:
df_full = get_accident_per_weather(2018)
df_full.head()

Unnamed: 0_level_0,accidents,perc
weather,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,45824,79.310464
Light rainfall,6350,10.990342
Heavy rainfall,1333,2.307107
Snow - hail,358,0.619613
Fog - smoke,427,0.739036


### For the years between 2008 and 2018

In [10]:
df_full = get_accident_per_weather_years(list(range(2008, 2019)))
df_full.head()

Unnamed: 0_level_0,accidents,perc
weather,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,568065,80.630693
Light rainfall,74927,10.635079
Heavy rainfall,15767,2.237955
Snow - hail,4134,0.586777
Fog - smoke,4531,0.643127


## Gravity given weather

In [11]:
def grav_weather_year(year):
    """
    Function that returns a Dataframe containing 
    for a year the count of accidents and the 
    percentage given a gravity 
    
    Parameters
    ----------
    year : int
        The year
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    # Loading datasets
    loader = YearLoader(year)
    charac = loader.get_dataframe('characteristics')
    charac = charac.dropna(subset=['atm'])
    charac['atm'] = charac['atm'].astype(int)
    
    # Empty dataset for the results
    df_full = pd.DataFrame()
    
    for atm in range(1, 10):
        # Loop for each atm condition
        
        # Keeping only the number of the accidents given the atm code
        df_atm = charac[charac["atm"] == atm]
        list_accidents = df_atm["Num_Acc"]
        
        # Keeping the passengers of the previous accidents
        passengers = loader.get_dataframe("passengers")
        passengers = passengers[passengers["Num_Acc"].isin(list_accidents)]
        
        # Counting the gravity of the accidents
        df_res = passengers["grav"].value_counts().sort_index().rename_axis('grav').reset_index(name='counts')        
        
        # Adding percentage and atm code
        sum_acc = df_res["counts"].sum()
        df_res["perc"] = df_res["counts"] / sum_acc * 100 # Adding percentage
        df_res["atm"] = atm                                 # Add atm code
        
        # Append to full dataset
        df_full = df_full.append(df_res)
    
    # Codes to strings
    df_full["atm"] = df_full["atm"].apply(lambda x : atm_code_to_str(x))
    df_full["grav"] = df_full["grav"].apply(lambda x : gravity_code_to_string(x))
    
    # Set correct indexes
    df_full = df_full.set_index(["atm", "grav"])
    
    return df_full

In [12]:
def grav_weather_over_years(years):
    """
    Function that returns a Dataframe containing 
    for several years the count of accidents and 
    the percentage given a gravity 
    
    Parameters
    ----------
    years : list(int)
        The years
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    df_res = grav_weather_year(years[0])
    
    # Adding results
    for year in years[1:]:
        df_res = df_res + grav_weather_year(year)
    
    # Fixing percentages   
    # Get count of acc for each atm
    dict_count = df_res.groupby(["atm"]).sum(columns=["counts"]).to_dict()["counts"]
    # Reset index to access all fields
    df_res = df_res.reset_index()
    for i in range(len(df_res)):
        # Compute percentages
        df_res.ix[i, "perc"] = df_res.iloc[i]["counts"] * 100 / dict_count[df_res.iloc[i]["atm"]]
    
    return df_res.set_index(["atm", "grav"])

### For the year 2018

In [13]:
df_res = grav_weather_year(2018)
df_res

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,perc
atm,grav,Unnamed: 2_level_1,Unnamed: 3_level_1
Normal,Unharmed,43337,41.867048
Normal,Killed,2645,2.555284
Normal,Injured and hospitalized,17546,16.950855
Normal,Slightly injured,39983,38.626813
Light rainfall,Unharmed,5734,41.001073
Light rainfall,Killed,281,2.009296
Light rainfall,Injured and hospitalized,2188,15.645334
Light rainfall,Slightly injured,5782,41.344297
Heavy rainfall,Unharmed,1237,39.762134
Heavy rainfall,Killed,100,3.214401


### For the years between 2008 and 2018

In [14]:
df_res = grav_weather_over_years(list(range(2008, 2019)))
df_res

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,perc
atm,grav,Unnamed: 2_level_1,Unnamed: 3_level_1
Normal,Unharmed,521212,41.19784
Normal,Killed,32855,2.596938
Normal,Injured and hospitalized,257930,20.387403
Normal,Slightly injured,453147,35.81782
Light rainfall,Unharmed,65647,40.003047
Light rainfall,Killed,3753,2.28695
Light rainfall,Injured and hospitalized,30620,18.658786
Light rainfall,Slightly injured,64085,39.051217
Heavy rainfall,Unharmed,14039,39.364625
Heavy rainfall,Killed,1108,3.106774


# Roads conditions

In [15]:
loader = YearLoader(2018)
loader.get_datasets()

dict_keys(['characteristics', 'locations', 'passengers', 'vehicles'])

## Helpers

In [16]:
def surf_code_to_string(surf_code):
    """
    Function that convert the code of a 
    road surface into its string value
    
    Parameters
    ----------
    surf_code : int
        The code of the road's surface
    
    Returns
    -------
    String
        The string value
    """
        
    dict_surf_code = {
        0 : "Other",
        1 : "Normal",
        2 : "Wet",
        3 : "Puddles",
        4 : "Flooded",
        5 : "Snowy",
        6 : "Muddy",
        7 : "Icy",
        8 : "Greasy substance - oil",
        9 : "Other",
    }
    
    return dict_surf_code[surf_code]

## Number of accidents given road conditions

In [17]:
def get_accident_per_road_conditions(year):
    """
    Function that returns a Dataframe containing 
    for a specific year the count of accidents and
    the percentage for each road condition
    
    Parameters
    ----------
    year : int
        The year
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    loader = YearLoader(year)
    locations = loader.get_dataframe('locations')
    locations = locations.dropna(subset=['surf'])
    locations['surf'] = locations['surf'].astype(int)
    # 0 are considered as "Other"
    locations['surf'] = locations['surf'].apply(lambda x : 9 if x == 0 else x)
    
    df_accidents = locations["surf"].value_counts().sort_index().rename_axis('surface').reset_index(name='accidents')
    # Getting the code into string
    df_accidents["surface"] = df_accidents["surface"].apply(lambda x : surf_code_to_string(x))
    # Adding percentage
    count_accidents = df_accidents["accidents"].sum()
    df_accidents["perc"] = df_accidents["accidents"] / count_accidents * 100
    
    return df_accidents.set_index('surface')
    
def get_accident_per_road_conditions_years(years):
    """
    Function that returns a Dataframe containing 
    for a several years the count of accidents 
    for each road condition
    
    Parameters
    ----------
    years : list(int)
        The years
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    df_full = get_accident_per_road_conditions(years[0])
    
    for year in years[1:]:
        df_year = get_accident_per_road_conditions(year)
        df_full += df_year
    
    # Fixing percentages
    count = df_full["accidents"].sum()
    df_full["perc"] = df_full["accidents"]/count * 100
    
    return df_full

### For the year 2018

In [18]:
df_res = get_accident_per_road_conditions(2018)
df_res

Unnamed: 0_level_0,accidents,perc
surface,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,45726,79.743988
Wet,10247,17.870285
Puddles,90,0.156956
Flooded,53,0.09243
Snowy,158,0.275545
Muddy,28,0.048831
Icy,244,0.425524
Greasy substance - oil,125,0.217994
Other,670,1.168448


### For the years between 2008 and 2018

In [19]:
df_res = get_accident_per_road_conditions_years(list(range(2008, 2019)))
df_res

Unnamed: 0_level_0,accidents,perc
surface,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,544611,77.50528
Wet,121543,17.297161
Puddles,1111,0.15811
Flooded,410,0.058348
Snowy,2028,0.288611
Muddy,388,0.055217
Icy,4381,0.623474
Greasy substance - oil,1658,0.235955
Other,26546,3.777844


## Gravity of the accidents given the roads conditions

In [20]:
def grav_surface_year(year):
    """
    Function that returns a Dataframe containing 
    for a specific year the count of accidents 
    for each road condition and type of gravity
    
    Parameters
    ----------
    year : int
        The year
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
        
    # Loading datasets
    loader = YearLoader(year)
    locations = loader.get_dataframe('locations')
    locations = locations.dropna(subset=['surf'])
    locations['surf'] = locations['surf'].astype(int)
    # 0 are considered as "Other"
    locations['surf'] = locations['surf'].apply(lambda x : 9 if x == 0 else x)
    
    # Empty dataset for the results
    df_full = pd.DataFrame()
    
    for surf in range(1, 10):
        # Loop for each road surface condition
        
        # Keeping only the number of the accidents given the surf code
        df_surf = locations[locations["surf"] == surf]
        list_accidents = df_surf["Num_Acc"]
        
        # Keeping the passengers of the previous accidents
        passengers = loader.get_dataframe("passengers")
        passengers = passengers[passengers["Num_Acc"].isin(list_accidents)]
        
        # Counting the gravity of the accidents
        df_res = passengers["grav"].value_counts().sort_index().rename_axis('grav').reset_index(name='counts')        
        
        # Adding percentage and atm code
        sum_acc = df_res["counts"].sum()
        df_res["perc"] = df_res["counts"] / sum_acc * 100 # Adding percentage
        df_res["surf"] = surf                             # Add atm code
        
        # Append to full dataset
        df_full = df_full.append(df_res)
    
    # Codes to strings
    df_full["surf"] = df_full["surf"].apply(lambda x : surf_code_to_string(x))
    df_full["grav"] = df_full["grav"].apply(lambda x : gravity_code_to_string(x))
    
    # Set correct indexes
    df_full = df_full.set_index(["surf", "grav"])
    
    return df_full

In [21]:
def grav_surface_over_years(years):
    """
    Function that returns a Dataframe containing 
    for a several year the count of accidents 
    for each road condition and type of gravity
    
    Parameters
    ----------
    year : int
        The year
    
    Returns
    -------
    DataFrame
        The dataframe containing the number of accidents
    """
    
    df_res = grav_surface_year(years[0])
    
    # Adding results
    for year in years[1:]:
        df_res = df_res + grav_surface_year(year)
    
    ## Fixing percentages ##
    # Get count of acc for each surface
    dict_count = df_res.groupby(["surf"]).sum(columns=["counts"]).to_dict()["counts"]
    # Reset index to access all fields
    df_res = df_res.reset_index()
    
    for i in range(len(df_res)):
        # Compute percentages
        df_res.ix[i, "perc"] = df_res.iloc[i]["counts"] * 100 / dict_count[df_res.iloc[i]["surf"]]
    
    return df_res.set_index(["surf", "grav"])

### For year 2018

In [22]:
df_res = grav_surface_year(2018)
df_res

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,perc
surf,grav,Unnamed: 2_level_1,Unnamed: 3_level_1
Normal,Unharmed,43694,42.192373
Normal,Killed,2691,2.598519
Normal,Injured and hospitalized,17607,17.001902
Normal,Slightly injured,39567,38.207206
Wet,Unharmed,9063,39.819859
Wet,Killed,590,2.592267
Wet,Injured and hospitalized,3790,16.652021
Wet,Slightly injured,9317,40.935852
Puddles,Unharmed,106,49.074074
Puddles,Killed,4,1.851852


### For years between 2008 and 2018

In [23]:
df_res = grav_surface_over_years(list(range(2008, 2019)))
df_res

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,perc
surf,grav,Unnamed: 2_level_1,Unnamed: 3_level_1
Flooded,Injured and hospitalized,230.0,25.30253
Flooded,Killed,64.0,7.040704
Flooded,Slightly injured,286.0,31.463146
Flooded,Unharmed,329.0,36.193619
Greasy substance - oil,Injured and hospitalized,712.0,23.45191
Greasy substance - oil,Killed,83.0,2.73386
Greasy substance - oil,Slightly injured,1471.0,48.45191
Greasy substance - oil,Unharmed,770.0,25.362319
Icy,Injured and hospitalized,2747.0,29.477412
Icy,Killed,452.0,4.850306
