# Healthy Streets of Los Angeles Injuries/Deaths by streets and intersections data project
This project identifies the streets and street intersections of Los Angeles with the highest number of car accidents that resulted in deaths or injuries.

Sources:
* Injuries/Deaths in the City of LA https://tims.berkeley.edu/

Assumptions:
* Location only City of Los Angeles
* Available data covers 2012-2022; note that, according to the source, the data for 2021-2022 is provisional and subject to change
* Excluding freeways (STATE_ROUTE is Null)
* Matching Primary street and Secondary streets if at intersection
* An accident is counted towards an intersection if it happened no more than 15 feet away from it (the `DISTANCE <= 15` filter used in the code below)

Output:
* Top 500 streets by injuries/deaths with consolidated streets for last 5/10yrs/all time
* Top 100 intersections by injuries/deaths for last 5/10yrs<br>
"last n years" means the n most recent full calendar years: the previous year is included, the current one is not


Any questions - elena.sunchugasheva@gmail.com

In [None]:
# sizes of the final rankings:
# top_n_streets - how many streets are taken from each (period, outcome) ranking
# top_n_x - how many intersections are taken from each (period, outcome) ranking
top_n_streets = 500
top_n_x = 100

In [None]:
import pandas as pd
import datetime

# raise the display limits so wide/long frames are shown without truncation
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 1000)

# data

In [None]:
# timestamp of this run; its date is used to stamp the output CSV file names
today = datetime.datetime.now()
print(today)

### Data preparation

In [None]:
# load the raw collision export; Crashes.csv is expected in the working directory
crashes_raw = pd.read_csv('Crashes.csv')
display(crashes_raw.head(1))

In [None]:
# columns we are interested in; every other column of the raw export is
# dropped when the `crashes` frame is built from this list
crashes_col = [
    'CASE_ID', 'COUNTY', 'CITY',
    'ACCIDENT_YEAR', 'COLLISION_DATE',
    'PRIMARY_RD', 'SECONDARY_RD', 'POINT_X', 'POINT_Y',
    'INTERSECTION', 'DISTANCE', 
    'COLLISION_SEVERITY', 'NUMBER_KILLED', 'NUMBER_INJURED', 'PARTY_COUNT',
    'PED_ACTION', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT',
    'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ',
    'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
    'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED'
]

In [None]:
# keep rows for the City of LA that have no state route set (i.e. not highways),
# restricted to the columns of interest
in_city = crashes_raw.CITY == 'LOS ANGELES'
off_state_route = crashes_raw.STATE_ROUTE.isnull()
crashes = crashes_raw.loc[in_city & off_state_route, crashes_col].copy()
display(crashes.head(1))

take a look at stats/outliers

In [None]:
# how many crashes are flagged as at / not at an intersection
n_at_intersection = len(crashes[crashes.INTERSECTION == 'Y'])
n_elsewhere = len(crashes[crashes.INTERSECTION == 'N'])
print(
    'intersections:', n_at_intersection,
    ', non-intersections:', n_elsewhere,
    ', total:', len(crashes)
)

In [None]:
# summary statistics of the killed/injured counters
counter_cols = [
    'NUMBER_KILLED', 'NUMBER_INJURED',
    'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
    'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED',
]
crashes[counter_cols].describe()

In [None]:
# crashes with 20 or more injured, to eyeball possible outliers
crashes.loc[crashes.NUMBER_INJURED >= 20]

# stats

## functions

In [None]:
def join_streets(col1, col2):
    '''
    Build an order-independent intersection name from two street names.

    Both values are converted to strings and joined with '/', the
    alphabetically smaller one first, so (A, B) and (B, A) give the same name.
    '''
    first, second = str(col1), str(col2)
    if second < first:
        first, second = second, first
    return f'{first}/{second}'

In [None]:
def summarize(df, summ_column, time_period=None, today=None):
    '''
    Sum the crash counters of df for every value of summ_column.

    df - DataFrame with the structure of crashes_street: 'COUNTY', 'CITY',
        'ACCIDENT_YEAR', summ_column and the counter columns to add up
    summ_column - summarize on column (in addition to 'COUNTY', 'CITY')
    time_period = ('last5', 'last10', ...) - time filter; 'last<N>' keeps the
        N calendar years before today's year (the current year is excluded),
        None keeps every year
    today - date the time window is counted back from, defaults to the
        current date

    Returns one row per ('COUNTY', 'CITY', summ_column) with every counter
    column renamed to '<counter>_<time_period>', or '<counter>_all' when
    time_period is None.
    '''
    if today is None:
        # resolved on every call so a long-lived kernel never uses a stale date
        today = datetime.datetime.now()

    if time_period:
        # 'last5' -> 5: the number follows the 4-character 'last' prefix
        n_years = int(time_period[4:])
        df = df[df.ACCIDENT_YEAR.isin(range(today.year - n_years, today.year))]
        suffix = time_period
    else:
        # no time filter: label the totals as all-time
        suffix = 'all'

    keys = ['COUNTY', 'CITY', summ_column]
    # the year is only a filter, it must not be summed with the counters
    df_return = df.drop(columns='ACCIDENT_YEAR')\
        .groupby(by=keys).sum()\
        .reset_index(drop=False)

    df_return.columns = keys + [
        f'{col}_{suffix}' for col in df_return.columns[3:]
    ]

    return df_return

In [None]:
def get_top_n(df, top_n, summ_column, time_period, outcome):
    '''
    Return the top_n rows of df ranked by one outcome within a time period.

    df - DataFrame with summ_column plus counter columns named
        '<counter>_<time_period>' (as produced by summarize)
    top_n - number of rows to return
    summ_column - column identifying what is ranked (street, intersection)
    time_period = ('last5', 'last10') - which set of counter columns to use
    outcome = ('killed', 'injured') - outcome of the crash to rank by

    Rows are sorted in descending order by the 'NUMBER_*' column of the
    period that matches outcome; ties are broken by the alphabetically first
    remaining 'NUMBER_*' column of the period, if there is one.
    Only summ_column and the columns of the period are returned.
    '''
    # match the exact '_<period>' suffix so that e.g. 'last1' does not also
    # pick up the 'last10' columns
    suffix = '_' + time_period
    df_cols = [col for col in df.columns if str(col).endswith(suffix)]

    number_cols = [col for col in df_cols if 'number' in col.lower()]
    first = [col for col in number_cols if outcome in col.lower()][0]
    # candidates for the tie-breaker: zero, one or several columns
    others = sorted(col for col in number_cols if col != first)
    sort_by = [first] + others[:1]

    df_top = df[[summ_column] + df_cols]\
        .sort_values(by=sort_by, ascending=False)\
        .head(top_n)

    return df_top

## top streets

### prepare dataset

In [None]:
# a crash at an intersection should count for the secondary street as well:
# duplicate those rows with SECONDARY_RD moved into PRIMARY_RD
at_intersection = crashes.INTERSECTION == 'Y'
crashes_secondary = crashes[at_intersection].copy().reset_index(drop=True)
crashes_secondary['PRIMARY_RD'] = crashes_secondary['SECONDARY_RD']
crashes_primary_secondary = pd.concat([crashes, crashes_secondary])
for frame in (crashes_secondary, crashes_primary_secondary):
    display(frame.tail(3))

In [None]:
# print the row count before and after removing exact duplicate rows
rows_before = len(crashes_primary_secondary)
print(rows_before)
crashes_primary_secondary.drop_duplicates(inplace=True)
rows_after = len(crashes_primary_secondary)
print(rows_after)

### get stats

In [None]:
# get main dataset for stats: yearly totals of the counters per street.
# Built from crashes_primary_secondary (not crashes) so that a crash at an
# intersection is credited to the secondary street as well as the primary one.
street_year_keys = ['COUNTY', 'CITY', 'ACCIDENT_YEAR', 'PRIMARY_RD']
street_counters = [
    'NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
    'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED'
]
crashes_street = crashes_primary_secondary[street_year_keys + street_counters]\
    .groupby(by=street_year_keys).sum()\
    .reset_index(drop=False)
display(crashes_street.head(2))

In [None]:
# start from every street seen as primary or secondary, then attach one set
# of summary columns per time window (left merge: streets without crashes in
# a window keep NaN there)
street_keys = ['COUNTY', 'CITY', 'PRIMARY_RD']
crashes_street_stat = crashes_primary_secondary[street_keys].copy().drop_duplicates()

intervals = ['last5', 'last10']

for interval in intervals:
    df_summary = summarize(df=crashes_street, summ_column='PRIMARY_RD', time_period=interval)
    crashes_street_stat = pd.merge(
        crashes_street_stat, df_summary, how='left', on=street_keys
    )

display(crashes_street_stat.head(2))

In [None]:
# collect the union of the top `top_n_streets` streets over every
# (time window, outcome) ranking, then keep the stats rows of those streets
top_streets = set()
outcome = ['killed', 'injured']

for interval in intervals:
    for out in outcome:
        print(interval, 'years,', out)
        top_n = get_top_n(
            df=crashes_street_stat,
            top_n=top_n_streets,
            summ_column='PRIMARY_RD',
            time_period=interval,
            outcome=out,
        )
        top_streets |= set(top_n.PRIMARY_RD.unique())

is_top_street = crashes_street_stat.PRIMARY_RD.isin(list(top_streets))
top_streets_df = crashes_street_stat[is_top_street].reset_index(drop=True)

In [None]:
# write the combined street ranking to a date-stamped CSV
streets_csv_name = f'top{top_n_streets}_streets_combined_{today.strftime("%Y_%m_%d")}.csv'
top_streets_df.to_csv(streets_csv_name, index=False)

## top intersections

### create intersection column

In [None]:
# crashes flagged as at an intersection but recorded more than 15 ft away are
# probably outliers, may be look into it later; for now just show their count
far_from_intersection = (crashes.INTERSECTION == 'Y') & (crashes.DISTANCE > 15)
crashes[far_from_intersection].shape

In [None]:
# keep every crash recorded within 15 ft of an intersection
within_15_ft = crashes.DISTANCE <= 15
crashes_intersection = crashes[within_15_ft].copy().reset_index(drop=True)

# combined, order-independent intersection name built from both street names
crashes_intersection['cross_name'] = [
    join_streets(primary, secondary)
    for primary, secondary in zip(
        crashes_intersection['PRIMARY_RD'], crashes_intersection['SECONDARY_RD']
    )
]
display(crashes_intersection.head(2))

In [None]:
# get average coordinates for intersections from accidents on the intersection=Y
# so we can count it as coordinates for intersection.
# One groupby pass replaces re-filtering the whole frame per intersection name.
flagged_at_intersection = crashes_intersection[crashes_intersection.INTERSECTION == 'Y']
mean_coords = flagged_at_intersection\
    .groupby('cross_name')[['POINT_X', 'POINT_Y']].mean()\
    .reindex(crashes_intersection.cross_name.unique())
# reindex keeps every intersection name in order of first appearance;
# names without any intersection=Y crash get NaN coordinates

intersection_coords_list = [
    {'cross_name': name, 'cross_x': mean_x, 'cross_y': mean_y}
    for name, mean_x, mean_y in zip(
        mean_coords.index, mean_coords.POINT_X, mean_coords.POINT_Y
    )
]

In [None]:
# turn the per-intersection coordinates into a frame and attach them to
# every crash of that intersection
intersection_coords = pd.DataFrame(intersection_coords_list)
display(intersection_coords.head(2))

crashes_intersections = pd.merge(
    crashes_intersection, intersection_coords, on='cross_name'
)
display(crashes_intersections.head(2))

In [None]:
# check intersections with locations far from the averaged value
# (more than 0.1 away on either axis, in either direction - the absolute
# difference is used so points on both sides of the mean are caught)
offset_x = (crashes_intersections.POINT_X - crashes_intersections.cross_x).abs()
offset_y = (crashes_intersections.POINT_Y - crashes_intersections.cross_y).abs()
crashes_intersections.loc[
    (crashes_intersections.INTERSECTION=='Y')&
    ((offset_x>0.1)|(offset_y>0.1)),
    ['cross_name', 'cross_x', 'cross_y', 'POINT_X', 'POINT_Y']
]

### get stats

In [None]:
# base of the final table: one row per intersection with its coordinates
base_cols = ['COUNTY', 'CITY', 'cross_name', 'cross_x', 'cross_y']
crashes_cross_stat = crashes_intersections[base_cols].copy().drop_duplicates()
display(crashes_cross_stat.head())

# cross_name is 'street1/street2': keep both parts as separate columns
name_parts = crashes_cross_stat.cross_name.str.split('/')
crashes_cross_stat['street1'] = name_parts.str[0]
crashes_cross_stat['street2'] = name_parts.str[1]

In [None]:
# yearly totals of the counters per intersection, used for the statistics
cross_year_keys = ['COUNTY', 'CITY', 'ACCIDENT_YEAR', 'cross_name']
cross_counters = [
    'NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
    'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED'
]
crashes_cross = crashes_intersections[cross_year_keys + cross_counters]\
    .groupby(by=cross_year_keys).sum()\
    .reset_index(drop=False)

# attach one set of summary columns per time window to the base table
intervals = ['last5', 'last10']
for interval in intervals:
    df_summary = summarize(df=crashes_cross, summ_column='cross_name', time_period=interval)
    crashes_cross_stat = pd.merge(
        crashes_cross_stat, df_summary,
        how='left', on=['COUNTY', 'CITY', 'cross_name']
    )

display(crashes_cross_stat.head(2))

In [None]:
# collect the union of the top `top_n_x` intersections over every
# (time window, outcome) ranking, then keep the stats rows of those
# intersections ordered by last-5-years killed, then injured
top_intersections = set()
outcome = ['killed', 'injured']

for interval in intervals:
    for out in outcome:
        print(interval, 'years,', out)
        top_n = get_top_n(
            df=crashes_cross_stat,
            top_n=top_n_x,
            summ_column='cross_name',
            time_period=interval,
            outcome=out,
        )
        top_intersections |= set(top_n.cross_name.unique())

is_top_intersection = crashes_cross_stat.cross_name.isin(list(top_intersections))
top_intersections_df = crashes_cross_stat[is_top_intersection]\
    .reset_index(drop=True)\
    .sort_values(
        by=['NUMBER_KILLED_last5', 'NUMBER_INJURED_last5'],
        ascending=False
    )

In [None]:
# write the combined intersection ranking to a date-stamped CSV
intersections_csv_name = f'top{top_n_x}_intersections_combined_{today.strftime("%Y_%m_%d")}.csv'
top_intersections_df.to_csv(intersections_csv_name, index=False)

### more checks

In [None]:
# some checks: top intersections with a zero x coordinate or a missing street1
has_zero_x = top_intersections_df.cross_x == 0
lacks_street1 = top_intersections_df.street1.isnull()
display(top_intersections_df[has_zero_x])
display(top_intersections_df[lacks_street1])

In [None]:
# number of distinct intersections whose averaged x is exactly 0 / is missing
zero_x = crashes_intersections.cross_x == 0
missing_x = crashes_intersections.cross_x.isnull()
print(
    crashes_intersections.loc[zero_x, 'cross_name'].nunique(),
    crashes_intersections.loc[missing_x, 'cross_name'].nunique(),
)

In [None]:
# shapes of the crash rows whose averaged x is non-zero / zero / missing
x_is_zero = crashes_intersections.cross_x == 0
x_is_missing = crashes_intersections.cross_x.isnull()
print(
    crashes_intersections[~x_is_zero].shape,
    crashes_intersections[x_is_zero].shape,
    crashes_intersections[x_is_missing].shape,
)

In [None]:
# top intersections that ended up without averaged coordinates
top_intersections_df.loc[top_intersections_df.cross_x.isnull()]

In [None]:
# spot-check a single intersection by its combined name
top_intersections_df.loc[top_intersections_df.cross_name == 'HASKELL AV/VINTAGE ST']