## Safe Routes Data - 

**Limited to San Fran Area**

**Generates and appends sythetic negative data**

### Import accident data

In [0]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
url = 'https://raw.githubusercontent.com/safe-routes/Build-SafeRoutes-DS/master/safe-routes-model.csv'
ac = pd.read_csv(url, error_bad_lines = False)

In [0]:
ac.head()

Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DATE,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,2015-01-13,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION
1,CR-40,NO SECOND STREET,ALAMEDA,32.524344,-86.672119,2015-01-16,FRIDAY,19,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,COLLECTOR,NOT AN INTERSECTION
2,SR-14,NO SECOND STREET,ALAMEDA,32.430664,-86.517917,2015-01-10,SATURDAY,0,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION
3,SR-147,NO SECOND STREET,SAN MATEO,32.615806,-85.507961,2015-02-11,WEDNESDAY,11,DAY,CLEAR,0,1,0,HEAD ON,ARTERY,NOT AN INTERSECTION
4,US-SR 1,NO SECOND STREET,SAN MATEO,32.679275,-85.370181,2015-02-23,MONDAY,18,NIGHT,CLEAR,0,1,1,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION


In [0]:
ac.COUNTY.unique()

array(['SAN MATEO', 'ALAMEDA', 'SAN FRANCISCO', 'MARIN', 'SANTA CLARA'],
      dtype=object)

In [0]:
ac.shape

(1164, 16)

### Add label column 'CRASH' to accident data

In [0]:
ac['CRASH'] = np.ones(len(ac)).astype(int)  # prefill with 1's

### define a random date within a range

In [0]:
import random
from datetime import datetime, timedelta
def newdate():
    start = datetime(2015, 1, 1)
    end = datetime(2017, 12, 31)
    DATE = start + (end - start) * random.random()
    DATE = pd.to_datetime(DATE)
    return DATE
DATE = newdate()
DATE

Timestamp('2016-12-10 20:41:14.907864')

### Create synthetic Negatives and add to accidents df

In [0]:
# Making copy of 'ac' as 'ac2'
ac2 = ac.copy()

In [0]:
# CREATING RANDOM NEGATIVE OBSERVATIONS

for _ in range(len(ac2)*10):
    TWAY_ID = np.random.choice(ac2['TWAY_ID'])
    ID = ac2[ac2['TWAY_ID'] == TWAY_ID]
    TWAY_ID2 =  np.random.choice(ID['TWAY_ID2'])
    COUNTY =  np.random.choice(ID['COUNTY'])
    LATITUDE = np.random.choice(ID['LATITUDE'])
    LONGITUD = np.random.choice(ID['LONGITUD'])
    DATE = newdate()
    DATE = pd.Series(DATE)
    DAY_WEEK = DATE.dt.weekday.values[0] #0-6 starting Sunday
    HOUR = DATE.dt.hour.values[0]
    DATE = DATE.dt.date.values[0]
    LGT_COND = 'NIGHT' if HOUR > 20 else 'DAY'
    WEATHER = np.random.choice(ac2['WEATHER'])
    #^ or find way to populate based on date, lgt and county
    WRK_ZONE = 0
    #^also look for way to populate based on date and county
    FATALS = 0
    PEDS = 0
    MAN_COLL = 'NOT APPLICABLE'
    FUNC_SYS = np.random.choice(ID['FUNC_SYS'])
    TYP_INT = np.random.choice(ID['TYP_INT'])
    CRASH = 0
    ROW = [[TWAY_ID, TWAY_ID2, COUNTY, LATITUDE, LONGITUD,
           DATE, DAY_WEEK, HOUR, LGT_COND, WEATHER, WRK_ZONE,
           FATALS, PEDS, MAN_COLL, FUNC_SYS, TYP_INT, CRASH]]
    NAMES = ac2.columns
    ROW = pd.DataFrame(ROW, columns = NAMES)
    ac2 = ac2.append(ROW)

In [0]:
ac2.shape

(48008, 17)

### Unencode Day of Week

In [0]:
ac2['DAY_WEEK'] = ac2['DAY_WEEK'].replace({0: 'SUNDAY',
                                         1: 'MONDAY',
                                         2: 'TUESDAY',
                                         3: 'WEDNESDAY',
                                         4: 'THURSDAY',
                                         5: 'FRIDAY',
                                         6: 'SATURDAY'})

In [0]:
ac2.tail()

In [0]:
ac2.CRASH.value_counts()