# Raw Data Analysis
In this notebook I explore the three data files collected from SWITRS. The purpose of this notebooks is to leek through each of the datasets and select the columns that will be used for the rest of the project. At the end of this notebook I hope to export a dataframe that I will clean and explore in a seperate notebook. While we are not making the final dataframe used in our model, this initial exploration will allow me to filter out data that is irrelevant to my project.

### Import Data

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
collisions = pd.read_csv('CollisionRecords.csv', engine='python', encoding='latin1', on_bad_lines='skip')


In [3]:
party = pd.read_csv('PartyRecords.csv')

  party = pd.read_csv('PartyRecords.csv')


In [4]:
victims = pd.read_csv('VictimRecords.csv')

  victims = pd.read_csv('VictimRecords.csv')


## Raw Data At A Glance

In [12]:
print("Collisions Shape:", collisions.shape)
print("Victims Shape:", victims.shape)
print("Party Shape:", party.shape)

Collisions Shape: (2486655, 76)
Victims Shape: (1762334, 10)
Party Shape: (4911120, 33)


### Party Data

In [8]:
party.sample(10)

Unnamed: 0,CASE_ID,PARTY_NUMBER,PARTY_TYPE,AT_FAULT,PARTY_SEX,PARTY_AGE,PARTY_SOBRIETY,PARTY_DRUG_PHYSICAL,DIR_OF_TRAVEL,PARTY_SAFETY_EQUIP_1,...,MOVE_PRE_ACC,VEHICLE_YEAR,VEHICLE_MAKE,STWD_VEHICLE_TYPE,CHP_VEH_TYPE_TOWING,CHP_VEH_TYPE_TOWED,RACE,INATTENTION,SPECIAL_INFO_F,SPECIAL_INFO_G
4417163,9372859,1,1,Y,-,998,G,G,S,-,...,D,,-,-,99.0,,,,-,-
2950859,91470858,1,1,Y,M,38,A,-,N,M,...,J,2006.0,MAZDA,A,1.0,,H,K,-,-
372247,82262674,1,1,Y,F,29,A,-,W,L,...,F,2021.0,KIA,A,1.0,,H,,-,-
581022,8649679,2,1,N,F,52,A,-,S,M,...,A,2010.0,TOYOTA,A,1.0,,W,,-,-
4520802,9433810,2,1,N,M,29,H,H,N,M,...,A,2013.0,-,A,1.0,,A,,-,-
4464555,9403358,1,1,Y,F,40,A,-,N,L,...,B,2015.0,-,A,1.0,,,,-,-
4859274,9616713,1,1,Y,M,64,A,-,W,M,...,B,2010.0,-,A,1.0,,H,,-,-
694271,8708330,2,1,N,M,21,A,-,W,L,...,A,2000.0,HONDA,A,1.0,,W,,-,-
1374685,9060528,2,1,N,F,21,H,H,N,M,...,B,2018.0,NISSAN,A,1.0,,W,,-,-
2061847,90978928,2,1,N,F,24,A,-,E,M,...,B,2020.0,TOYT,A,1.0,,A,,-,-


In [30]:
party.columns

Index(['CASE_ID', 'PARTY_NUMBER', 'PARTY_TYPE', 'AT_FAULT', 'PARTY_SEX',
       'PARTY_AGE', 'PARTY_SOBRIETY', 'PARTY_DRUG_PHYSICAL', 'DIR_OF_TRAVEL',
       'PARTY_SAFETY_EQUIP_1', 'PARTY_SAFETY_EQUIP_2', 'FINAN_RESPONS',
       'SP_INFO_1', 'SP_INFO_2', 'SP_INFO_3', 'OAF_VIOLATION_CODE',
       'OAF_VIOL_CAT', 'OAF_VIOL_SECTION', 'OAF_VIOLATION_SUFFIX', 'OAF_1',
       'OAF_2', 'PARTY_NUMBER_KILLED', 'PARTY_NUMBER_INJURED', 'MOVE_PRE_ACC',
       'VEHICLE_YEAR', 'VEHICLE_MAKE', 'STWD_VEHICLE_TYPE',
       'CHP_VEH_TYPE_TOWING', 'CHP_VEH_TYPE_TOWED', 'RACE', 'INATTENTION',
       'SPECIAL_INFO_F', 'SPECIAL_INFO_G'],
      dtype='object')

In [36]:
party.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4911120 entries, 0 to 4911119
Data columns (total 33 columns):
 #   Column                Dtype  
---  ------                -----  
 0   CASE_ID               int64  
 1   PARTY_NUMBER          int64  
 2   PARTY_TYPE            object 
 3   AT_FAULT              object 
 4   PARTY_SEX             object 
 5   PARTY_AGE             int64  
 6   PARTY_SOBRIETY        object 
 7   PARTY_DRUG_PHYSICAL   object 
 8   DIR_OF_TRAVEL         object 
 9   PARTY_SAFETY_EQUIP_1  object 
 10  PARTY_SAFETY_EQUIP_2  object 
 11  FINAN_RESPONS         object 
 12  SP_INFO_1             object 
 13  SP_INFO_2             object 
 14  SP_INFO_3             object 
 15  OAF_VIOLATION_CODE    object 
 16  OAF_VIOL_CAT          object 
 17  OAF_VIOL_SECTION      float64
 18  OAF_VIOLATION_SUFFIX  object 
 19  OAF_1                 object 
 20  OAF_2                 object 
 21  PARTY_NUMBER_KILLED   int64  
 22  PARTY_NUMBER_INJURED  int64  
 23  MOVE_PR

In [37]:
# view missing values
party_miss_vals = party.isna().sum()

print("Party missing vals:", "\n",party_miss_vals[party_miss_vals>0])

Party missing vals: 
 PARTY_DRUG_PHYSICAL       11159
FINAN_RESPONS            294792
OAF_VIOLATION_CODE       312384
OAF_VIOL_SECTION        4598728
OAF_VIOLATION_SUFFIX    4826341
VEHICLE_YEAR             497839
VEHICLE_MAKE                 44
CHP_VEH_TYPE_TOWING      212539
CHP_VEH_TYPE_TOWED      4771117
RACE                     915274
INATTENTION             4817319
SPECIAL_INFO_G                4
dtype: int64


### Victims Data
We will not be using this data since what we need is captured in the other two dataframes.

In [5]:
victims.sample(10)

Unnamed: 0,CASE_ID,PARTY_NUMBER,VICTIM_ROLE,VICTIM_SEX,VICTIM_AGE,VICTIM_DEGREE_OF_INJURY,VICTIM_SEATING_POSITION,VICTIM_SAFETY_EQUIP_1,VICTIM_SAFETY_EQUIP_2,VICTIM_EJECTED
1624916,9492714,2,1,F,54,7,1,M,G,0
594034,9020360,1,1,F,26,7,1,L,G,0
331334,8752357,2,2,F,59,4,4,L,G,0
889483,91121887,2,1,F,28,7,1,M,G,0
1511724,9361558,2,1,F,35,4,1,L,G,0
486196,8911193,2,1,F,88,1,1,P,G,0
987654,91317603,1,1,M,20,7,1,M,G,0
1333420,92101528,1,3,M,54,6,0,P,P,0
1020735,91389053,1,1,F,44,6,1,L,G,0
1597797,9463582,2,2,M,20,0,3,L,E,0


In [38]:
victims.columns

Index(['CASE_ID', 'PARTY_NUMBER', 'VICTIM_ROLE', 'VICTIM_SEX', 'VICTIM_AGE',
       'VICTIM_DEGREE_OF_INJURY', 'VICTIM_SEATING_POSITION',
       'VICTIM_SAFETY_EQUIP_1', 'VICTIM_SAFETY_EQUIP_2', 'VICTIM_EJECTED'],
      dtype='object')

In [40]:
victims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1762334 entries, 0 to 1762333
Data columns (total 10 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   CASE_ID                  int64 
 1   PARTY_NUMBER             int64 
 2   VICTIM_ROLE              int64 
 3   VICTIM_SEX               object
 4   VICTIM_AGE               int64 
 5   VICTIM_DEGREE_OF_INJURY  int64 
 6   VICTIM_SEATING_POSITION  object
 7   VICTIM_SAFETY_EQUIP_1    object
 8   VICTIM_SAFETY_EQUIP_2    object
 9   VICTIM_EJECTED           object
dtypes: int64(5), object(5)
memory usage: 134.5+ MB


In [41]:
# view missing values
victims_miss_vals = victims.isna().sum()

print("Victims missing vals:","\n",victims_miss_vals[victims_miss_vals>0])

Victims missing vals: 
 Series([], dtype: int64)


### Collisions Data

In [6]:
collisions.sample(10)

Unnamed: 0,CASE_ID,ACCIDENT_YEAR,PROC_DATE,JURIS,COLLISION_DATE,COLLISION_TIME,OFFICER_ID,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,...,COUNT_PED_KILLED,COUNT_PED_INJURED,COUNT_BICYCLIST_KILLED,COUNT_BICYCLIST_INJURED,COUNT_MC_KILLED,COUNT_MC_INJURED,PRIMARY_RAMP,SECONDARY_RAMP,LATITUDE,LONGITUDE
1910096,91996383,2023,20230216,9435,20230209,1835,021086,,4,2,...,0,0,0,0,0,0,-,-,36.75441,119.76919
2267641,9405194,2021,20220215,106,20211107,845,412,,7,5,...,0,1,0,0,0,0,-,-,37.63734,122.07017
696076,90650614,2018,20180129,9650,20180122,925,013570,,1,1,...,0,0,0,0,0,0,-,-,33.18312,117.29887
410754,8776833,2018,20190130,4905,20181218,1231,SR533,10609.0,2,5,...,0,0,0,0,0,0,-,-,,
1293626,91250212,2020,20200610,9265,20200531,1308,013515,,7,1,...,0,0,0,0,0,0,-,-,37.99282,121.26596
927451,90879595,2018,20181210,9765,20181205,740,016234,,3,1,...,0,0,0,0,0,0,-,-,34.22485,119.15803
1616204,9162457,2020,20201015,3313,20200805,1558,2029,2.0,3,5,...,0,0,0,0,0,0,-,-,,
1855149,91928184,2022,20221128,9660,20221122,1431,023001,,2,2,...,0,0,0,0,0,0,-,-,33.61006,114.82376
2204010,9321415,2022,20220803,3703,20220701,8,1392,3.0,5,5,...,0,0,0,0,0,0,-,-,32.677,117.17221
1373298,91335453,2020,20201030,9390,20201021,1426,018869,,3,2,...,0,0,0,0,0,0,-,-,37.70301,121.75593


In [7]:
collisions.columns

Index(['CASE_ID', 'ACCIDENT_YEAR', 'PROC_DATE', 'JURIS', 'COLLISION_DATE',
       'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'DAY_OF_WEEK',
       'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE',
       'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER',
       'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION',
       'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY',
       'CALTRANS_DISTRICT', 'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX',
       'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION', 'SIDE_OF_HWY',
       'TOW_AWAY', 'COLLISION_SEVERITY', 'NUMBER_KILLED', 'NUMBER_INJURED',
       'PARTY_COUNT', 'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL',
       'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION',
       'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION',
       'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING',
       'CONTROL_DEVICE', 'CHP_ROAD_TYPE', 'PEDESTRIAN_

In [100]:
collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2486655 entries, 0 to 2486654
Data columns (total 76 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CASE_ID                  int64  
 1   ACCIDENT_YEAR            int64  
 2   PROC_DATE                int64  
 3   JURIS                    object 
 4   COLLISION_DATE           int64  
 5   COLLISION_TIME           int64  
 6   OFFICER_ID               object 
 7   REPORTING_DISTRICT       object 
 8   DAY_OF_WEEK              int64  
 9   CHP_SHIFT                int64  
 10  POPULATION               int64  
 11  CNTY_CITY_LOC            int64  
 12  SPECIAL_COND             int64  
 13  BEAT_TYPE                int64  
 14  CHP_BEAT_TYPE            object 
 15  CITY_DIVISION_LAPD       object 
 16  CHP_BEAT_CLASS           int64  
 17  BEAT_NUMBER              object 
 18  PRIMARY_RD               object 
 19  SECONDARY_RD             object 
 20  DISTANCE                 float64
 21  DIRECTIO

In [42]:
# view missing values
collision_miss_vals = collisions.isna().sum()

print("Collision missing vals:","\n",collision_miss_vals[collision_miss_vals>0])

Collision missing vals: 
 OFFICER_ID                 1606
REPORTING_DISTRICT      1592911
CITY_DIVISION_LAPD      2372460
BEAT_NUMBER              155545
SECONDARY_RD                 70
DIRECTION                605699
STATE_HWY_IND              2734
CALTRANS_COUNTY         2422020
CALTRANS_DISTRICT       2422020
STATE_ROUTE             2422020
ROUTE_SUFFIX            2422035
POSTMILE_PREFIX         2422024
POSTMILE                2422020
LOCATION_TYPE           2422020
RAMP_INTERSECTION       2422020
SIDE_OF_HWY             2422040
TOW_AWAY                  31018
PCF_VIOLATION            144146
PCF_VIOL_SUBSECTION     1581598
PEDESTRIAN_ACCIDENT     2410387
BICYCLE_ACCIDENT        2427518
MOTORCYCLE_ACCIDENT     2398630
TRUCK_ACCIDENT          2327746
ALCOHOL_INVOLVED        2209823
CHP_VEHTYPE_AT_FAULT      95780
LATITUDE                 757214
LONGITUDE                757214
dtype: int64


### Notes:
- Relevant columns must be chosen for project
- Imported data had mixed data types that need to be changed to the appropriate data type using the SWITRS data decription
- Values must be appropriately encoded to fit data description. Ex: Age 998 = Not Stated 
- Missing values must be imputed or removed
- Final columns must be combined to make a new data set

## Select Data
For the first round of selecting data, I will manually go into the data description and choose columns that do not contribute to the purpose of the project such as `CHP Vehicle Type Towing` since this happens after the incedent and it's not something that lead to a collision. Later in the project we will use more advance techniques to filter through column selection for the model. Victims data will not be used since the information is captured in the party dataframe and other columns aren't needed.

### Collisions
List of columns:  
Case ID  
Collision Date  
Collision Time  
Reporting District  
Day of the Week  
CHP Shift  
Population  
County City Location  
Beat Type  
CHP Beat Type (A beat is a specific patrol area within a service area where a violation occurs.)  
Primary Rd    
Secondary Rd  
Direction  
Intersection  
Weather 1  
Weather 2  
State Highway Indicator    
Location Type    
Ramp Intersection    
Side of Highway    
Party Count  
Primary Collision Factor  
PCF Violation Code  
PCF Violation Category (Target)  
Type of Collision  
Motor Vehicle Involved With  
Ped Action  
Road Surface  
Road Condition 1  
Road Condition 2  
Lighting  
Control Device  
Ped Collision  
Bicycle Collision  
Motorcycle Collision  
Truck Collision  
Not Private Property  
Alcohol Involved     
Latitude  
Longitude


In [85]:
# choose columns to keep for collision
cols_to_keep = ['CASE_ID','COLLISION_DATE','COLLISION_TIME','REPORTING_DISTRICT','DAY_OF_WEEK',
                'CHP_SHIFT','POPULATION','CNTY_CITY_LOC','BEAT_TYPE','CHP_BEAT_TYPE','DISTANCE',
                'DIRECTION','INTERSECTION','WEATHER_1','WEATHER_2','STATE_HWY_IND','LOCATION_TYPE',
                'RAMP_INTERSECTION','SIDE_OF_HWY','PARTY_COUNT','PRIMARY_COLL_FACTOR','PCF_CODE_OF_VIOL',
                'PCF_VIOL_CATEGORY','TYPE_OF_COLLISION','MVIW', 'PED_ACTION','ROAD_SURFACE','ROAD_COND_1',
                'ROAD_COND_2','LIGHTING','CONTROL_DEVICE','PEDESTRIAN_ACCIDENT','BICYCLE_ACCIDENT', 
                'MOTORCYCLE_ACCIDENT','TRUCK_ACCIDENT','NOT_PRIVATE_PROPERTY','ALCOHOL_INVOLVED',
                'LATITUDE','LONGITUDE']

reduced_collisions = collisions[cols_to_keep]

In [86]:
reduced_collisions.sample(10)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,POPULATION,CNTY_CITY_LOC,BEAT_TYPE,CHP_BEAT_TYPE,...,LIGHTING,CONTROL_DEVICE,PEDESTRIAN_ACCIDENT,BICYCLE_ACCIDENT,MOTORCYCLE_ACCIDENT,TRUCK_ACCIDENT,NOT_PRIVATE_PROPERTY,ALCOHOL_INVOLVED,LATITUDE,LONGITUDE
2259573,9391451,20211212,1015,0610,7,5,6,1985,0,0,...,A,A,,,,,Y,,34.42062,118.48455
700721,90655162,20180126,2252,,5,3,5,112,1,1,...,C,D,,,,,Y,,37.71966,122.17698
1431911,91403864,20210117,1750,,7,2,4,3618,1,1,...,D,D,,,,Y,Y,,34.13155,117.20811
1260448,91213421,20200229,340,,6,3,9,4100,1,3,...,D,D,,,,,Y,Y,37.50792,122.35537
1694179,91723261,20220311,1150,,5,1,7,3711,1,1,...,A,D,,,,Y,Y,,32.78389,117.11391
1568796,91567126,20210816,1452,,1,2,6,3345,1,3,...,A,D,,,,,Y,,34.01929,117.49056
829271,90783169,20180729,116,,7,3,9,3700,1,1,...,D,D,,,,,Y,Y,33.35462,117.15894
1160346,91111315,20191022,1915,,2,2,7,3711,1,1,...,C,D,,,,,Y,,32.77839,117.10736
1512670,91499563,20210608,1352,,2,1,9,1900,1,1,...,A,D,,,,,Y,,34.28231,118.40134
279956,8638449,20180603,1603,9B,7,5,6,3315,0,0,...,A,A,,,,,Y,,33.88548,117.53619


### Party 
List of Columns:  
Case id (to merge on)  
At Fault  
Party Sex  
Party Age  
Party Sobriety  
Party Drug Physical   
Direction of travel  
Party Safety Equiptment 1  
Party Safety Equiptment 2  
Financial Responsibility  
Special Info 2  (Cell use)
OAF Violation Code  
OAF Violation Category  
OAF Factor 1  
OAF Factor 2  
Movement Preceding Collision    
Vehicle Year  
Vehicle Make  
Statewide Vehicle Type  
CHP Vehicle Type  
CHP Vehicle Type Towed  
Party Race  



In [77]:
# choose columns to keep for party
party_col_to_keep = ['CASE_ID','AT_FAULT','PARTY_SEX','PARTY_AGE','PARTY_SOBRIETY','PARTY_DRUG_PHYSICAL', 
                     'DIR_OF_TRAVEL','PARTY_SAFETY_EQUIP_1','PARTY_SAFETY_EQUIP_2','FINAN_RESPONS','SP_INFO_2',
                     'OAF_VIOLATION_CODE','OAF_VIOL_CAT','OAF_1','OAF_2','MOVE_PRE_ACC','VEHICLE_YEAR','VEHICLE_MAKE',
                     'STWD_VEHICLE_TYPE','CHP_VEH_TYPE_TOWING','CHP_VEH_TYPE_TOWED','RACE']

party_reduced = party[party_col_to_keep]

In [88]:
party_reduced.sample(10)


Unnamed: 0,CASE_ID,AT_FAULT,PARTY_SEX,PARTY_AGE,PARTY_SOBRIETY,PARTY_DRUG_PHYSICAL,DIR_OF_TRAVEL,PARTY_SAFETY_EQUIP_1,PARTY_SAFETY_EQUIP_2,FINAN_RESPONS,...,OAF_VIOL_CAT,OAF_1,OAF_2,MOVE_PRE_ACC,VEHICLE_YEAR,VEHICLE_MAKE,STWD_VEHICLE_TYPE,CHP_VEH_TYPE_TOWING,CHP_VEH_TYPE_TOWED,RACE
2234928,91066468,N,M,23,A,-,S,M,G,Y,...,-,N,-,A,2012.0,MITS,A,1.0,,H
3497494,91821743,N,F,43,A,-,N,M,G,Y,...,-,N,-,H,2012.0,TOYT,A,1.0,,H
3552467,91856988,N,M,34,A,-,N,M,G,Y,...,-,N,-,A,2022.0,MAZD,A,7.0,,O
258373,82013237,Y,M,998,G,G,W,-,-,N,...,-,N,-,G,1997.0,CHEVROLET,D,22.0,,B
1897982,90895621,Y,M,92,A,-,N,M,G,Y,...,-,N,-,D,2016.0,CHEV,D,22.0,,W
3431105,91778141,N,-,998,G,G,N,B,B,N,...,-,N,-,J,,UNKNOWN,-,99.0,,
2301335,91099417,N,M,46,A,-,S,P,G,Y,...,-,N,-,H,2015.0,FRH,G,25.0,31.0,O
2622963,91273944,N,M,69,A,-,E,P,G,Y,...,-,N,-,B,1997.0,DODGE,D,22.0,,H
19617,81330620,N,M,23,A,-,E,M,G,Y,...,-,N,-,D,1997.0,PLYMOUTH,A,1.0,,B
4236886,9260882,N,M,998,A,-,W,-,-,N,...,-,N,-,B,,-,-,,,


### Merge Dataframes

In [91]:
accidents = party_reduced.merge(reduced_collisions, how = 'left', on = 'CASE_ID')

In [95]:
pd.set_option('display.max_columns', None)

accidents.head(10)

Unnamed: 0,CASE_ID,AT_FAULT,PARTY_SEX,PARTY_AGE,PARTY_SOBRIETY,PARTY_DRUG_PHYSICAL,DIR_OF_TRAVEL,PARTY_SAFETY_EQUIP_1,PARTY_SAFETY_EQUIP_2,FINAN_RESPONS,SP_INFO_2,OAF_VIOLATION_CODE,OAF_VIOL_CAT,OAF_1,OAF_2,MOVE_PRE_ACC,VEHICLE_YEAR,VEHICLE_MAKE,STWD_VEHICLE_TYPE,CHP_VEH_TYPE_TOWING,CHP_VEH_TYPE_TOWED,RACE,COLLISION_DATE,COLLISION_TIME,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,POPULATION,CNTY_CITY_LOC,BEAT_TYPE,CHP_BEAT_TYPE,DISTANCE,DIRECTION,INTERSECTION,WEATHER_1,WEATHER_2,STATE_HWY_IND,LOCATION_TYPE,RAMP_INTERSECTION,SIDE_OF_HWY,PARTY_COUNT,PRIMARY_COLL_FACTOR,PCF_CODE_OF_VIOL,PCF_VIOL_CATEGORY,TYPE_OF_COLLISION,MVIW,PED_ACTION,ROAD_SURFACE,ROAD_COND_1,ROAD_COND_2,LIGHTING,CONTROL_DEVICE,PEDESTRIAN_ACCIDENT,BICYCLE_ACCIDENT,MOTORCYCLE_ACCIDENT,TRUCK_ACCIDENT,NOT_PRIVATE_PROPERTY,ALCOHOL_INVOLVED,LATITUDE,LONGITUDE
0,81715,Y,M,35,H,H,N,L,G,Y,3,-,-,N,-,B,2007.0,FORD,-,,,O,20200314.0,745.0,212.0,6.0,5.0,7.0,1941.0,0.0,0,0.0,,Y,B,-,N,,,,2.0,A,-,3,C,C,A,A,H,-,A,D,,,,,Y,,,
1,81715,N,F,43,H,H,N,M,G,Y,3,-,-,N,-,B,2019.0,-,-,,,H,20200314.0,745.0,212.0,6.0,5.0,7.0,1941.0,0.0,0,0.0,,Y,B,-,N,,,,2.0,A,-,3,C,C,A,A,H,-,A,D,,,,,Y,,,
2,682021,Y,F,21,C,-,W,M,G,Y,-,-,-,E,-,J,2014.0,-,A,1.0,,H,20210608.0,855.0,,2.0,5.0,5.0,3709.0,0.0,0,0.0,,-,A,-,N,,,,2.0,A,-,9,D,C,A,A,H,-,A,D,,,,,Y,Y,,
3,682021,N,F,58,C,-,W,L,G,Y,-,-,-,E,-,B,2018.0,-,A,1.0,,A,20210608.0,855.0,,2.0,5.0,5.0,3709.0,0.0,0,0.0,,-,A,-,N,,,,2.0,A,-,9,D,C,A,A,H,-,A,D,,,,,Y,Y,,
4,726202,Y,-,998,G,G,N,-,-,,-,-,-,N,-,B,2005.0,-,A,1.0,,,20200726.0,250.0,64.0,7.0,5.0,5.0,3612.0,0.0,0,0.0,,-,A,-,N,,,,1.0,A,-,3,E,I,A,A,H,-,C,D,,,,,Y,,,
5,6698641,Y,M,52,A,-,E,N,C,N,3,-,-,N,-,B,1981.0,CHEVROLET,D,22.0,,H,20180911.0,1530.0,27.0,2.0,5.0,5.0,3012.0,0.0,0,10.0,W,N,A,-,N,,,,3.0,A,-,3,C,C,A,A,H,-,A,D,,,,,Y,,,
6,6698641,N,F,52,A,-,E,M,G,Y,3,-,-,N,-,A,2015.0,MAZDA,A,7.0,,W,20180911.0,1530.0,27.0,2.0,5.0,5.0,3012.0,0.0,0,10.0,W,N,A,-,N,,,,3.0,A,-,3,C,C,A,A,H,-,A,D,,,,,Y,,,
7,6698641,N,F,54,A,-,E,M,G,Y,3,-,-,N,-,A,2007.0,CHEVROLET,D,22.0,,H,20180911.0,1530.0,27.0,2.0,5.0,5.0,3012.0,0.0,0,10.0,W,N,A,-,N,,,,3.0,A,-,3,C,C,A,A,H,-,A,D,,,,,Y,,,
8,6698645,Y,M,31,B,-,N,M,G,N,3,,25,A,-,H,1998.0,NISSAN,A,1.0,,H,20180930.0,1945.0,2607.0,7.0,5.0,6.0,1950.0,0.0,0,0.0,,Y,A,-,Y,H,-,E,2.0,A,-,1,C,C,A,A,H,-,C,A,,,,,Y,Y,34.57264,118.04491
9,6698645,N,M,44,A,-,N,M,G,Y,3,-,-,N,-,A,2018.0,AUDI,A,1.0,,W,20180930.0,1945.0,2607.0,7.0,5.0,6.0,1950.0,0.0,0,0.0,,Y,A,-,Y,H,-,E,2.0,A,-,1,C,C,A,A,H,-,C,A,,,,,Y,Y,34.57264,118.04491


In [94]:
# export data
accidents.to_csv('accident_data.csv')

This is the end of this workbook. In order to save memory on my device I will continue exploring and cleaning the data in another workbook. This will also maintain a cleaner look for the project.  
Something that I am wondering at this point of the project is if I should only consider the data for people who were at fault to avoid doubling values for some columns. 
