In [54]:
from pandas.core.flags import Flags
import pandas as pd

# Number of crash records drawn at random for the spot checks below.
samp_sz = 150

# The CSV holds three kinds of rows distinguished by 'Record Type'; they are
# split here into crash (1), vehicle (2) and participant (3) frames.
data = pd.read_csv('crash_data.csv')
crashes = data[data['Record Type'] == 1]
vehicles = data[data['Record Type'] == 2]
participants = data[data['Record Type'] == 3]

crash_sample = crashes.sample(samp_sz)

# Attach the vehicle and participant rows that share a sampled crash's ID.
# DataFrame.join(on='Crash ID') would compare the sample's 'Crash ID' values
# with the *index* of the other frame (the CSV row number here), pairing
# unrelated rows. merge() matches 'Crash ID' column to 'Crash ID' column.
# Left merges keep sampled crashes that have no vehicle/participant rows.
# After both merges the participant columns ('Record Type', 'Age', ...) keep
# their plain names, because every crash/vehicle column other than the key
# has already been suffixed by the first merge.
sample = crash_sample.merge(
    vehicles, how='left', on='Crash ID', suffixes=('_left', '_right')
).merge(
    participants, how='left', on='Crash ID', suffixes=('-l', '-r')
)

# Overall verdict; individual checks below flip it to False on failure.
passed = True

# 1a, every crash has a county code
# Series.count() ignores missing values, so anything short of the sample
# size means at least one sampled crash lacks a county code.
county_codes_present = crash_sample['County Code'].count()
if county_codes_present != samp_sz:
    print("error, some data is missing a county code")
    passed = False

# 1b, every crash has a date
# Count the non-missing cells across the three date columns in one pass;
# a complete sample has exactly three populated cells per crash.
date_columns = ['Crash Month', 'Crash Day', 'Crash Year']
date_cells_present = crash_sample[date_columns].count().sum()
if date_cells_present != 3 * samp_sz:
    print("error, some data is missing dat info")
    passed = False

# 2a, lat/lon values make sense
# Inclusive (low, high) limits for each coordinate component. A missing
# value compares as False on both sides, so it fails the check as well.
coordinate_bounds = {
    'Latitude Degrees': (41, 46),
    'Latitude Minutes': (0, 59),
    'Latitude Seconds': (0, 59.99),
    'Longitude Degrees': (-124, -116),
    'Longitude Minutes': (0, 59),
    'Longitude Seconds': (0, 59.99),
}
location_ok = all(
    crash_sample[column].between(low, high).all()
    for column, (low, high) in coordinate_bounds.items()
)
if not location_ok:
    print("error, location doesn't make sense")
    passed = False

# 2b, check week
# Every sampled crash must carry a week-day code in the range 1..7.
week_day_codes = crash_sample['Week Day Code']
week_days_ok = (week_day_codes >= 1).all() and (week_day_codes <= 7).all()
if not week_days_ok:
    print("error, crashes must occur during the week")
    passed = False

# Sampled crashes whose severity code is 2 (the code this script treats as fatal).
fatal_crashes = crash_sample[crash_sample['Crash Severity'] == 2]
# 3a, crashes with severity code 2 should have fatalities
# Add the four fatality columns row by row; a fatal crash needs a positive
# total. A missing value in any column makes the row's total NaN, which
# does not compare greater than 0 and therefore fails the check.
fatality_total = fatal_crashes['Total Pedalcyclist Fatality Count']
for fatality_column in (
    'Total Pedestrian Fatality Count',
    'Total Fatality Count',
    'Total Unknown Non-Motorist Fatality Count',
):
    fatality_total = fatality_total + fatal_crashes[fatality_column]
every_fatal_crash_has_deaths = (fatality_total > 0).all()
if not every_fatal_crash_has_deaths:
    print("error, fatal crashes should have fatalities")
    passed = False

# Rows of the joined sample that carry participant (type 3) data.
# NOTE: this rebinds `participants`, so every later use of that name (check
# 4a and the final save) sees this filtered, joined frame rather than the
# full participant table split out of the CSV.
participants = sample[sample['Record Type'] == 3]
# 3b, all type 3 records should have a value in the age column
# Series.all() tests truthiness and skips NaN by default, so a missing age
# could never fail the check while a legitimate age of 0 did. notna() asks
# the intended question: is a value present?
if not participants['Age'].notna().all():
    print("error, type 3 records must have an age code")
    passed = False


# 4a, every type 1 record should have at least one type 2 or 3 record
# Walk the sampled crash IDs and stop at the first one that shows up in
# neither the vehicle rows nor the participant rows.
for crash_id in crash_sample['Crash ID']:
    has_child_record = (
        (vehicles['Crash ID'] == crash_id).any()
        or (participants['Crash ID'] == crash_id).any()
    )
    if not has_child_record:
        print("error, type 1 record should have at least one type 2/3 record")
        passed = False
        break

# 4b, total vehicle count should equal number of type 2 records for that crash ID
# Number of vehicle (type 2) rows per crash ID, looked up for each sampled
# crash; a crash with no vehicle rows gets 0.
vehicle_records_per_crash = vehicles['Crash ID'].value_counts()
actual_vehicle_counts = (
    crash_sample['Crash ID']
    .map(vehicle_records_per_crash)
    .fillna(0)
    .astype(int)
)
count_mismatch = actual_vehicle_counts != crash_sample['Total Vehicle Count']
if count_mismatch.any():
    print("error, should be as many type 2 records as value of 'Vehicle Count' ")
    # Write the correction into crash_sample itself with .loc. Assigning to
    # the row yielded by iterrows() only changes a temporary copy, and
    # stopping at the first mismatch left the remaining ones uncorrected.
    crash_sample.loc[count_mismatch, 'Total Vehicle Count'] = (
        actual_vehicle_counts[count_mismatch]
    )
    print("Updated 'Vehicle Count' to reflect number of type 2 records with that crash ID")
    # `passed` is deliberately left alone: a repaired count is not a failure,
    # but it must not reset the flag and hide failures from earlier checks.

# 5a, Number of crashes should not greatly exceed the number of vehicles
# The non-missing count of the first column serves as the record count of
# each frame.
crash_total = crashes.count().iloc[0]
vehicle_total = vehicles.count().iloc[0]
if crash_total / vehicle_total > 5:
    print("error, more crashes than vehicles by a factor of 5 or more")
    passed = False

# 5b, Number of crashes should not be more than 100,000
# Same record-count convention as 5a: non-missing entries in the first column.
max_crashes = 100000
number_of_crashes = crashes.count().iloc[0]
if number_of_crashes > max_crashes:
    print("error, too many crashes")
    passed = False

# 6a, winter crashes are more common than summer crashes
# Winter is months 12, 1 and 2; summer is months 6, 7 and 8. Each season's
# size is the non-missing count of the first column over its rows.
crash_months = crashes['Crash Month']
winter_crashes = crashes[crash_months.isin([12, 1, 2])].count().iloc[0]
summer_crashes = crashes[crash_months.isin([6, 7, 8])].count().iloc[0]
if winter_crashes <= summer_crashes:
    print("error, should be more crashes in the winter")
    passed = False

# 6b, crashes with severity 2 or 4 must together outnumber severity 5
# Severity 2 is the code this script treats as fatal; 4 and 5 are presumably
# injury and non-injury crashes, going by the error message - TODO confirm.
# NOTE(review): the original header read "severe crashes should be less
# common", which is the opposite of what this check enforces; confirm
# which one is intended.
severity_codes = crashes['Crash Severity']
death_or_injury_crashes = (
    crashes[severity_codes == 2].count().iloc[0]
    + crashes[severity_codes == 4].count().iloc[0]
)
severity_5_crashes = crashes[severity_codes == 5].count().iloc[0]
if death_or_injury_crashes <= severity_5_crashes:
    print("error, most crashes should cause death or injury")
    passed = False

# Write the three frames to CSV only when no check cleared `passed`.
if passed:
    print("data has been validated, saving now!")
    frames_to_save = (
        (crash_sample, 'updated_crash_sample.csv'),
        (vehicles, 'vehicle_records.csv'),
        (participants, 'participant_records.csv'),
    )
    for frame, csv_path in frames_to_save:
        frame.to_csv(csv_path)

error, should be as many type 2 records as value of 'Vehicle Count' 
Updated 'Vehicle Count' to reflect number of type 2 records with that crash ID
data has been validated, saving now!
