In [10]:
# Import all dependencies
import pandas as pd
import numpy as np

#SQLAlchemy libraries
import sqlite3
from sqlalchemy import create_engine # library to create the connection between DB and Python


# Datetime libraries
from datetime import datetime as dt, timedelta

# library to read data from zipfile
from zipfile import ZipFile as ZF
import os

# libraries to read json
import json
import requests
from pandas.io.json import json_normalize


In [11]:
# Declare global variables
dbPath = "trafficViolations/static/db"

dbName = "trafficViolations.sqlite"

importDir = "rawdata"
importFile = "traffic-violations-in-usa.zip"

#columns name (after it is read onto pandas)
df_col_names = ['DateOfStop', 'TimeOfStop', 'Agency', 'SubAgency', 'Description',
       'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'PersonalInjury', 'PropertyDamage', 'Fatal', 'CommercialLicense',
       'HAZMAT', 'CommercialVehicle', 'Alcohol', 'WorkZone', 'State',
       'VehicleType', 'Year', 'Make', 'Model', 'Color', 'ViolationType',
       'Charge', 'Article', 'ContributedToAccident', 'Race', 'Gender',
       'DriverCity', 'DriverState', 'DLState', 'ArrestType',
       'Geolocation']

violationCat = {"Impaired" : ["ALCOHOL","DRUGS","ALCO"],
                "Offense" : ["REGISTRATION","LICENSE","INSURANCE","PLATE","REG.PLATE"],
                "Safety":["UNSAFE","SEATBELT","HELMET","EQUIP","EQUIPMENT","WINDSHIELD","MIRRORS","BRAKE","INADEQUATE,INOPERATIVE","OPERATING"],
                "Violation":["SPEEDING","SPEED","STOP","PARKING","FAILURE","ELLUDE,LAMP,LAMPS","DEVICE","SIGNAL","LIGHT","LIGHTS","AVOIDING","AVOID","INTERSECTION"],
                "Distraction":["HANDHELD","MOBILE","ELECTRONIC","VIDEO","EARPLUGS","SOUND","TEXT","MSG."]
                }

vehicleGrp = {
    "Automobile":["Automobile","Limousine","Station Wagon"],
"Truck":["Light Duty Truck","Heavy Duty Truck"],
"Motorcyle":["Motorcycle","Moped"],
"Other":["Recreational Vehicle","Unknown","Commercial Rig","Camper"],
"Bus":["Transit Bus","School Bus","Cross Country Bus"],
"FarmVehicle":["Farm Vehicle","Farm Equipment"],
"Trailer":["Utility Trailer","Mobile Home","Tandem Trailer","Travel/Home Trailer","Boat Trailer"],
"RMS":["Fire Vehicle","Ambulance","Police Vehicle","Police(Emerg)","Ambulance(Emerg)","Ambulance(Non-Emerg)","Police(Non-Emerg)","Fire(Non-Emerg)","Fire(Emerg)"]
}

mnthToQtr = {1:1,2:1,3:1,4:2,5:2,6:2,7:3,8:3,9:3,10:4,11:4,12:4}

# zipCode JSON file URL
zipCd_URL = "https://data.montgomerycountymd.gov/resource/mmib-2cgz.json"

#Police District JSON link
police_dist_URL = "https://data.montgomerycountymd.gov/resource/vxy6-ve2e.json"

In [12]:
################ Global Helper Functions ###################################
# Function to derive Violation category based on description
# ensure violationCat dict is declared as global variable
def assignViolationCat(desc):
    for key,value in violationCat.items():
    #     print(value)
        if any(v in desc for v in value):
            return key
    else:
        return "Other"

def getPoliceDistrict(str):
    return(str if(str != "H") else "8")

def assignVehGrp(vehType):
    for key,value in vehicleGrp.items():
    #     print(value)
        if any(v in vehType for v in value):
            return key
    else:
        return "Other"
    

### Read csv data file and import into sqlite

In [13]:
# Read accident data from all the zip files
violationsDF = pd.concat(\
    [pd.read_csv(ZF(os.path.join(importDir,importFile)).open("Traffic_Violations.csv"), low_memory=False, header = 0, names = df_col_names) \
     for file in os.listdir(importDir) \
     if file.endswith("zip")], \
        ignore_index = True)

ValueError: No objects to concatenate

In [9]:
violationsDF.head(2)

NameError: name 'violationsDF' is not defined

In [None]:
violationsDF.columns

In [None]:
dropCols = ["TimeOfStop","Agency","Location","Latitude","Longitude","Accident",'Belts',"CommercialLicense",\
            "HAZMAT",'CommercialVehicle', 'Alcohol', 'WorkZone',"State","Year","Make","Model","Color","Charge",\
            "Article","Race",'Gender',"DriverCity","DLState","ArrestType",'Geolocation']

violationsDF.drop(dropCols, axis = 1, inplace = True)
violationsDF.columns

In [None]:
violationsDF["DateOfStop"].count()

In [None]:
# clean the data
# drop nas
violationsDF = violationsDF.dropna(how = "any")
violationsDF["DateOfStop"].count()

In [None]:
dup_violations = violationsDF[violationsDF.duplicated()]
violationsDF.drop_duplicates(inplace = True)
violationsDF.duplicated().sum()

In [None]:
#convert yes and no values to boolean (0, 1)
factorCols = ['PersonalInjury', 'PropertyDamage', 'Fatal','ContributedToAccident']

for f in factorCols:
    violationsDF[f] = violationsDF[f].map({"Yes" : True, "No" : False})

violationsDF.head(2)


In [None]:
#convert Date of Stop and time of stop as Datatime objects
violationsDF['DateOfStop'] = violationsDF['DateOfStop'].map(lambda r: dt.strptime(r, "%m/%d/%Y"))

In [None]:
violationsDF['Year'] = violationsDF['DateOfStop'].apply(lambda r: r.year)
violationsDF['Month'] = violationsDF['DateOfStop'].apply(lambda r: r.month)
violationsDF['Qtr'] = violationsDF['Month'].apply(lambda r : mnthToQtr[r])

In [None]:
violationsDF['ViolationCategory'] = violationsDF.Description.map(lambda r: assignViolationCat(str(r)))
violationsDF.ViolationCategory.unique()

In [None]:
violationsDF.head(2)

In [None]:
# Add column for Police District based on SubAgency for GeoJSON mapping
violationsDF['PoliceDistrictID'] = violationsDF.SubAgency.map(lambda r : getPoliceDistrict(r[0]))
violationsDF.PoliceDistrictID = violationsDF.PoliceDistrictID.astype(int)

In [None]:
violationsDF.dtypes

In [None]:
violationsDF.columns

In [None]:
# write out to CSV
violationsDF.to_csv(os.path.join(importDir,"Traffic_Violations_cleaned.csv"), index = False)

### Alternate method - without Geolocation


In [None]:
#Add a col called ViolationCount to sum the total violation while grouping
violationsDF['ViolationCount'] = 1

In [None]:
# Add vehicle group reducing the vehicletype grouping
violationsDF['VehicleGroup'] = violationsDF.VehicleType.map(lambda r : assignVehGrp(str(r)))
violationsDF['VehicleGroup'].head()

In [None]:
violationsDF.columns

In [None]:
#Group the data by Year, Month, SubAgency, PoliceDistrictID, Gender, VehicleType, ViolationType,Driver State,
# Geolocation, Violationcategory
vDF_grp_alt = violationsDF[['SubAgency', 'PersonalInjury','PropertyDamage', 'Fatal', \
                        'VehicleGroup', 'ViolationType', 'ContributedToAccident', \
                         'Year', 'Month', 'Qtr','ViolationCategory','PoliceDistrictID', 'ViolationCount']].\
            groupby(['Year','Qtr','Month','SubAgency','PoliceDistrictID','ViolationType','ViolationCategory',\
                     'VehicleGroup']).agg(np.sum)

vDF_grp_alt.reset_index(inplace = True)

In [None]:
vDF_grp_alt.count()

In [None]:
vDF_grp_alt.to_csv(os.path.join(importDir, "Traffic_violations_grouped_new.csv"), index = False)

### Write to sqlite db

In [None]:
#Create the table with ID column as primary key -IMPORTANT ELSE AUTOMAP_BASE will not work
conn = sqlite3.connect(f'{dbPath}/{dbName}')
c = conn.cursor()

c.executescript('''
    PRAGMA foreign_keys=off;

    BEGIN TRANSACTION;
    
    /*create a new table with the same column names and types while
    defining a primary key for the desired column*/
    
    CREATE TABLE `traffic_violations` (
    `ID` BIGINT PRIMARY KEY NOT NULL,
     `Year` BIGINT,
     `Qtr` BIGINT,
     `Month` BIGINT,     
     `SubAgency` TEXT,
     `PoliceDistrictID` BIGINT,
     `ViolationType` TEXT,
     `ViolationCategory` TEXT,
     `VehicleGroup` TEXT,
     `PersonalInjury` FLOAT,
     `PropertyDamage` FLOAT,
     `Fatal` FLOAT,
     `ContributedToAccident` FLOAT,
     `ViolationCount` BIGINT
    );

    COMMIT TRANSACTION;

    PRAGMA foreign_keys=on;''')

#close out the connection
c.close()
conn.close()

In [None]:
engine = create_engine(f"sqlite:///{dbPath}/{dbName}", echo = True)
print(f"sqlite:///{dbPath}/{dbName}")

In [None]:
vDF_grp_alt.to_sql("traffic_violations", engine, if_exists = "append", index = True, index_label = "ID")