In [34]:
# Import all dependencies
import pandas as pd
import numpy as np

#SQLAlchemy libraries
import sqlite3
from sqlalchemy import create_engine # library to create the connection between DB and Python
from sqlalchemy import func # library to use aggregate functions
from sqlalchemy.ext.declarative import declarative_base # to create new tables or other metadata
from sqlalchemy.ext.automap import automap_base # library specific to reflecting existing schema from DB and its mapping
from sqlalchemy import func, desc, and_, or_ # library provides basic sql functions like count, max, min, avg
from sqlalchemy.orm import Session # Library that sets session for SQL transaction
from sqlalchemy import inspect # library to inspect the SQL table from the Session
from sqlalchemy.sql.functions import coalesce # library to use check if null and replace with 0 or any value

# Datetime libraries
from datetime import datetime as dt, timedelta

# library to read data from zipfile
from zipfile import ZipFile as ZF
import os

# libraries to read json
import json
import requests
from pandas.io.json import json_normalize


In [3]:
# Declare global variables

# Directory and file name of the SQLite database written by this notebook
dbPath = "trafficViolations/static/db"

dbName = "trafficViolations.sqlite"

# Directory and zip archive that hold the raw violations CSV
importDir = "rawdata"
importFile = "traffic-violations-in-usa.zip"

#columns name (after it is read onto pandas)
df_col_names = ['DateOfStop', 'TimeOfStop', 'Agency', 'SubAgency', 'Description',
       'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'PersonalInjury', 'PropertyDamage', 'Fatal', 'CommercialLicense',
       'HAZMAT', 'CommercialVehicle', 'Alcohol', 'WorkZone', 'State',
       'VehicleType', 'Year', 'Make', 'Model', 'Color', 'ViolationType',
       'Charge', 'Article', 'ContributedToAccident', 'Race', 'Gender',
       'DriverCity', 'DriverState', 'DLState', 'ArrestType',
       'Geolocation']

# Keyword lists used by assignViolationCat(): a description gets the first category
# (in the order written here) that has one of its keywords occurring in the text.
# Every keyword must be its own string: a comma-joined entry such as "A,B" is matched
# as the literal text "A,B" and therefore never hits.
violationCat = {"Impaired" : ["ALCOHOL","DRUGS","ALCO"],
                "Offense" : ["REGISTRATION","LICENSE","INSURANCE","PLATE","REG.PLATE"],
                "Safety":["UNSAFE","SEATBELT","HELMET","EQUIP","EQUIPMENT","WINDSHIELD","MIRRORS","BRAKE","INADEQUATE","INOPERATIVE","OPERATING"],
                "Violation":["SPEEDING","SPEED","STOP","PARKING","FAILURE","ELLUDE","LAMP","LAMPS","DEVICE","SIGNAL","LIGHT","LIGHTS","AVOIDING","AVOID","INTERSECTION"],
                "Distraction":["HANDHELD","MOBILE","ELECTRONIC","VIDEO","EARPLUGS","SOUND","TEXT","MSG."]
                }

# Vehicle type names collected into broader groups; used by assignVehGrp().
# NOTE(review): the key "Motorcyle" is misspelled, but it ends up as a data value in the
# exported files/database, so it is kept as-is here - confirm consumers before renaming.
vehicleGrp = {
    "Automobile":["Automobile","Limousine","Station Wagon"],
    "Truck":["Light Duty Truck","Heavy Duty Truck"],
    "Motorcyle":["Motorcycle","Moped"],
    "Other":["Recreational Vehicle","Unknown","Commercial Rig","Camper"],
    "Bus":["Transit Bus","School Bus","Cross Country Bus"],
    "FarmVehicle":["Farm Vehicle","Farm Equipment"],
    "Trailer":["Utility Trailer","Mobile Home","Tandem Trailer","Travel/Home Trailer","Boat Trailer"],
    "RMS":["Fire Vehicle","Ambulance","Police Vehicle","Police(Emerg)","Ambulance(Emerg)","Ambulance(Non-Emerg)","Police(Non-Emerg)","Fire(Non-Emerg)","Fire(Emerg)"]
}

# zipCode JSON file URL
zipCd_URL = "https://data.montgomerycountymd.gov/resource/mmib-2cgz.json"

#Police District JSON link
police_dist_URL = "https://data.montgomerycountymd.gov/resource/vxy6-ve2e.json"

In [4]:
################ Global Helper Functions ###################################
# Function to derive Violation category based on description
# ensure violationCat dict is declared as global variable
def assignViolationCat(desc):
    """Return the violation category for a description string.

    Categories are checked in the iteration order of the global ``violationCat``
    dict; the first one with a keyword that occurs as a substring of ``desc``
    wins. ``"Other"`` is returned when no keyword of any category matches.
    """
    matchingCategories = (
        category
        for category, keywords in violationCat.items()
        if any(keyword in desc for keyword in keywords)
    )
    return next(matchingCategories, "Other")

def getPoliceDistrict(str):
    """Map a district code to its district id string: "H" becomes "8", anything else is returned unchanged."""
    if str == "H":
        return "8"
    return str

def assignVehGrp(vehType):
    """Return the vehicle group for a vehicle type string.

    Groups are checked in the iteration order of the global ``vehicleGrp`` dict;
    the first group with a member name that occurs as a substring of ``vehType``
    is returned, and ``"Other"`` when none does.
    """
    for group, memberNames in vehicleGrp.items():
        for memberName in memberNames:
            if memberName in vehType:
                return group
    return "Other"

### Read csv data file and import into sqlite

In [6]:
# Read the violations CSV from every zip archive found in the import directory.
# Each archive found is opened itself (previously the loop always re-opened importFile,
# loading the same data once per zip present), and every handle is closed after reading.
zipNames = sorted(name for name in os.listdir(importDir) if name.endswith("zip"))

frames = []
for zipName in zipNames:
    with ZF(os.path.join(importDir, zipName)) as archive, \
            archive.open("Traffic_Violations.csv") as csvFile:
        # header = 0 discards the file's own header row in favour of df_col_names
        frames.append(pd.read_csv(csvFile, low_memory=False, header = 0, names = df_col_names))

violationsDF = pd.concat(frames, ignore_index = True)

In [7]:
violationsDF.head(2)

Unnamed: 0,DateOfStop,TimeOfStop,Agency,SubAgency,Description,Location,Latitude,Longitude,Accident,Belts,...,Charge,Article,ContributedToAccident,Race,Gender,DriverCity,DriverState,DLState,ArrestType,Geolocation
0,09/24/2013,17:11:00,MCP,"3rd district, Silver Spring",DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI...,8804 FLOWER AVE,,,No,No,...,13-401(h),Transportation Article,No,BLACK,M,TAKOMA PARK,MD,MD,A - Marked Patrol,
1,12/20/2012,00:41:00,MCP,"2nd district, Bethesda",DRIVING WHILE IMPAIRED BY ALCOHOL,NORFOLK AVE / ST ELMO AVE,38.983578,-77.093105,No,No,...,21-902(b1),Transportation Article,No,WHITE,M,DERWOOD,MD,MD,A - Marked Patrol,"(38.9835782, -77.09310515)"


In [8]:
# Columns removed before cleaning and grouping
dropCols = [
    "TimeOfStop", "Agency", "Location", "Latitude", "Longitude", "Accident",
    "CommercialLicense", "HAZMAT", "State", "Year", "Make", "Model", "Color",
    "Charge", "Article", "Race", "DriverCity", "DLState", "ArrestType",
]

violationsDF = violationsDF.drop(columns = dropCols)
violationsDF.columns

Index(['DateOfStop', 'SubAgency', 'Description', 'Belts', 'PersonalInjury',
       'PropertyDamage', 'Fatal', 'CommercialVehicle', 'Alcohol', 'WorkZone',
       'VehicleType', 'ViolationType', 'ContributedToAccident', 'Gender',
       'DriverState', 'Geolocation'],
      dtype='object')

In [9]:
violationsDF["DateOfStop"].count()

1018634

In [10]:
violationsDF.columns

Index(['DateOfStop', 'SubAgency', 'Description', 'Belts', 'PersonalInjury',
       'PropertyDamage', 'Fatal', 'CommercialVehicle', 'Alcohol', 'WorkZone',
       'VehicleType', 'ViolationType', 'ContributedToAccident', 'Gender',
       'DriverState', 'Geolocation'],
      dtype='object')

In [11]:
# Clean the data: discard every row that has a missing value in any column,
# then show how many rows remain
violationsDF = violationsDF.dropna(axis = 0, how = "any")
violationsDF.DateOfStop.count()

933583

In [12]:
# Keep a copy of the repeated rows for inspection, remove them from the working frame,
# and confirm that no duplicates are left (expected result: 0)
dup_violations = violationsDF.loc[violationsDF.duplicated()]
violationsDF = violationsDF.drop_duplicates()
violationsDF.duplicated().sum()

0

In [13]:
# Convert the "Yes"/"No" flag columns to booleans
yesNoToBool = {"Yes" : True, "No" : False}

factorCols = ['Belts', 'PersonalInjury', 'PropertyDamage', 'Fatal',
              'CommercialVehicle', 'Alcohol', 'WorkZone', 'ContributedToAccident']

for flagCol in factorCols:
    violationsDF[flagCol] = violationsDF[flagCol].map(yesNoToBool)

violationsDF.head(2)


Unnamed: 0,DateOfStop,SubAgency,Description,Belts,PersonalInjury,PropertyDamage,Fatal,CommercialVehicle,Alcohol,WorkZone,VehicleType,ViolationType,ContributedToAccident,Gender,DriverState,Geolocation
1,12/20/2012,"2nd district, Bethesda",DRIVING WHILE IMPAIRED BY ALCOHOL,False,False,False,False,False,False,False,02 - Automobile,Citation,False,M,MD,"(38.9835782, -77.09310515)"
2,07/20/2012,"5th district, Germantown",FAILURE TO STOP AT STOP SIGN,False,False,False,False,False,False,False,02 - Automobile,Citation,False,F,MD,"(39.1618098166667, -77.25358095)"


In [14]:
# Convert DateOfStop from "MM/DD/YYYY" text to datetime values.
# pd.to_datetime parses the whole column in one vectorized call instead of calling
# strptime once per row.
violationsDF['DateOfStop'] = pd.to_datetime(violationsDF['DateOfStop'], format = "%m/%d/%Y")

In [15]:
# Derive calendar year and month of each stop as 64-bit integers
stopDates = violationsDF['DateOfStop'].dt
violationsDF['Year'] = stopDates.year.astype("int64")
violationsDF['Month'] = stopDates.month.astype("int64")

In [16]:
# Classify every description (as text) with assignViolationCat and list the categories found
descriptionText = violationsDF['Description'].map(str)
violationsDF['ViolationCategory'] = descriptionText.map(assignViolationCat)
violationsDF['ViolationCategory'].unique()

array(['Impaired', 'Violation', 'Distraction', 'Safety', 'Offense',
       'Other'], dtype=object)

In [17]:
violationsDF.head(2)

Unnamed: 0,DateOfStop,SubAgency,Description,Belts,PersonalInjury,PropertyDamage,Fatal,CommercialVehicle,Alcohol,WorkZone,VehicleType,ViolationType,ContributedToAccident,Gender,DriverState,Geolocation,Year,Month,ViolationCategory
1,2012-12-20,"2nd district, Bethesda",DRIVING WHILE IMPAIRED BY ALCOHOL,False,False,False,False,False,False,False,02 - Automobile,Citation,False,M,MD,"(38.9835782, -77.09310515)",2012,12,Impaired
2,2012-07-20,"5th district, Germantown",FAILURE TO STOP AT STOP SIGN,False,False,False,False,False,False,False,02 - Automobile,Citation,False,F,MD,"(39.1618098166667, -77.25358095)",2012,7,Violation


In [18]:
# Add column for Police District based on SubAgency for GeoJSON mapping:
# the first character of SubAgency is passed through getPoliceDistrict and stored as an integer
districtCodes = violationsDF['SubAgency'].map(lambda subAgency : getPoliceDistrict(subAgency[0]))
violationsDF['PoliceDistrictID'] = districtCodes.astype(int)

In [19]:
violationsDF.dtypes

DateOfStop               datetime64[ns]
SubAgency                        object
Description                      object
Belts                              bool
PersonalInjury                     bool
PropertyDamage                     bool
Fatal                              bool
CommercialVehicle                  bool
Alcohol                            bool
WorkZone                           bool
VehicleType                      object
ViolationType                    object
ContributedToAccident              bool
Gender                           object
DriverState                      object
Geolocation                      object
Year                              int64
Month                             int64
ViolationCategory                object
PoliceDistrictID                  int32
dtype: object

In [20]:
violationsDF.columns

Index(['DateOfStop', 'SubAgency', 'Description', 'Belts', 'PersonalInjury',
       'PropertyDamage', 'Fatal', 'CommercialVehicle', 'Alcohol', 'WorkZone',
       'VehicleType', 'ViolationType', 'ContributedToAccident', 'Gender',
       'DriverState', 'Geolocation', 'Year', 'Month', 'ViolationCategory',
       'PoliceDistrictID'],
      dtype='object')

In [85]:
# Write the cleaned, row-level violations to a CSV file in the import directory
# (index = False leaves the DataFrame index out of the file)
violationsDF.to_csv(os.path.join(importDir,"Traffic_Violations_cleaned.csv"), index = False)

### Alternate method - without Geolocation


In [23]:
# Label each driver as "In-State" (DriverState is "MD") or "Out-Of-State" (anything else)
isInState = violationsDF['DriverState'].eq("MD")
violationsDF['DL_State_type'] = isInState.map({True : "In-State", False : "Out-Of-State"})

In [24]:
# Collapse the detailed VehicleType values into the broader groups defined in vehicleGrp
vehicleTypeText = violationsDF['VehicleType'].map(str)
violationsDF['VehicleGroup'] = vehicleTypeText.map(assignVehGrp)
violationsDF['VehicleGroup'].head()

1    Automobile
2    Automobile
3    Automobile
4    Automobile
5    Automobile
Name: VehicleGroup, dtype: object

In [25]:
# Group the data by Year, Month, SubAgency, PoliceDistrictID, ViolationType, ViolationCategory,
# Gender, VehicleGroup and DL_State_type (no Geolocation), summing the flag and count columns.

# Every row is one violation. The counter is set here so that this cell does not rely on
# a cell further down the notebook having been executed first; the assignment is idempotent.
violationsDF['ViolationCount'] = 1

altGroupKeys = ['Year', 'Month', 'SubAgency', 'PoliceDistrictID', 'ViolationType',
                'ViolationCategory', 'Gender', 'VehicleGroup', 'DL_State_type']

altSumCols = ['Belts', 'PersonalInjury', 'PropertyDamage', 'Fatal', 'CommercialVehicle',
              'Alcohol', 'WorkZone', 'ContributedToAccident', 'ViolationCount']

# .sum() is what agg(np.sum) resolves to, without the deprecation warning newer pandas emits
vDF_grp_alt = (
    violationsDF[altGroupKeys + altSumCols]
    .groupby(altGroupKeys)
    .sum()
    .reset_index()
)

In [26]:
vDF_grp_alt.count()

Year                     40249
Month                    40249
SubAgency                40249
PoliceDistrictID         40249
ViolationType            40249
ViolationCategory        40249
Gender                   40249
VehicleGroup             40249
DL_State_type            40249
Belts                    40249
PersonalInjury           40249
PropertyDamage           40249
Fatal                    40249
CommercialVehicle        40249
Alcohol                  40249
WorkZone                 40249
ContributedToAccident    40249
ViolationCount           40249
dtype: int64

In [27]:
# Save the grouped (no-Geolocation) summary as CSV in the import directory, without the index
vDF_grp_alt.to_csv(os.path.join(importDir, "Traffic_violations_grouped.csv"), index = False)

### Write to sqlite db

In [40]:
#Create the table with ID column as primary key -IMPORTANT ELSE AUTOMAP_BASE will not work

# sqlite3.connect() does not create missing directories, so make sure the folder exists
os.makedirs(dbPath, exist_ok = True)

conn = sqlite3.connect(f'{dbPath}/{dbName}')
c = conn.cursor()

try:
    # Drop any table left over from a previous run first: CREATE TABLE would otherwise fail
    # with "table already exists", and appending the same ID values again would violate
    # the primary key.
    c.executescript('''
    PRAGMA foreign_keys=off;

    BEGIN TRANSACTION;

    DROP TABLE IF EXISTS `traffic_violations`;

    /*create a new table with the same column names and types while
    defining a primary key for the desired column*/

    CREATE TABLE `traffic_violations` (
    `ID` BIGINT PRIMARY KEY NOT NULL,
     `Year` BIGINT,
     `Month` BIGINT,
     `SubAgency` TEXT,
     `PoliceDistrictID` BIGINT,
     `ViolationType` TEXT,
     `ViolationCategory` TEXT,
     `Gender` TEXT,
     `VehicleGroup` TEXT,
     `DL_State_type` TEXT,
     `Belts` FLOAT,
     `PersonalInjury` FLOAT,
     `PropertyDamage` FLOAT,
     `Fatal` FLOAT,
     `CommercialVehicle` FLOAT,
     `Alcohol` FLOAT,
     `WorkZone` FLOAT,
     `ContributedToAccident` FLOAT,
     `ViolationCount` BIGINT
    );

    COMMIT TRANSACTION;

    PRAGMA foreign_keys=on;''')
finally:
    #close out the connection, also when the script fails
    c.close()
    conn.close()

In [41]:
# SQLAlchemy engine for the SQLite file; echo = True logs every statement it issues
dbURL = f"sqlite:///{dbPath}/{dbName}"
engine = create_engine(dbURL, echo = True)
print(dbURL)

sqlite:///trafficViolations/static/db/trafficViolations.sqlite


In [42]:
# Append the grouped rows to the existing traffic_violations table; the DataFrame index
# is written to the "ID" column (index = True, index_label = "ID")
vDF_grp_alt.to_sql("traffic_violations", engine, if_exists = "append", index = True, index_label = "ID")

2019-03-29 22:27:23,741 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1


INFO:sqlalchemy.engine.base.Engine:SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1


2019-03-29 22:27:23,744 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


2019-03-29 22:27:23,748 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1


INFO:sqlalchemy.engine.base.Engine:SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1


2019-03-29 22:27:23,750 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


2019-03-29 22:27:23,753 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("traffic_violations")


INFO:sqlalchemy.engine.base.Engine:PRAGMA table_info("traffic_violations")


2019-03-29 22:27:23,755 INFO sqlalchemy.engine.base.Engine ()


INFO:sqlalchemy.engine.base.Engine:()


2019-03-29 22:27:23,846 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)


INFO:sqlalchemy.engine.base.Engine:BEGIN (implicit)


2019-03-29 22:27:24,531 INFO sqlalchemy.engine.base.Engine INSERT INTO traffic_violations ("ID", "Year", "Month", "SubAgency", "PoliceDistrictID", "ViolationType", "ViolationCategory", "Gender", "VehicleGroup", "DL_State_type", "Belts", "PersonalInjury", "PropertyDamage", "Fatal", "CommercialVehicle", "Alcohol", "WorkZone", "ContributedToAccident", "ViolationCount") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)


INFO:sqlalchemy.engine.base.Engine:INSERT INTO traffic_violations ("ID", "Year", "Month", "SubAgency", "PoliceDistrictID", "ViolationType", "ViolationCategory", "Gender", "VehicleGroup", "DL_State_type", "Belts", "PersonalInjury", "PropertyDamage", "Fatal", "CommercialVehicle", "Alcohol", "WorkZone", "ContributedToAccident", "ViolationCount") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)






2019-03-29 22:27:24,779 INFO sqlalchemy.engine.base.Engine COMMIT


INFO:sqlalchemy.engine.base.Engine:COMMIT


### Summarize data by Year and Month - Option 1 - Not recommended

In [21]:
# Aggregate to one row per combination of Year, Month, SubAgency, PoliceDistrictID,
# ViolationType, ViolationCategory, Gender, VehicleType, DriverState and Geolocation,
# summing the flag columns and the violation counter.

# every source row counts as one violation
violationsDF['ViolationCount'] = 1

detailKeys = ['Year', 'Month', 'SubAgency', 'PoliceDistrictID', 'ViolationType',
              'ViolationCategory', 'Gender', 'VehicleType', 'DriverState', 'Geolocation']

detailMeasures = ['Belts', 'PersonalInjury', 'PropertyDamage', 'Fatal', 'CommercialVehicle',
                  'Alcohol', 'WorkZone', 'ContributedToAccident', 'ViolationCount']

vDF_grp = violationsDF[detailKeys + detailMeasures].groupby(detailKeys).agg(np.sum)
vDF_grp = vDF_grp.reset_index()

In [22]:
vDF_grp.count()

Year                     719809
Month                    719809
SubAgency                719809
PoliceDistrictID         719809
ViolationType            719809
ViolationCategory        719809
Gender                   719809
VehicleType              719809
DriverState              719809
Geolocation              719809
Belts                    719809
PersonalInjury           719809
PropertyDamage           719809
Fatal                    719809
CommercialVehicle        719809
Alcohol                  719809
WorkZone                 719809
ContributedToAccident    719809
ViolationCount           719809
dtype: int64

In [88]:
vDF_grp.head(3)

Unnamed: 0,Year,Month,SubAgency,PoliceDistrictID,ViolationType,ViolationCategory,Gender,VehicleType,DriverState,Geolocation,Belts,PersonalInjury,PropertyDamage,Fatal,CommercialVehicle,Alcohol,WorkZone,ContributedToAccident,ViolationCount
0,2012,1,"1st district, Rockville",1,Citation,Distraction,F,02 - Automobile,MD,"(39.05553475, -77.1073418666667)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2012,1,"1st district, Rockville",1,Citation,Distraction,F,02 - Automobile,MD,"(39.0691647, -77.1447812666667)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2012,1,"1st district, Rockville",1,Citation,Distraction,F,02 - Automobile,MD,"(39.1201810333333, -77.19798135)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [95]:
# Save the Geolocation-level summary as CSV in the import directory, without the index
vDF_grp.to_csv(os.path.join(importDir, "TrafficViolation_Grouped.csv"), index = False)

### Establish connection to sqlite

In [93]:

# Create the engine that connects to the sqlite database (create_engine);
# the connection URL is printed first, and echo = True logs every statement issued
print(f"sqlite:///{dbPath}/{dbName}")
engine = create_engine(f"sqlite:///{dbPath}/{dbName}", echo = True)

sqlite:///trafficViolations/db/trafficViolations.sqlite


In [94]:
# Write the Geolocation-level summary to the traffic_violations table, index stored as "ID".
# NOTE(review): if_exists = "replace" targets the same table name as the primary-key table
# created elsewhere in this notebook; the logged CREATE TABLE below shows "ID" BIGINT
# without PRIMARY KEY, which the table-creation cell says AUTOMAP_BASE requires.
# Confirm this cell is meant to run at all before executing the notebook top to bottom.
vDF_grp.to_sql("traffic_violations", engine, if_exists = "replace", index = True, index_label = 'ID')

2019-03-29 14:05:12,971 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2019-03-29 14:05:12,975 INFO sqlalchemy.engine.base.Engine ()
2019-03-29 14:05:12,980 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2019-03-29 14:05:12,983 INFO sqlalchemy.engine.base.Engine ()
2019-03-29 14:05:13,004 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("traffic_violations")
2019-03-29 14:05:13,007 INFO sqlalchemy.engine.base.Engine ()
2019-03-29 14:05:13,028 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE traffic_violations (
	"ID" BIGINT, 
	"Year" BIGINT, 
	"Month" BIGINT, 
	"SubAgency" TEXT, 
	"PoliceDistrictID" BIGINT, 
	"ViolationType" TEXT, 
	"ViolationCategory" TEXT, 
	"Gender" TEXT, 
	"VehicleType" TEXT, 
	"DriverState" TEXT, 
	"Geolocation" TEXT, 
	"Belts" FLOAT, 
	"PersonalInjury" FLOAT, 
	"PropertyDamage" FLOAT, 
	"Fatal" FLOAT, 
	"CommercialVehicle" FLOAT, 
	"Alcohol" FLOAT, 
	"WorkZone" FLO

### Create a json for Geolocation - Not viable

In [114]:
# Violation counts per Geolocation and Year
vDF_grp_geo = (
    violationsDF[['Geolocation', 'Year', 'ViolationCount']]
    .groupby(['Geolocation', 'Year'])
    .agg(np.sum)
    .reset_index()
)

In [115]:
vDF_grp_geo.count()

Geolocation       498246
Year              498246
ViolationCount    498246
dtype: int64

In [116]:
# Write the per-location, per-year counts to geoLoc.json in the database directory;
# orient = "records" emits a list with one object per row
vDF_grp_geo.to_json(os.path.join(dbPath, "geoLoc.json"), orient = "records")

### Read police district geojson

In [117]:
# Download the police district records as JSON through requests.get().
# The exception classes come from requests.exceptions: the builtin ConnectionError does not
# catch requests' connection failures, and HTTPError is not a builtin name at all.
try:
    # the timeout keeps the cell from hanging forever on an unresponsive server
    resp = requests.get(police_dist_URL, timeout = 30)

    #check if the status code is other than 200 (ie. not a successful request)
    if(resp.status_code != 200):
        raise requests.exceptions.HTTPError(f"status code {resp.status_code}", response = resp)

    # extract the JSON data
    dist_json = resp.json()

except requests.exceptions.ConnectionError as connErr:
    print(f"Error in Connection : {connErr}")
    # re-raise: the cells that follow cannot run without dist_json
    raise

except requests.exceptions.HTTPError as httpErr:
    print(f"Unsuccessful in obtaining JSON : {httpErr}")
    raise

In [130]:
# Flatten the downloaded JSON records into a DataFrame with json_normalize;
# nested keys become dotted column names (e.g. the_geom.coordinates in the output below)
dist_DF = json_normalize(dist_json)

dist_DF.head()
                

Unnamed: 0,dist,objectid,shape_area,shape_len,the_geom.coordinates,the_geom.type
0,2,2,1565751500.03924,248658.778723035,"[[[[-77.1457618498, 39.037860378201], [-77.145...",MultiPolygon
1,1,1,4182813700.92157,453658.955673688,"[[[[-77.187567669146, 39.110811579485], [-77.1...",MultiPolygon
2,4,4,2313490108.83212,418565.068792495,"[[[[-77.115896846336, 39.126723839894], [-77.1...",MultiPolygon
3,6,6,1133619164.7974,337359.39232825703,"[[[[-77.246867924553, 39.175370078619], [-77.2...",MultiPolygon
4,8,7,58130070.0200888,44856.0606935784,"[[[[-77.001499589417, 38.989799321823], [-77.0...",MultiPolygon


In [131]:
# Only keep the columns needed from the police district JSON, rename the district number to
# PoliceDistrictID (the merge key used for the violations summary) and store it as an integer.
# The chain builds a new frame, so nothing is modified in place on a column slice
# (which is what triggers pandas' SettingWithCopyWarning).
dist_DF = (
    dist_DF[['dist', 'the_geom.coordinates', 'the_geom.type']]
    .rename(columns = {"dist" : "PoliceDistrictID"})
    .astype({"PoliceDistrictID" : int})
)

In [123]:
vDF_grp_alt.columns

Index(['Year', 'Month', 'SubAgency', 'PoliceDistrictID', 'ViolationType',
       'ViolationCategory', 'Gender', 'VehicleGroup', 'DL_State_type', 'Belts',
       'PersonalInjury', 'PropertyDamage', 'Fatal', 'CommercialVehicle',
       'Alcohol', 'WorkZone', 'ContributedToAccident', 'ViolationCount'],
      dtype='object')

In [124]:
# Total violation count over all years for each SubAgency / police district
tv_overall = (
    vDF_grp_alt[['SubAgency', 'PoliceDistrictID', 'ViolationCount']]
    .groupby(['SubAgency', 'PoliceDistrictID'])
    .agg(np.sum)
    .reset_index()
)

In [128]:
tv_overall.dtypes

SubAgency           object
PoliceDistrictID     int64
ViolationCount       int64
dtype: object

In [132]:
# Attach each district's geometry; the inner join keeps only districts present in both frames
tv_overall = tv_overall.merge(dist_DF, how = "inner", on = "PoliceDistrictID")

tv_overall

Unnamed: 0,SubAgency,PoliceDistrictID,ViolationCount,the_geom.coordinates,the_geom.type
0,"1st district, Rockville",1,108585,"[[[[-77.187567669146, 39.110811579485], [-77.1...",MultiPolygon
1,"2nd district, Bethesda",2,132846,"[[[[-77.1457618498, 39.037860378201], [-77.145...",MultiPolygon
2,"3rd district, Silver Spring",3,195723,"[[[[-77.05481528585, 39.013802780305], [-77.05...",MultiPolygon
3,"4th district, Wheaton",4,226657,"[[[[-77.115896846336, 39.126723839894], [-77.1...",MultiPolygon
4,"5th district, Germantown",5,98148,"[[[[-77.255447065893, 39.313587822258], [-77.2...",MultiPolygon
5,"6th district, Gaithersburg / Montgomery Village",6,125278,"[[[[-77.246867924553, 39.175370078619], [-77.2...",MultiPolygon
6,Headquarters and Special Operations,8,31595,"[[[[-77.001499589417, 38.989799321823], [-77.0...",MultiPolygon


In [163]:
# Convert the id and count columns to strings for easy JSON creation
tv_overall = tv_overall.astype({"PoliceDistrictID" : str, "ViolationCount" : str})

tv_overall.dtypes

SubAgency               object
PoliceDistrictID        object
ViolationCount          object
the_geom.coordinates    object
the_geom.type           object
dtype: object

In [164]:
# Skeleton of the GeoJSON document: an empty FeatureCollection whose features are added later
tv_map_json = dict(type = "FeatureCollection", features = [])

In [165]:
# function that constructs the feature details
def genFeatureDict(info):
    """Build one GeoJSON "Feature" dict from a record.

    ``info`` is indexed by key and must provide 'the_geom.type',
    'the_geom.coordinates', 'SubAgency', 'PoliceDistrictID' and 'ViolationCount'.
    The geometry is taken from the two 'the_geom.*' entries; the remaining three
    become the properties "name", "distID" and "total_traffic_violations".
    """
    geometry = {
        "type": info['the_geom.type'],
        "coordinates": info['the_geom.coordinates'],
    }
    properties = {
        "name" : info['SubAgency'],
        "distID" : info['PoliceDistrictID'],
        "total_traffic_violations" : info['ViolationCount'],
    }
    return {"type" : "Feature", "geometry" : geometry, "properties" : properties}
    
# Construct the GeoJSON features, one per row of tv_overall.
# The list is assigned rather than appended to, so re-running this cell cannot
# add the same districts a second time.
tv_map_json['features'] = [genFeatureDict(row) for _, row in tv_overall.iterrows()]

In [166]:
tv_map_json

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'geometry': {'type': 'MultiPolygon',
    'coordinates': [[[[-77.187567669146, 39.110811579485],
       [-77.187498140679, 39.111007489072],
       [-77.187498181099, 39.111007470991],
       [-77.187503853674, 39.111004858608],
       [-77.187507267379, 39.111003285757],
       [-77.187515330709, 39.111011064754],
       [-77.187509902889, 39.111013098434],
       [-77.187508031625, 39.111013799547],
       [-77.187502564532, 39.111015848602],
       [-77.187500643691, 39.111012217858],
       [-77.187498169729, 39.111007543974],
       [-77.187498136125, 39.111007516103],
       [-77.187498092342, 39.111007574725],
       [-77.187453499362, 39.111086903664],
       [-77.187450258572, 39.111092667648],
       [-77.18744086075, 39.111109386175],
       [-77.187151453291, 39.111847693036],
       [-77.186987198582, 39.112235390086],
       [-77.18659165912, 39.113168975553],
       [-77.186551379322, 39.113262530732],
   

In [167]:
# Serialize the GeoJSON document and write it to data_file.json in the working directory
with open("data_file.json", mode = "w") as geojsonFile:
    geojsonFile.write(json.dumps(tv_map_json))