In [1]:
import os

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func, distinct # library to use aggregate functions
from sqlalchemy.sql import operators

In [2]:
# Declare global variables
# Directory that holds the SQLite file. It is written as a relative path, so it is
# presumably resolved against the notebook's working directory — TODO confirm.
dbPath = "trafficViolations/static/db"

# File name of the SQLite database inside dbPath.
dbName = "trafficViolations.sqlite"

In [3]:
# create the connection to SQLite db
# The URL is assembled from the dbPath and dbName globals.
eng = create_engine(f"sqlite:///{dbPath}/{dbName}")

In [4]:
# reflect an existing database into a new model
Base  = automap_base()

# prepare and reflect all tables with data
# NOTE(review): the `reflect` flag is deprecated in SQLAlchemy 1.4+ in favour of
# `Base.prepare(autoload_with=eng)` — confirm the installed version before switching.
Base.prepare(eng, reflect = True)

In [5]:
# Show the names of the classes that automap generated from the database.
print(Base.classes.keys())

['traffic_violations']


In [6]:
# Short alias for the class mapped to the traffic_violations table; the query
# functions in this notebook read their columns from it.
V = Base.classes['traffic_violations']

In [7]:
# create session
# Module-level session bound to the engine; the query functions in this
# notebook all use this single shared instance.
session = Session(bind = eng)

### Data extraction queries


In [8]:
def getYears():
    """Return the distinct values of the Year column.

    Returns:
        pandas.DataFrame: a single column 'Year', one row per distinct value.
    """
    year_rows = session.query(distinct(V.Year)).all()
    return pd.DataFrame(year_rows, columns=['Year'])

#getYears()

In [9]:
def getMonths():
    """Return the distinct values of the Month column.

    Returns:
        pandas.DataFrame: a single column 'Month', one row per distinct value.
    """
    month_rows = session.query(distinct(V.Month)).all()
    return pd.DataFrame(month_rows, columns=['Month'])

#getMonths()

In [10]:
def getQtrs():
    """Return the distinct values of the Qtr column.

    Returns:
        pandas.DataFrame: a single column 'Qtr', one row per distinct value.
    """
    quarter_rows = session.query(distinct(V.Qtr)).all()
    return pd.DataFrame(quarter_rows, columns=['Qtr'])

#getQtrs()

In [11]:
def getPoliceDist():
    """Return the distinct (SubAgency, PoliceDistrictID) pairs.

    Returns:
        pandas.DataFrame: columns 'SubAgency' and 'PoliceDistrictID',
        one row per distinct pair.
    """
    pair_query = session.query(V.SubAgency, V.PoliceDistrictID).distinct()
    return pd.DataFrame(
        pair_query.all(),
        columns=['SubAgency', 'PoliceDistrictID'],
    )

# getPoliceDist()

In [12]:
def getVioCat():
    """Return the distinct values of the ViolationCategory column.

    Returns:
        pandas.DataFrame: a single column 'ViolationCategory',
        one row per distinct value.
    """
    category_rows = session.query(distinct(V.ViolationCategory)).all()
    return pd.DataFrame(category_rows, columns=['ViolationCategory'])

# getVioCat()

In [13]:
def getVioType():
    """Return the distinct values of the ViolationType column.

    Returns:
        pandas.DataFrame: a single column 'ViolationType',
        one row per distinct value.
    """
    type_rows = session.query(distinct(V.ViolationType)).all()
    return pd.DataFrame(type_rows, columns=['ViolationType'])

# getVioType()

In [14]:
def getVehGrp():
    """Return the distinct values of the VehicleGroup column.

    Returns:
        pandas.DataFrame: a single column 'VehicleGroup',
        one row per distinct value.
    """
    group_rows = session.query(distinct(V.VehicleGroup)).all()
    return pd.DataFrame(group_rows, columns=['VehicleGroup'])

# getVehGrp()

In [15]:
def summarize_YR_QTR():
    """Sum ViolationCount for every (Year, Qtr) combination.

    Returns:
        pandas.DataFrame: columns 'Year', 'Qtr', 'Total_ViolationCount',
        one row per (Year, Qtr) group.
    """
    totals_query = (
        session.query(V.Year, V.Qtr, func.sum(V.ViolationCount))
        .group_by(V.Qtr)
        .group_by(V.Year)
    )
    return pd.DataFrame(
        totals_query.all(),
        columns=['Year', 'Qtr', 'Total_ViolationCount'],
    )

# summarize_YR_QTR()

In [18]:
def violation_YOY_Change():
    """Quarterly violation totals with their year-over-year percentage change.

    Each (Year, Qtr) total from summarize_YR_QTR() is compared with the total
    of the same quarter in the preceding year row of the data.

    Returns:
        pandas.DataFrame: columns 'Year', 'Qtr', 'Total_ViolationCount',
        'YOY_Change_PCT'. A (Year, Qtr) row is omitted when no prior-year
        value exists for that quarter (e.g. every quarter of the first year).
    """
    totals = summarize_YR_QTR()
    if totals.empty:
        # Nothing to compare; return the expected schema instead of failing.
        return totals.assign(YOY_Change_PCT=pd.Series(dtype='float64'))

    # Years as rows and quarters as columns, so pct_change() walks down the years.
    # The string "sum" avoids the FutureWarning pandas >= 2.1 emits for np.sum.
    by_year = pd.pivot_table(
        totals,
        values="Total_ViolationCount",
        index=['Year'],
        columns=["Qtr"],
        aggfunc="sum",
    )

    # fill_method=None: never forward-fill a missing quarter, which would
    # silently compare against a stale year.
    yoy_wide = by_year.pct_change(fill_method=None)

    # Go to long form first and only then drop NaN, so one quarter without a
    # prior-year value does not discard the other quarters of that year.
    yoy_long = (
        yoy_wide.unstack()
        .dropna()
        .rename('YOY_Change_PCT')
        .reset_index()
    )

    # Inner merge: keeps only the (Year, Qtr) rows that have a computed change.
    return pd.merge(totals, yoy_long, on=['Year', 'Qtr'])
    
    
#violation_YOY_Change() 

Unnamed: 0,Year,Qtr,Total_ViolationCount,YOY_Change_PCT
0,2013,1,29480,0.266976
1,2014,1,34673,0.176153
2,2015,1,36555,0.054279
3,2016,1,39279,0.074518
4,2013,2,32007,0.123131
5,2014,2,40661,0.270378
6,2015,2,41111,0.011067
7,2016,2,40576,-0.013014
8,2013,3,35095,0.359639
9,2014,3,37998,0.082718


In [27]:
def dist_Contrib_YOY():
    """Per-district quarterly totals with each district's contribution to YOY change.

    For every (SubAgency, Year, Qtr):
        Contrib_pct = (district total - district total in the preceding year
                       column, same quarter)
                      / (total over all districts for Year - 1, same quarter)

    Returns:
        pandas.DataFrame: columns 'SubAgency', 'Year', 'Qtr',
        'Total_ViolationCount', 'Contrib_pct'. Rows for which the difference
        or the Year - 1 total is unavailable (e.g. the first year) are omitted.
    """
    key_cols = ['SubAgency', 'Qtr', 'Year']

    per_dist = pd.DataFrame(
        session.query(V.SubAgency, V.Year, V.Qtr, func.sum(V.ViolationCount))
        .group_by(V.SubAgency)
        .group_by(V.Qtr)
        .group_by(V.Year)
        .all(),
        columns=['SubAgency', 'Year', 'Qtr', 'Total_ViolationCount'],
    )
    if per_dist.empty:
        # Nothing to compare; return the expected schema instead of failing.
        return per_dist.assign(Contrib_pct=pd.Series(dtype='float64'))

    # One column per year so diff(axis=1) yields the change versus the previous
    # year column. The query already groups by these keys, so every cell holds
    # a single value and the default aggregation returns it unchanged.
    wide = pd.pivot_table(
        per_dist,
        values="Total_ViolationCount",
        index=['SubAgency', 'Qtr'],
        columns=['Year'],
    )
    yoy_diff = wide.diff(axis=1)

    # Long form: one row per (SubAgency, Qtr, Year) with a defined difference.
    diff_long = (
        yoy_diff.T.unstack()
        .dropna()
        .rename('YOY_Diff')
        .reset_index()
    )

    # Overall totals shifted forward one year, so joining on (Year, Qtr)
    # attaches the previous year's total for the same quarter.
    totals = summarize_YR_QTR()
    prev_totals = pd.DataFrame({
        'Year': totals['Year'] + 1,
        'Qtr': totals['Qtr'],
        'Prev_Total': totals['Total_ViolationCount'],
    })

    # Inner join: rows without a Year - 1 total are skipped rather than
    # raising, as the row-by-row lookup did.
    contrib = diff_long.merge(prev_totals, on=['Year', 'Qtr'])
    contrib['Contrib_pct'] = contrib['YOY_Diff'] / contrib['Prev_Total']

    # Attach the contribution to the per-district totals.
    return pd.merge(per_dist, contrib[key_cols + ['Contrib_pct']], on=key_cols)
    
#dist_Contrib_YOY()


Unnamed: 0,SubAgency,Year,Qtr,Total_ViolationCount,Contrib_pct
0,"1st district, Rockville",2013,1,4258,0.042720
1,"1st district, Rockville",2014,1,4024,-0.007938
2,"1st district, Rockville",2015,1,3911,-0.003259
3,"1st district, Rockville",2016,1,4000,0.002435
4,"1st district, Rockville",2013,2,3498,-0.027125
5,"1st district, Rockville",2014,2,5433,0.060456
6,"1st district, Rockville",2015,2,5260,-0.004255
7,"1st district, Rockville",2016,2,4871,-0.009462
8,"1st district, Rockville",2013,3,4265,0.027623
9,"1st district, Rockville",2014,3,5060,0.022653


### Data extraction for Dynamic charts

In [38]:
def filterData_main(yr = 0, cat = "all", dist = 0):
    """Fetch violation rows, optionally narrowed by year, category and district.

    Args:
        yr: value to match against Year; 0 (default) applies no year filter.
        cat: value to match against ViolationCategory; "all" (default)
            applies no category filter.
        dist: value to match against PoliceDistrictID; 0 (default) applies
            no district filter.

    Returns:
        pandas.DataFrame: one row per matching record with the columns listed
        in `column_names` below. Only rows whose Qtr is 1, 2, 3 or 4 are
        returned.
    """
    # Declared once: drives both the SELECT list and the DataFrame header.
    column_names = ["Year", "Qtr", "Month", "SubAgency", "PoliceDistrictID", "ViolationType",
                    "ViolationCategory", "VehicleGroup", "PersonalInjury", "PropertyDamage", "Fatal",
                    "ContributedToAccident", "ViolationCount"]

    # Always-on conditions: a constant-true placeholder and the quarter check.
    conditions = [True, V.Qtr.in_([1, 2, 3, 4])]

    # (requested value, "no filter" sentinel, column) for each optional filter.
    optional_filters = (
        (yr, 0, V.Year),
        (cat, "all", V.ViolationCategory),
        (dist, 0, V.PoliceDistrictID),
    )
    for requested, no_filter, column in optional_filters:
        if requested != no_filter:
            conditions.append(column.in_([requested]))

    selected = [getattr(V, name) for name in column_names]
    rows = session.query(*selected).filter(*conditions).all()

    return pd.DataFrame(rows, columns=column_names)

In [40]:
# Example: rows for year 2012, category 'Distraction', police district id 2.
filterData_main(2012, 'Distraction', 2)

Unnamed: 0,Year,Qtr,Month,SubAgency,PoliceDistrictID,ViolationType,ViolationCategory,VehicleGroup,PersonalInjury,PropertyDamage,Fatal,ContributedToAccident,ViolationCount
0,2012,1,1,"2nd district, Bethesda",2,Citation,Distraction,Automobile,0.0,0.0,0.0,0.0,6
1,2012,1,1,"2nd district, Bethesda",2,Citation,Distraction,Truck,0.0,0.0,0.0,0.0,1
2,2012,1,1,"2nd district, Bethesda",2,Warning,Distraction,Automobile,0.0,0.0,0.0,0.0,4
3,2012,1,1,"2nd district, Bethesda",2,Warning,Distraction,Truck,0.0,0.0,0.0,0.0,2
4,2012,1,2,"2nd district, Bethesda",2,Citation,Distraction,Automobile,0.0,0.0,0.0,0.0,13
5,2012,1,2,"2nd district, Bethesda",2,Citation,Distraction,Truck,0.0,0.0,0.0,0.0,1
6,2012,1,2,"2nd district, Bethesda",2,Warning,Distraction,Automobile,0.0,0.0,0.0,0.0,5
7,2012,1,2,"2nd district, Bethesda",2,Warning,Distraction,Truck,0.0,0.0,0.0,0.0,1
8,2012,1,3,"2nd district, Bethesda",2,Citation,Distraction,Automobile,0.0,0.0,0.0,0.0,11
9,2012,1,3,"2nd district, Bethesda",2,Citation,Distraction,Truck,0.0,0.0,0.0,0.0,2


In [51]:
def getViolation_ByDist(yr, cat, dist):
    """Total violation count per police district for the given filters.

    Args:
        yr: year passed to filterData_main (0 means no year filter).
        cat: violation category passed to filterData_main ("all" means no
            category filter).
        dist: police district id passed to filterData_main (0 means no
            district filter).

    Returns:
        pandas.DataFrame: columns 'SubAgency', 'PoliceDistrictID',
        'ViolationCount', one row per (SubAgency, PoliceDistrictID) group.
    """
    filtered = filterData_main(yr, cat, dist)

    # .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
    return (
        filtered.groupby(['SubAgency', 'PoliceDistrictID'])['ViolationCount']
        .sum()
        .reset_index()
    )

# getViolation_ByDist(0,"all",0)

In [52]:
def getViolation_ByCat(yr, cat, dist):
    """Total violation count per violation category for the given filters.

    Args:
        yr: year passed to filterData_main (0 means no year filter).
        cat: violation category passed to filterData_main ("all" means no
            category filter).
        dist: police district id passed to filterData_main (0 means no
            district filter).

    Returns:
        pandas.DataFrame: columns 'ViolationCategory', 'ViolationCount',
        one row per category.
    """
    filtered = filterData_main(yr, cat, dist)

    # .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
    return (
        filtered.groupby(['ViolationCategory'])['ViolationCount']
        .sum()
        .reset_index()
    )

# getViolation_ByCat(0,"all",0)

Unnamed: 0,ViolationCategory,ViolationCount
0,Distraction,17562
1,Impaired,27126
2,Offense,196748
3,Other,102039
4,Safety,70398
5,Violation,282545


In [53]:
def getViolation_ByType(yr, cat, dist):
    """Total violation count per violation type for the given filters.

    Args:
        yr: year passed to filterData_main (0 means no year filter).
        cat: violation category passed to filterData_main ("all" means no
            category filter).
        dist: police district id passed to filterData_main (0 means no
            district filter).

    Returns:
        pandas.DataFrame: columns 'ViolationType', 'ViolationCount',
        one row per violation type.
    """
    filtered = filterData_main(yr, cat, dist)

    # .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
    return (
        filtered.groupby(['ViolationType'])['ViolationCount']
        .sum()
        .reset_index()
    )

# Example: totals per violation type with no year, category or district filter.
getViolation_ByType(0,"all",0)

Unnamed: 0,ViolationType,ViolationCount
0,Citation,359997
1,ESERO,31120
2,SERO,832
3,Warning,304469


In [None]:
def getViolation_Impact(yr, cat, dist):
    """Total violation count per violation type for the given filters.

    NOTE(review): despite its name, this function aggregates by ViolationType
    exactly like getViolation_ByType; none of the PersonalInjury,
    PropertyDamage, Fatal or ContributedToAccident columns are used. It looks
    like an unfinished copy — confirm the intended "impact" aggregation.

    Args:
        yr: year passed to filterData_main (0 means no year filter).
        cat: violation category passed to filterData_main ("all" means no
            category filter).
        dist: police district id passed to filterData_main (0 means no
            district filter).

    Returns:
        pandas.DataFrame: columns 'ViolationType', 'ViolationCount',
        one row per violation type.
    """
    filtered = filterData_main(yr, cat, dist)

    # .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
    return (
        filtered.groupby(['ViolationType'])['ViolationCount']
        .sum()
        .reset_index()
    )

# NOTE(review): this cell defines getViolation_Impact but calls
# getViolation_ByType — presumably a copy-paste leftover; confirm the intended call.
getViolation_ByType(0,"all",0)