In [340]:
import os
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import re

### Setup

In [341]:
MONTH = 'April 2019'
FOLDER = 'Z:/Data Projects/MOCJ Test ' + MONTH

In [342]:
CRIM_NAMES = ["Case#", "Open Date", "Closed Date", "Court", "Latest Event",
              "Intake Type", "TC C/L", "Top Charge", "Dispo"]
POST_DISPO_NAMES = ["Case#", "Open Date", "Closed Date", "Court", "Latest Event",
                    "Intake Type", "TC C/L", "Top Charge", "Dispo", "Case Type"]

MOCJ_CASE_TYPES = CategoricalDtype(categories=["Homicide",
                                               "Violent Felony",
                                               "Non-Violent Felony",
                                               "Misdemeanor",
                                               "Violation & Other"],
                                   ordered=True)
MOCJ_INTAKE_TYPES = CategoricalDtype(categories=["Primary Case", "Conflict Case", "Post Arraignment"],
                                     ordered=True)
MOCJ_OUTFLOW_TYPES = CategoricalDtype(categories=["Post-Arraignment Shift Transfer",
                                                  "Post-Arraignment Disposition",
                                                  "Arraignment Shift Disposition"],
                                      ordered=True)

Functions for classifying top count and intake type, to translate from PDCMS categories to MOCJ categories.

In [343]:
def classify_intake_type(intake_type):
    """Classifies intake type into MOCJ-requested categories, which are:
    New Cases - Primary Case at Arraignment Shift
    New Cases - Conflict Case at Arraignment Shift
    New Cases - Post Arraignment Shift Other Intake
    """
    intake_type = str(intake_type).strip().lower()
    if (intake_type in ["arraignment", "row"]):
        return ("Primary Case")
    elif (intake_type in ["conflict row", "conflict arraignment"]):
        return ("Conflict Case")
    elif (intake_type.startswith("transfer")) | (intake_type == "assignment"):
        return ("Post Arraignment")
    else:
        return ("DANGER")

In [344]:
def classify_tc(tc):
    """Classifies top charge class and levels into MOCJ-requested categories.
    """
    tc = str(tc) #coerce TC type to string
    if tc == "H":
        return ("Homicide")
    elif ("FL" in tc) or ("FV" in tc):
        return ("Violent Felony")
    elif (tc.endswith("F") or tc == "F"):
        return ("Non-Violent Felony")
    elif (tc.endswith("M") or tc == "M"):
        return ("Misdemeanor")
    else:
        return ("Violation & Other")

In [345]:
def categorize_disposition(row):
    if (row["Open Date"] == row["Closed Date"]):
        return "Arraignment Shift Disposition"
    elif (row["Dispo"] in ["RPC", "RLAS", "R18B", "RNDS", "ADBX", "ADBR", "ADQU", "ADRI"]):
        return "Post-Arraignment Shift Transfer"
    else:
        return "Post-Arraignment Shift Disposition"

## Criminal Cases
Criminal cases opened and closed in the month

In [346]:
crim_cases_opened = pd.read_csv("{}/crim_cases_opened.csv".format(FOLDER),
                                names=CRIM_NAMES,
                                header=0,
                               index_col=False)
crim_cases_closed = pd.read_csv("{}/crim_cases_closed.csv".format(FOLDER),
                                names=CRIM_NAMES,
                                header=0,
                               index_col=False)


crim_cases = pd.concat([crim_cases_opened, crim_cases_closed])
print(crim_cases.shape)

crim_cases = crim_cases.drop_duplicates().reset_index(drop=True)
print(crim_cases.shape)

(1381, 9)
(1110, 9)


In [347]:
crim_cases["Case#"].value_counts()
crim_cases[crim_cases["Case#"] == "21-722054"]

Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo
155,21-722054,4/13/2019,4/13/2019,AR4,"04/13/2019, ARR, SENTENCED",ROW,0,OTHER (UNK),PLSE


In [348]:
crim_cases_pending_1st = pd.read_csv("{}/crim_cases_pending_1st.csv".format(FOLDER),
                                     names=CRIM_NAMES, header=0, index_col=False)
crim_cases_pending_end = pd.read_csv("{}/crim_cases_pending_last.csv".format(FOLDER),
                                      names=CRIM_NAMES, header=0, index_col=False)

crim_cases = pd.concat([crim_cases,
                        crim_cases_pending_1st,
                        crim_cases_pending_end])
print(crim_cases.shape)

(5908, 9)


In [349]:
crim_cases = crim_cases.dropna(how="all")
print(crim_cases.shape)

crim_cases = crim_cases.drop_duplicates(subset=["Case#", "Open Date", "Closed Date", "Intake Type", "Dispo"],
                                        keep='last').reset_index(drop = True)
print(crim_cases.shape)

# Checking whether a known duplicate has been de-duplicated
crim_cases[crim_cases["Case#"] == '21-709207']

(5908, 9)
(3156, 9)


Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo
1087,21-709207,3/9/2018,,51,05/02/19 - NCD,Arraignment,FL,1252501 - MURDER 2 (AFL),


In [350]:
crim_cases["Open Date"] = pd.to_datetime(crim_cases["Open Date"])
crim_cases["Closed Date"] = pd.to_datetime(crim_cases["Closed Date"])

### Cleaning cases
We don't want non-criminal IDV, VOP, or VOCD cases in these counts, so we're getting rid of cases where the top charge is either VISITATION or FAMILY OFFENSE.

Some top charges have more than one class and level, so we consolidate those.

In [351]:
# Getting rid of IDV/Visitation cases
crim_cases = crim_cases[~crim_cases['Top Charge'].str.contains("VISITATION", na=False)]
crim_cases = crim_cases[~crim_cases['Top Charge'].str.contains('FAMILY OFFENSE', na=False)]
crim_cases = crim_cases.loc[~((crim_cases["Court"] == "IDV")
                              & (crim_cases["Intake Type"] == "Assignment"))]
crim_cases = crim_cases.loc[~((crim_cases["Court"] == "IDV")
                              & (crim_cases["Intake Type"].str.startswith("Trans")))]

In [352]:
# Re-code homicide cases to a separate Homicide class, but don't include attempted homicides
# Finds top charges that have "MUR" but don't have "ATT"
crim_cases.loc[crim_cases["Top Charge"].str.contains('MUR', na=False) \
                & ~(crim_cases["Top Charge"].str.contains("ATT", na=False)), "TC C/L"] = "H"
crim_cases.loc[crim_cases["Top Charge"].str.contains("- MUR", na=False), "TC C/L"] = "H"

In [353]:
crim_cases[crim_cases["TC C/L"] == "H"]

Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo
758,21-118538,2016-05-07,NaT,61,03/11/19 - NCD,Arraignment,H,1252501 - MURDER 2 (AFL),
1087,21-709207,2018-03-09,NaT,51,05/02/19 - NCD,Arraignment,H,1252501 - MURDER 2 (AFL),
2634,21-721516,2019-03-26,NaT,71,06/03/19 - NCD / Indicted,Assignment,H,1252501 - MURDER 2 (AFL),
2850,21-721905,2019-04-10,NaT,F,05/16/19 - NCD,Arraignment,H,1252501 - MURDER 2 (AFL),


In [354]:
# Re-code TC C/L for extradition cases
crim_cases.loc[(crim_cases["Top Charge"].str.contains("FUG", na=False)), 'TC C/L'] = "M"

In [355]:
# AC Misdemeanors don't get counted correctly
crim_cases.loc[(crim_cases["Top Charge"].str.contains("AC - M", na=False)), 'TC C/L'] = "M"

We also check to see if any VOCDs or VOPs have snuck in.

In [356]:
# Check for any VOCD or VOP cases, then fix in PDCMS until there are no more cases returned
crim_cases[(crim_cases['Top Charge'].str.contains("VIO", na=False))
           & (crim_cases['Top Charge'].str.contains('COND', na=False))]

Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo
68,21-721837,2019-04-06,2019-04-06,AR4,"04/06/2019, ARR, DISPOSED",ROW,UV,VIOLATION OF COND DISC,ACD
139,21-722075,2019-04-13,2019-04-13,AR4,"04/13/2019, ARR, SENTENCED",ROW,UV,VIOLATION OF COND DISC,PLSE
245,21-722337,2019-04-24,2019-04-24,AR1,"04/24/2019, ARR, SENTENCED",ROW,UV,VIOLATION OF COND DISC,PLSE
247,21-722343,2019-04-24,2019-04-24,AR1,"04/24/2019, ARR, SENTENCED",ROW,UV,VIOLATION OF COND DISC,PLSE
2825,21-721849,2019-04-06,NaT,A,05/15/19 - NCD / New Atty Assigned,ROW,V,410.90 - VIOLATION OF COND DISC (UV),
2826,21-721850,2019-04-06,NaT,A,05/15/19 - NCD / New Atty Assigned,ROW,V,410.90 - VIOLATION OF COND DISC (UV),
2827,21-721851,2019-04-06,NaT,A,05/15/19 - NCD / New Atty Assigned,ROW,V,410.90 - VIOLATION OF COND DISC (UV),
3035,21-722202,2019-04-18,NaT,B,06/11/19 - NCD,ROW,V,410.90 - VIOLATION OF COND DISC (UV),


In [357]:
# Check for any VOCD or VOP cases, then fix in PDCMS until there are no more cases returned
crim_cases[(crim_cases['Top Charge'].str.contains("VOP", na=False))]

Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo
2989,21-722135,2019-04-16,NaT,54,06/06/19 - NCD,,F,4104000 - VOP (F) (EF),


### Determining Start and End

Because this is now a combined dataset, we need to find out whether the cases were opened and/or closed in the month.

In [358]:
crim_cases[crim_cases["Open Date"] < pd.to_datetime('2019-04-01')]["Open Date"].describe()

count                    2407
unique                    536
top       2019-03-21 00:00:00
freq                       51
first     1997-10-29 00:00:00
last      2019-03-30 00:00:00
Name: Open Date, dtype: object

In [359]:
def check_in_month(dt):
    """Check if a date is within the same month and year
    """
    
    month = pd.datetime.today().month -1
    year = pd.datetime.today().year
    return True if (dt.year == year) and (dt.month == month) else False

crim_cases["opened_in_month"] = crim_cases["Open Date"].apply(lambda x: check_in_month(x))
crim_cases["closed_in_month"] = crim_cases["Closed Date"].apply(lambda x: check_in_month(x))

In [360]:
# Because the dataset is only cases that either opened or closed in the month
# we know that any case not opened in this month will have been pending as of the 1st,
# and we know that any case not closed in this month will have been pending as of the
# last of the month.

crim_cases["pending_first"] = crim_cases["Open Date"].apply(lambda x: True if x < pd.to_datetime('2019-04-01') else False)

In [361]:
crim_cases["pending_last"] = crim_cases["Closed Date"].apply(
                            lambda x: True if ((x > pd.to_datetime('2019-04-30')) | pd.isnull(x)) else False)

In [362]:
crim_cases[crim_cases.pending_first == True][["Open Date", "Closed Date"]].describe()

Unnamed: 0,Open Date,Closed Date
count,2407,411
unique,536,24
top,2019-03-21 00:00:00,2019-04-16 00:00:00
freq,51,32
first,1997-10-29 00:00:00,2019-04-01 00:00:00
last,2019-03-30 00:00:00,2019-05-07 00:00:00


In [363]:
crim_cases[crim_cases.pending_last == True][["Closed Date", "Open Date"]].describe()

Unnamed: 0,Closed Date,Open Date
count,3,2433
unique,2,532
top,2019-05-07 00:00:00,2019-03-21 00:00:00
freq,2,43
first,2019-05-03 00:00:00,1997-10-29 00:00:00
last,2019-05-07 00:00:00,2019-04-25 00:00:00


### Classifying Top Charges
Now we translate the top charge type to MOCJ categories and check that everything worked OK.

In [364]:
#crim_cases["TC C/L"].value_counts()

In [365]:
crim_cases["TC_Type"] = crim_cases["TC C/L"].apply(lambda x: classify_tc(x))
crim_cases.groupby(["TC_Type", "TC C/L"]).size()

TC_Type             TC C/L
Homicide            H            4
Misdemeanor         AM         156
                    BM          13
                    M         1343
                    UM          16
Non-Violent Felony  AF           1
                    BF           2
                    DF           5
                    EF           9
                    F          904
Violation & Other   /I          14
                    0          156
                    0V           5
                    UV           4
                    V            5
Violent Felony      AFL          1
                    CFV          6
                    FL          20
                    FV         446
dtype: int64

In [366]:

crim_cases["TC_Type"] = crim_cases.TC_Type.astype(MOCJ_CASE_TYPES)

### Classifying Intake Type

Now we translate the intake type to MOCJ categories and check that everything is OK.

We are assuming that everything with a PDCMS intake type of "Assignment" and a court other than IDV is probably actually a transfer.

In [367]:
#crim_cases_closed["intake type"] = crim_cases_closed["TC C/L"].apply(lambda x: classify_tc(x))
crim_cases["intake type"] = crim_cases["Intake Type"].apply(lambda x: classify_intake_type(x))

crim_cases["intake type"].value_counts()

Primary Case        2398
Conflict Case        570
Post Arraignment     143
DANGER                 2
Name: intake type, dtype: int64

In [368]:
crim_cases[crim_cases["intake type"] == "DANGER"]

Unnamed: 0,Case#,Open Date,Closed Date,Court,Latest Event,Intake Type,TC C/L,Top Charge,Dispo,opened_in_month,closed_in_month,pending_first,pending_last,TC_Type,intake type
391,21-712635,2018-05-30,2019-04-16,D,04/16/19 - NCD / Disposed,,M,1356003 - COERCION-2ND (AM),DISM,False,True,True,False,Misdemeanor,DANGER
2989,21-722135,2019-04-16,NaT,54,06/06/19 - NCD,,F,4104000 - VOP (F) (EF),,True,False,False,True,Non-Violent Felony,DANGER


In [369]:
# Make intake type into a Pandas categorical type, so pivot tables are sorted
crim_cases["intake type"] = crim_cases["intake type"].astype(MOCJ_INTAKE_TYPES)

### Classifying Outflow Type

Translate the disposition to MOCJ categories.

In [370]:
crim_cases["MOCJ_Dispo"] = crim_cases.apply(lambda x: categorize_disposition(x), axis=1)
crim_cases.groupby(["MOCJ_Dispo", "Dispo"]).size()

MOCJ_Dispo                          Dispo
Arraignment Shift Disposition       ACD       30
                                    ADBR       8
                                    ADBX      15
                                    ADQU       7
                                    ADRI       1
                                    DISM      36
                                    DISS       6
                                    EXH        1
                                    MACD       1
                                    PLEA       1
                                    PLSE     124
                                    R18B       1
Post-Arraignment Shift Disposition  730        7
                                    ACD       32
                                    ACDD       1
                                    ACQ        1
                                    CLOS       1
                                    CLSS       2
                                    COV        1
                           

In [371]:
crim_cases.groupby(["MOCJ_Dispo"]).size()

MOCJ_Dispo
Arraignment Shift Disposition          232
Post-Arraignment Shift Disposition    2796
Post-Arraignment Shift Transfer         85
dtype: int64

### Final Aggregate Counts

In [372]:
crim_cases[crim_cases.opened_in_month == True].pivot_table(values=["Case#"],
                                                                index=["TC_Type", "intake type"],
                                                                aggfunc=np.count_nonzero)

Unnamed: 0_level_0,Unnamed: 1_level_0,Case#
TC_Type,intake type,Unnamed: 2_level_1
Homicide,Primary Case,1.0
Violent Felony,Primary Case,41.0
Violent Felony,Conflict Case,15.0
Violent Felony,Post Arraignment,2.0
Non-Violent Felony,Primary Case,80.0
Non-Violent Felony,Conflict Case,21.0
Non-Violent Felony,Post Arraignment,1.0
Misdemeanor,Primary Case,336.0
Misdemeanor,Conflict Case,58.0
Misdemeanor,Post Arraignment,14.0


In [373]:
crim_cases[crim_cases.closed_in_month == True].pivot_table(values=["Case#"],
                                                           index=["TC_Type", "MOCJ_Dispo"],
                                                           aggfunc=np.count_nonzero)

Unnamed: 0_level_0,Unnamed: 1_level_0,Case#
TC_Type,MOCJ_Dispo,Unnamed: 2_level_1
Violent Felony,Post-Arraignment Shift Disposition,34.0
Violent Felony,Post-Arraignment Shift Transfer,13.0
Non-Violent Felony,Post-Arraignment Shift Disposition,90.0
Non-Violent Felony,Post-Arraignment Shift Transfer,19.0
Misdemeanor,Arraignment Shift Disposition,124.0
Misdemeanor,Post-Arraignment Shift Disposition,233.0
Misdemeanor,Post-Arraignment Shift Transfer,43.0
Violation & Other,Arraignment Shift Disposition,108.0
Violation & Other,Post-Arraignment Shift Disposition,11.0
Violation & Other,Post-Arraignment Shift Transfer,5.0


In [374]:
crim_cases[crim_cases.pending_first == True].pivot_table(values=["Case#"], index=["TC_Type"],
                                                         aggfunc=np.count_nonzero)

Unnamed: 0_level_0,Case#
TC_Type,Unnamed: 1_level_1
Homicide,3
Violent Felony,415
Non-Violent Felony,818
Misdemeanor,1120
Violation & Other,51


In [375]:
crim_cases[crim_cases.pending_last == True].pivot_table(values=["Case#"], index=["TC_Type"],
                                                        aggfunc=np.count_nonzero)

Unnamed: 0_level_0,Case#
TC_Type,Unnamed: 1_level_1
Homicide,4
Violent Felony,426
Non-Violent Felony,812
Misdemeanor,1128
Violation & Other,63


In [376]:
crim_cases["case_age"] = (crim_cases["Closed Date"] - crim_cases["Open Date"]).dt.days
crim_cases.case_age.describe()

count     683.000000
mean      120.474378
std       245.860439
min         0.000000
25%         0.000000
50%        36.000000
75%       158.000000
max      2797.000000
Name: case_age, dtype: float64

In [377]:
crim_cases[crim_cases.closed_in_month == True].groupby('TC_Type').agg({'case_age': 'describe'})

Unnamed: 0_level_0,case_age,case_age,case_age,case_age,case_age,case_age,case_age,case_age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
TC_Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Homicide,0.0,,,,,,,
Violent Felony,47.0,194.148936,200.832721,4.0,52.5,139.0,295.0,964.0
Non-Violent Felony,109.0,252.165138,320.282413,4.0,34.0,187.0,322.0,1866.0
Misdemeanor,400.0,98.1125,179.240726,0.0,0.0,47.0,130.25,2490.0
Violation & Other,124.0,49.08871,318.564693,0.0,0.0,0.0,0.0,2797.0


In [378]:
crim_cases[crim_cases.case_age > 200][["Case#", "Court", "Latest Event", "Top Charge", "Dispo", "case_age"]]

Unnamed: 0,Case#,Court,Latest Event,Top Charge,Dispo,case_age
274,21-122034,92,"04/11/2019, RW, SENTENCED",POS FORG INST 2,PLSE,1002.0
275,21-132199,J9,"04/17/2019, RW, SENTENCED",AGG UNLIC OPER2,PLSE,802.0
279,21-141054,C,"04/10/2019, RW, SENTENCED",ATT CRIM CONTEMPT 2,PLSE,606.0
280,21-145541,F,"04/01/2019, NCD, SENTENCED",GRAND LARCENY 4,PLSE,492.0
281,21-146489,C,"04/24/2019, RW, SENTENCED",ASSAULT 3,PLSE,495.0
282,21-24096,AR1,"04/09/2019, RW, SENTENCED",ADMIN CODE,PLSE,2797.0
283,21-40166,AR2,"04/02/2019, RW, ROW",CRIM MISCHIEF 4,ACD,2490.0
284,21-709581,C,"04/17/2019, NCD, DISPOSED",PETIT LARCENY,RPC,391.0
285,21-710236,B,"04/13/2019, RW, SENTENCED",CRIM MISCHIEF-4,PLSE,373.0
286,21-712336,C,"04/04/2019, RW, ROW",CRIM MISCHIEF 4,ACD,309.0


## Post-Disposition Cases

In [393]:
pd_cases = pd.concat([pd.read_csv("{}/post_dispo_opened.csv".format(FOLDER),
                                  names=POST_DISPO_NAMES,
                                  header=0),
                      pd.read_csv("{}/post_dispo_closed.csv".format(FOLDER),
                                  names=POST_DISPO_NAMES,
                                  header=0),
                      pd.read_csv("{}/post_dispo_pending_1st.csv".format(FOLDER),
                                  names=POST_DISPO_NAMES,
                                  header=0),
                      pd.read_csv("{}/post_dispo_pending_last.csv".format(FOLDER),
                                  names=POST_DISPO_NAMES,
                                  header=0)])

pd_cases = pd_cases.drop_duplicates(subset=["Case#", "Open Date", "Closed Date"],
                                    keep='last').reset_index(drop = True)
pd_cases = pd_cases.dropna()

pd_cases["Case Type"] = pd_cases["Case Type"].str.upper()

Create dummy variables for whether the case was opened and/or closed in the month.

In [394]:
pd_cases["Open Date"] = pd.to_datetime(pd_cases["Open Date"])
pd_cases["Closed Date"] = pd.to_datetime(pd_cases["Closed Date"])
pd_cases["opened_in_month"] = pd_cases["Open Date"].apply(lambda x: check_in_month(x))
pd_cases["closed_in_month"] = pd_cases["Closed Date"].apply(lambda x: check_in_month(x))

Create a dummy variable for pending on the 1st of the month or the last.

In [397]:
pd_cases["pending_first"] = pd_cases["Open Date"].apply(lambda x: True if x < pd.to_datetime('2019-04-01') else False)

In [398]:
pd_cases["pending_last"] = pd_cases["Closed Date"].apply(
                            lambda x: True if ((x > pd.to_datetime('2019-04-30')) | pd.isnull(x)) else False)

In [399]:
pd_cases[pd_cases.pending_first == True].groupby("Case Type").size()

Case Type
VOP    2
dtype: int64

In [400]:
pd_cases[pd_cases.opened_in_month == True].groupby("Case Type").size()

Case Type
VOCD    3
VOP     2
dtype: int64

In [401]:
pd_cases[pd_cases.closed_in_month == True].groupby("Case Type").size()

Case Type
VOCD    3
VOP     4
dtype: int64

In [402]:
pd_cases[pd_cases.pending_last == True].groupby("Case Type").size()

Series([], dtype: int64)