In [None]:
import os
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import re

### Setup

In [None]:
# Reporting month; the input/output folder name is derived from it.
MONTH = 'April 2019'
# Timestamp parsed from MONTH; used as the reference date for all month-window checks.
MONTHDT = pd.to_datetime(MONTH)
# Raw string: the Windows path contains backslashes that must not be read as escape
# sequences ("\D", "\P" and "\M" are invalid escapes and warn on modern Python).
FOLDER = r'U:\Data and Analytics\Projects\Data\MOCJ ' + MONTH

In [None]:
# Column names applied to the criminal-case CSV extracts when they are read.
CRIM_NAMES = ["Case#", "Open Date", "Closed Date", "Court", "Latest Event",
              "Intake Type", "TC C/L", "Top Charge", "Dispo"]
# Post-disposition extracts use the same columns plus a trailing "Case Type".
POST_DISPO_NAMES = ["Case#", "Open Date", "Closed Date", "Court", "Latest Event",
                    "Intake Type", "TC C/L", "Top Charge", "Dispo", "Case Type"]

# Ordered categoricals: values cast to these dtypes sort in the order listed here
# rather than alphabetically. Any value not listed becomes NaN on astype().
MOCJ_CASE_TYPES = CategoricalDtype(categories=["Homicide",
                                               "Violent Felony",
                                               "Non-Violent Felony",
                                               "Misdemeanor",
                                               "Violation & Other"],
                                   ordered=True)
MOCJ_INTAKE_TYPES = CategoricalDtype(categories=["Primary Case", "Conflict Case", "Post Arraignment"],
                                     ordered=True)
# These labels must match the strings returned by categorize_disposition() exactly;
# a label that differs (previously "Post-Arraignment Disposition") would turn every
# such row into NaN when the column is cast to this dtype.
MOCJ_OUTFLOW_TYPES = CategoricalDtype(categories=["Post-Arraignment Shift Transfer",
                                                  "Post-Arraignment Shift Disposition",
                                                  "Arraignment Shift Disposition"],
                                      ordered=True)

## Criminal Cases
Criminal cases opened and closed in the month

#### Importing CSVs

We import CSVs for opened, closed, pending as of the first, and pending as of the last of the month. Then we concatenate, drop rows with only NAs, and deduplicate.

In [None]:
# Options shared by all four extracts: the file's own header row (row 0) is replaced
# by CRIM_NAMES, and the first column is never used as the index.
_crim_read_opts = {"names": CRIM_NAMES, "header": 0, "index_col": False}

crim_cases_opened = pd.read_csv("{}/crim_cases_opened.csv".format(FOLDER), **_crim_read_opts)
crim_cases_closed = pd.read_csv("{}/crim_cases_closed.csv".format(FOLDER), **_crim_read_opts)
crim_cases_pending_1st = pd.read_csv("{}/crim_cases_pending_1st.csv".format(FOLDER), **_crim_read_opts)
crim_cases_pending_end = pd.read_csv("{}/crim_cases_pending_last.csv".format(FOLDER), **_crim_read_opts)

print("Raw")
# Stack the four extracts into one frame; a case can appear in more than one extract,
# so this frame still contains duplicates.
_crim_extracts = [
    crim_cases_opened,
    crim_cases_closed,
    crim_cases_pending_1st,
    crim_cases_pending_end,
]
crim_cases = pd.concat(_crim_extracts)
print(crim_cases.shape)

In [None]:
print("NAs dropped")
# Only rows in which every column is missing are removed.
crim_cases = crim_cases.dropna(how="all")
print(crim_cases.shape)

print("Duplicates dropped")
# Rows that agree on all of these columns are treated as the same record;
# the last occurrence is kept and the index is renumbered from 0.
_dedup_cols = ["Case#", "Open Date", "Closed Date", "Intake Type", "Dispo"]
_deduped = crim_cases.drop_duplicates(subset=_dedup_cols, keep='last')
crim_cases = _deduped.reset_index(drop=True)
print(crim_cases.shape) # Should be about 1/3 of the raw concatenated dataset

In [None]:
# Double-check case #s to make sure none are duplicated.
# value_counts() sorts by count in descending order, so the first entry is the most
# frequent case number: if it is 1, every Case# is unique.
# .iloc[0] is used because plain [0] is a *label* lookup on a Series indexed by case
# number (KeyError for integer case numbers, deprecated positional fallback otherwise).
crim_cases["Case#"].value_counts().iloc[0]

In [None]:
# Parse both date columns so they can be compared against the reporting-month bounds.
for _date_col in ("Open Date", "Closed Date"):
    crim_cases[_date_col] = pd.to_datetime(crim_cases[_date_col])

#### Cleaning cases
We don't want non-criminal IDV, VOP, or VOCD cases in these counts, so we drop cases whose top charge contains VISITATION, FAMILY OFFENSE, or CUSTODY, as well as IDV-court cases whose intake type is "Assignment" or starts with "Trans".

Some top charges have more than one class and level, so we consolidate those.

In [None]:
# Getting rid of IDV/Visitation cases.
_top_charge = crim_cases['Top Charge']
# Top charges that mark a non-criminal matter; a missing top charge never matches.
_non_criminal_charge = (_top_charge.str.contains("VISITATION", na=False)
                        | _top_charge.str.contains('FAMILY OFFENSE', na=False)
                        | _top_charge.str.contains('CUSTODY', na=False))
# IDV-court cases that came in by assignment or transfer.
# na=False keeps missing intake types out of the mask explicitly, matching the
# top-charge filters instead of relying on implicit NaN coercion.
_idv_assigned_or_transferred = ((crim_cases["Court"] == "IDV")
                                & ((crim_cases["Intake Type"] == "Assignment")
                                   | crim_cases["Intake Type"].str.startswith("Trans", na=False)))
# .copy() makes the result an independent frame, so the .loc re-coding in later cells
# does not raise SettingWithCopyWarning.
crim_cases = crim_cases.loc[~(_non_criminal_charge | _idv_assigned_or_transferred)].copy()

In [None]:
# Re-code homicide cases to a separate Homicide class ("H"), but don't include
# attempted homicides.
_charge_text = crim_cases["Top Charge"]
_has_mur = _charge_text.str.contains('MUR', na=False)
_has_att = _charge_text.str.contains("ATT", na=False)
# Top charges that have "MUR" but don't have "ATT".
crim_cases.loc[_has_mur & ~_has_att, "TC C/L"] = "H"
# Top charges containing "- MUR" are re-coded whether or not they also contain "ATT".
_has_dash_mur = _charge_text.str.contains("- MUR", na=False)
crim_cases.loc[_has_dash_mur, "TC C/L"] = "H"

In [None]:
# Eyeball the rows that were re-coded to the homicide class.
crim_cases.loc[crim_cases["TC C/L"].eq("H")]

In [None]:
# Re-code TC C/L for extradition cases: any top charge containing "FUG" becomes "M".
_is_fug = crim_cases["Top Charge"].str.contains("FUG", na=False)
crim_cases.loc[_is_fug, 'TC C/L'] = "M"

In [None]:
# AC Misdemeanors don't get counted correctly, so any top charge containing
# "AC - M" is forced to class "M".
_is_ac_misd = crim_cases["Top Charge"].str.contains("AC - M", na=False)
crim_cases.loc[_is_ac_misd, 'TC C/L'] = "M"

We also check to see if any VOCDs or VOPs have snuck in.

In [None]:
# Check for any VOCD or VOP cases, then fix in PDCMS until there are no more cases returned.
# This cell lists top charges containing both "VIO" and "COND".
_tc_text = crim_cases['Top Charge']
_has_vio = _tc_text.str.contains("VIO", na=False)
_has_cond = _tc_text.str.contains('COND', na=False)
crim_cases[_has_vio & _has_cond]

In [None]:
# Check for any VOCD or VOP cases, then fix in PDCMS until there are no more cases returned.
# This cell lists top charges containing "VOP".
_has_vop = crim_cases['Top Charge'].str.contains("VOP", na=False)
crim_cases[_has_vop]

### Determining Start and End

Because this is now a combined dataset, we need to find out whether the cases were opened and/or closed in the month.

In [None]:
def get_month_dates(month):
    """Return the ``(start, end)`` pair bounding the reporting window.

    ``start`` is ``month`` itself, unchanged. ``end`` is ``month`` rolled forward to
    the last day of its calendar month with ``pd.offsets.MonthEnd(0)``; a date that
    already falls on a month end is left where it is.
    """
    window_start = month
    window_end = month + pd.offsets.MonthEnd(0)
    return window_start, window_end

In [None]:
def check_in_month(dt, month):
    """Check if a date falls in the same calendar month and year as ``month``.

    Parameters
    ----------
    dt : pandas.Timestamp or NaT
        Date to test. A missing value (NaT/None/NaN) is never "in" the month.
    month : pandas.Timestamp
        Any timestamp inside the reporting month.

    Returns
    -------
    bool
        True when ``dt`` has the same year and month as ``month``.
    """
    if pd.isnull(dt):
        return False
    # Compare calendar fields rather than a [month, month-end] window: the month-end
    # bound sits at midnight, so a window test rejects timestamps that carry a time of
    # day on the last day of the month, and it also shrinks when ``month`` is not the
    # first of the month.
    return (dt.year, dt.month) == (month.year, month.month)

# Flag each case as opened and/or closed within the reporting month.
for _flag_col, _date_col in (("opened_in_month", "Open Date"),
                             ("closed_in_month", "Closed Date")):
    crim_cases[_flag_col] = crim_cases[_date_col].apply(check_in_month, args=(MONTHDT,))

In [None]:
def pending_first(case_open, case_close, month):
    """Tell whether a case was pending at the start of the reporting window.

    A case is pending when it was opened strictly before the window start and
    either has no closing date or was closed on/after the window start.
    Returns a plain ``bool``.
    """
    window_start = get_month_dates(month)[0]
    opened_before = case_open < window_start
    closed_on_or_after = case_close >= window_start
    never_closed = pd.isnull(case_close)
    if opened_before & (closed_on_or_after | never_closed):
        return True
    return False

def pending_last(case_open, case_close, month):
    """Tell whether a case was pending at the end of the reporting window.

    A case is pending when it was opened on/before the window end and either
    has no closing date or was closed on/after the window end.
    Returns a plain ``bool``.
    """
    window_end = get_month_dates(month)[1]
    opened_by_end = case_open <= window_end
    closed_on_or_after = case_close >= window_end
    never_closed = pd.isnull(case_close)
    if opened_by_end & (closed_on_or_after | never_closed):
        return True
    return False

In [None]:
# Row-wise pending flags, computed from each case's open/close dates against MONTHDT.
crim_cases['pending_first'] = crim_cases.apply(
    lambda case: pending_first(case['Open Date'], case['Closed Date'], MONTHDT),
    axis=1)
crim_cases['pending_last'] = crim_cases.apply(
    lambda case: pending_last(case['Open Date'], case['Closed Date'], MONTHDT),
    axis=1)

In [None]:
# Sanity check: earliest/latest open and close dates for each combination of pending flags.
_date_ranges = {"Open Date": ['min', 'max'], "Closed Date": ['min', 'max']}
crim_cases.groupby(['pending_first', 'pending_last']).agg(_date_ranges)

### Classifying Top Charges
Now we translate the top charge type to MOCJ categories and check that everything worked OK.

Functions for classifying top charge and intake type, translating from PDCMS categories to MOCJ categories.

In [None]:
def classify_tc(tc):
    """Classify a top-charge class/level code into a MOCJ-requested category.

    The value is coerced to a string and surrounding whitespace is removed, so a
    padded code such as ``"AF "`` is classified like ``"AF"``. Rules, in order:

    * exactly ``"H"``                -> "Homicide"
    * contains ``"FL"`` or ``"FV"``  -> "Violent Felony"
    * ends with ``"F"``              -> "Non-Violent Felony"
    * ends with ``"M"``              -> "Misdemeanor"
    * anything else (including missing values) -> "Violation & Other"
    """
    tc = str(tc).strip()  # coerce to string; tolerate padded values
    if tc == "H":
        return "Homicide"
    if "FL" in tc or "FV" in tc:
        return "Violent Felony"
    if tc.endswith("F"):
        return "Non-Violent Felony"
    if tc.endswith("M"):
        return "Misdemeanor"
    return "Violation & Other"

In [None]:
#crim_cases["TC C/L"].value_counts()

In [None]:
# Translate every TC C/L code, then cross-tabulate category against raw code so
# each code's mapping can be checked by eye.
crim_cases["TC_Type"] = crim_cases["TC C/L"].apply(classify_tc)
_tc_crosscheck = crim_cases.groupby(["TC_Type", "TC C/L"]).size()
_tc_crosscheck

In [None]:
# Coerce the data type to the ordered MOCJ_CASE_TYPES categorical so it sorts
# in category order rather than alphabetically.
crim_cases["TC_Type"] = crim_cases["TC_Type"].astype(MOCJ_CASE_TYPES)

### Classifying Intake Type

Now we translate the intake type to MOCJ categories and check that everything is OK.

We are assuming that everything with a PDCMS intake type of "Assignment" and a court other than IDV is probably actually a transfer.

In [None]:
def classify_intake_type(intake_type):
    """Map an intake type onto the MOCJ-requested intake categories.

    The MOCJ categories are:
    New Cases - Primary Case at Arraignment Shift
    New Cases - Conflict Case at Arraignment Shift
    New Cases - Post Arraignment Shift Other Intake

    The input is converted to a string, stripped and lower-cased before matching.
    Anything that matches none of the rules returns the sentinel "DANGER".
    """
    normalized = str(intake_type).strip().lower()
    if normalized in ("arraignment", "row"):
        return "Primary Case"
    if normalized in ("conflict row", "conflict arraignment"):
        return "Conflict Case"
    if normalized == "assignment" or normalized.startswith("transfer"):
        return "Post Arraignment"
    return "DANGER"

In [None]:
# Translate the PDCMS intake type into the MOCJ category (new lower-case column),
# then show how many cases landed in each category.
crim_cases["intake type"] = crim_cases["Intake Type"].apply(classify_intake_type)

crim_cases["intake type"].value_counts()

In [None]:
# Rows whose intake type matched none of the rules in classify_intake_type().
crim_cases.loc[crim_cases["intake type"] == "DANGER"]

In [None]:
# Make intake type into a Pandas categorical type, so pivot tables are sorted.
# "DANGER" is not one of the MOCJ_INTAKE_TYPES categories, so any such rows become NaN here.
_intake_labels = crim_cases["intake type"]
crim_cases["intake type"] = _intake_labels.astype(MOCJ_INTAKE_TYPES)

### Classifying Outflow Type

Translate the disposition to MOCJ categories.

In [None]:
def categorize_disposition(row):
    """Translate one case row into a MOCJ outflow category.

    ``row`` must support lookup of "Open Date", "Closed Date" and "Dispo".
    Rules, in order:

    * open and closed dates are equal      -> "Arraignment Shift Disposition"
    * ``Dispo`` is one of the listed codes -> "Post-Arraignment Shift Transfer"
    * otherwise                            -> "Post-Arraignment Shift Disposition"
    """
    transfer_codes = ("RPC", "RLAS", "R18B", "RNDS", "ADBX", "ADBR", "ADQU", "ADRI")
    if row["Open Date"] == row["Closed Date"]:
        return "Arraignment Shift Disposition"
    if row["Dispo"] in transfer_codes:
        return "Post-Arraignment Shift Transfer"
    return "Post-Arraignment Shift Disposition"

In [None]:
# Categorize every row, then cross-tabulate MOCJ category against the raw
# disposition code so the mapping can be checked by eye.
crim_cases["MOCJ_Dispo"] = crim_cases.apply(categorize_disposition, axis=1)
_dispo_crosscheck = crim_cases.groupby(["MOCJ_Dispo", "Dispo"]).size()
_dispo_crosscheck

In [None]:
# Number of rows per MOCJ outflow category.
_dispo_sizes = crim_cases.groupby(["MOCJ_Dispo"]).size()
_dispo_sizes

### Final Aggregate Counts

In [None]:
# Summing the boolean flags counts the cases of each charge type that were
# opened, closed, pending on the first and pending on the last of the month.
_flag_cols = ['opened_in_month', 'closed_in_month', 'pending_first', 'pending_last']
crim_cases.groupby(['TC_Type'])[_flag_cols].agg('sum').stack()

In [None]:
# Outflow: cases closed in the month, counted per (TC_Type, MOCJ_Dispo).
# reset_index(1) moves the second index level (MOCJ_Dispo) back into a column,
# leaving TC_Type as the only index level.
_outflow = crim_cases[crim_cases.closed_in_month == True].pivot_table(values=["Case#"],
                                                           index=["TC_Type", "MOCJ_Dispo"],
                                                           aggfunc=np.count_nonzero).reset_index(1)
# Inflow: cases opened in the month, counted per (TC_Type, intake type), reshaped
# the same way so that "intake type" is a column.
_inflow = crim_cases[crim_cases.opened_in_month == True].pivot_table(values=["Case#"],
                                                                index=["TC_Type", "intake type"],
                                                                aggfunc=np.count_nonzero).reset_index(1)
# Stack inflow on top of outflow. Each half lacks the other's category column, so
# those cells are NaN after the concat. Both category columns are then appended to
# the index and the rows are sorted by charge type, intake type and disposition.
cases_in_month = pd.concat([_inflow, _outflow], sort=True) \
                   .set_index(['MOCJ_Dispo', 'intake type'], append=True) \
                   .sort_values(by=['TC_Type', 'intake type', 'MOCJ_Dispo'])

In [None]:
# Write the combined inflow/outflow table into the month's data folder.
_monthly_csv_path = "{}/monthly_crim_cases.csv".format(FOLDER)
cases_in_month.to_csv(_monthly_csv_path)

In [None]:
# Cases pending on the first of the month, counted per charge type.
_pending_at_start = crim_cases.loc[crim_cases["pending_first"] == True]
_pending_at_start.pivot_table(values=["Case#"], index=["TC_Type"], aggfunc=np.count_nonzero)

In [None]:
# Cases pending on the last of the month, counted per charge type.
_pending_at_end = crim_cases.loc[crim_cases["pending_last"] == True]
_pending_at_end.pivot_table(values=["Case#"], index=["TC_Type"], aggfunc=np.count_nonzero)

In [None]:
# Case age = whole days between opening and closing.
_open_to_close = crim_cases["Closed Date"] - crim_cases["Open Date"]
crim_cases["case_age"] = _open_to_close.dt.days
crim_cases["case_age"].describe()

In [None]:
# Summary statistics of case age per charge type, for cases closed in the month.
_closed_this_month = crim_cases[crim_cases.closed_in_month == True]
_closed_this_month.groupby('TC_Type').agg({'case_age': 'describe'})

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of case age for non-violent felonies closed in the month.
_closed_nvf = (crim_cases.closed_in_month == True) & (crim_cases['TC_Type'] == 'Non-Violent Felony')
crim_cases.loc[_closed_nvf, 'case_age'].plot.hist()

In [None]:
# List cases whose age exceeds 1000 days, for manual review.
_review_cols = ["Case#", "Court", "Latest Event", "Top Charge", "Dispo", "case_age"]
crim_cases.loc[crim_cases.case_age > 1000, _review_cols]

## Post-Disposition Cases

In [None]:
# The four post-disposition extracts: opened, closed, pending on the 1st and
# pending on the last of the month. Each file's header row is replaced by
# POST_DISPO_NAMES.
_post_dispo_files = ["post_dispo_opened.csv",
                     "post_dispo_closed.csv",
                     "post_dispo_pending_1st.csv",
                     "post_dispo_pending_last.csv"]
pd_cases = pd.concat([pd.read_csv("{}/{}".format(FOLDER, _file_name),
                                  names=POST_DISPO_NAMES,
                                  header=0)
                      for _file_name in _post_dispo_files])

# Drop all-NA rows *before* de-duplicating and renumbering (the same order as the
# criminal-case pipeline), so the final index is contiguous rather than left with
# a gap where an all-NA row used to be.
pd_cases = pd_cases.dropna(how='all')
pd_cases = pd_cases.drop_duplicates(subset=["Case#", "Open Date", "Closed Date"],
                                    keep='last').reset_index(drop = True)

# Normalize case-type labels so that later groupbys don't split on letter case.
pd_cases["Case Type"] = pd_cases["Case Type"].str.upper()

Create dummy variables for whether the case was opened and/or closed in the month.

In [None]:
# Parse each date column, then flag whether that date falls in the reporting month.
for _flag_col, _date_col in (("opened_in_month", "Open Date"),
                             ("closed_in_month", "Closed Date")):
    pd_cases[_date_col] = pd.to_datetime(pd_cases[_date_col])
    pd_cases[_flag_col] = pd_cases[_date_col].apply(check_in_month, args=(MONTHDT,))

Create a dummy variable for pending on the 1st of the month or the last.

In [None]:
# Row-wise flag: was the post-disposition case pending on the first of the month?
pd_cases["pending_first"] = pd_cases.apply(
    lambda case: pending_first(case_open=case['Open Date'],
                               case_close=case['Closed Date'],
                               month=MONTHDT),
    axis=1)

In [None]:
# Row-wise flag: was the post-disposition case pending on the last of the month?
pd_cases["pending_last"] = pd_cases.apply(
    lambda case: pending_last(case_open=case['Open Date'],
                              case_close=case['Closed Date'],
                              month=MONTHDT),
    axis=1)

In [None]:
# Summing the boolean flags counts, per case type, the cases that were opened,
# closed, pending on the first and pending on the last of the month.
_pd_flag_cols = ['opened_in_month', 'closed_in_month', 'pending_first', 'pending_last']
pd_cases.groupby(['Case Type'])[_pd_flag_cols].agg('sum').stack()

In [None]:
# Post-disposition cases closed in the month, counted per case type.
_pd_closed = pd_cases.loc[pd_cases["closed_in_month"] == True]
_pd_closed.groupby("Case Type").size()

In [None]:
# Post-disposition cases pending on the last of the month, counted per case type.
_pd_pending_end = pd_cases.loc[pd_cases["pending_last"] == True]
_pd_pending_end.groupby("Case Type").size()