Tips for navigating the notebook: every time you need to verify a claim made in the report, search for the corresponding section name. For some sections (mainly those related to the model itself), the code can also be found in the notebook that contains the pipeline or in transformers.py.

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
import category_encoders as ce
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score,precision_recall_curve
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from category_encoders.binary import BinaryEncoder
import category_encoders as ce
import pickle
import json
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import os
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from transformers import (
    Test, 
    CreateCyclicalFeatures, 
    CategoricalDataCleaning, 
    NumericalDataCleaning
)



# Fixed seed value. No cell visible in this notebook references it; presumably it is
# meant to be passed as random_state to scikit-learn estimators/splits — TODO confirm.
RANDOM_STATE = 1

In [3]:
def read_data(path=None) -> pd.DataFrame:
    """Load the training data into a DataFrame.

    Args:
        path (str, optional): Location of the CSV file to read. When omitted,
            ``data/train.csv`` relative to the current working directory is used.

    Returns:
        data (pd.DataFrame): DataFrame with the contents of the CSV file.

    """
    if path is None:
        # Default location of the training set.
        path = os.path.join('data', 'train.csv')
    return pd.read_csv(path)

# Load the training set once; the cells below read from train_df, and some of them
# overwrite or add columns on it.
train_df = read_data()
# Preview the first rows.
train_df.head()

Unnamed: 0,VehicleSearchedIndicator,ContrabandIndicator,Department Name,InterventionDateTime,InterventionLocationName,InterventionReasonCode,ReportingOfficerIdentificationID,ResidentIndicator,SearchAuthorizationCode,StatuteReason,SubjectAge,SubjectEthnicityCode,SubjectRaceCode,SubjectSexCode,TownResidentIndicator
0,False,False,New Haven,10/01/2013 12:00:00 AM,NEW HAVEN,V,262,True,N,Stop Sign,31.0,H,W,M,True
1,False,False,State Police,10/01/2013 12:00:00 AM,WILLINGTON,V,1000002715,False,N,Other,29.0,M,W,M,False
2,False,False,Plymouth,10/01/2013 12:00:00 AM,Terryville,V,21,True,N,Speed Related,18.0,N,W,M,True
3,False,False,Plymouth,10/01/2013 12:00:00 AM,Plymouth,V,D1,True,N,Speed Related,52.0,N,W,F,False
4,False,False,Bethel,10/01/2013 12:00:00 AM,BETHEL,V,08M,True,N,Cell Phone,34.0,N,W,M,False


Section General Analysis

In [None]:
# Number of stops recorded for each sex code.
train_df['SubjectSexCode'].value_counts()

In [None]:
# Horizontal bar chart: number of searched vehicles for each sex code.
ax = (
    train_df
    .groupby('SubjectSexCode')['VehicleSearchedIndicator']
    .sum()
    .plot.barh(title="Number of Searched Vehicles by Sex Code")
)
ax.set_xlabel("Searched Vehicles")

In [None]:
# Number of stops recorded for each race code.
train_df['SubjectRaceCode'].value_counts()

In [None]:
# Number of searched vehicles for each race code.
train_df['VehicleSearchedIndicator'].groupby(train_df['SubjectRaceCode']).sum()

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for one SubjectRaceCode
# group, taken from the two outputs above — TODO confirm the group and derive it from train_df.
53524/2018931

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for a second SubjectRaceCode
# group — TODO confirm the group and derive it from train_df.
22501/386325

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for a third SubjectRaceCode
# group — TODO confirm the group and derive it from train_df.
220/19746

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for a fourth SubjectRaceCode
# group — TODO confirm the group and derive it from train_df.
498/48641

In [None]:
# Number of stops recorded for each ethnicity code.
train_df['SubjectEthnicityCode'].value_counts()

In [None]:
# Number of searched vehicles for each ethnicity code.
train_df['VehicleSearchedIndicator'].groupby(train_df['SubjectEthnicityCode']).sum()

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for one SubjectEthnicityCode
# group, taken from the two outputs above — TODO confirm the group and derive it from train_df.
876/45561

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for a second
# SubjectEthnicityCode group — TODO confirm the group and derive it from train_df.
58343/2099632

In [None]:
# Hand-typed ratio: presumably searched vehicles / total stops for a third
# SubjectEthnicityCode group — TODO confirm the group and derive it from train_df.
17524/328450

In [None]:
# Number of distinct reporting officer identifiers.
train_df['ReportingOfficerIdentificationID'].nunique()

In [None]:
# Mean of VehicleSearchedIndicator per reporting officer, indexed by officer ID
# (the ID is kept as a regular column as well).
average_searched_vehicles_by_officer = (
    train_df
    .groupby("ReportingOfficerIdentificationID")["VehicleSearchedIndicator"]
    .mean()
    .reset_index()
    .set_index("ReportingOfficerIdentificationID", drop=False)
)
# Lowest and highest per-officer value, one per line.
print(
    average_searched_vehicles_by_officer["VehicleSearchedIndicator"].min(),
    average_searched_vehicles_by_officer["VehicleSearchedIndicator"].max(),
    sep="\n",
)

In [None]:
# Mean of ContrabandIndicator per reporting officer, indexed by officer ID
# (the ID is kept as a regular column as well).
average_contraband_by_officer = (
    train_df
    .groupby("ReportingOfficerIdentificationID")["ContrabandIndicator"]
    .mean()
    .reset_index()
    .set_index("ReportingOfficerIdentificationID", drop=False)
)
# Lowest and highest per-officer value, one per line.
print(
    average_contraband_by_officer["ContrabandIndicator"].min(),
    average_contraband_by_officer["ContrabandIndicator"].max(),
    sep="\n",
)

In [None]:
# Number of distinct departments in the training set.
train_df['Department Name'].nunique()

In [None]:
# Mean of VehicleSearchedIndicator per department, indexed by department name
# (the name is kept as a regular column as well).
average_searched_vehicles_by_department = (
    train_df
    .groupby("Department Name")["VehicleSearchedIndicator"]
    .mean()
    .reset_index()
    .set_index("Department Name", drop=False)
)
# Lowest and highest per-department value, one per line.
print(
    average_searched_vehicles_by_department["VehicleSearchedIndicator"].min(),
    average_searched_vehicles_by_department["VehicleSearchedIndicator"].max(),
    sep="\n",
)

In [None]:
# Mean of ContrabandIndicator per department, indexed by department name
# (the name is kept as a regular column as well).
average_contraband_by_department = (
    train_df
    .groupby("Department Name")["ContrabandIndicator"]
    .mean()
    .reset_index()
    .set_index("Department Name", drop=False)
)
# Lowest and highest per-department value, one per line.
print(
    average_contraband_by_department["ContrabandIndicator"].min(),
    average_contraband_by_department["ContrabandIndicator"].max(),
    sep="\n",
)

In [None]:
# Parse the timestamp strings (month/day/year, 12-hour clock with AM/PM) into datetimes,
# overwriting the original column.
train_df['InterventionDateTime'] = pd.to_datetime(
    train_df.InterventionDateTime,
    format='%m/%d/%Y %I:%M:%S %p',
)

In [None]:
# Month number of each intervention; needs InterventionDateTime to be a datetime column.
train_df['InterventionDateMonth'] = train_df.InterventionDateTime.dt.month

In [None]:
# Among searched vehicles only: mean of ContrabandIndicator per month, indexed by month
# (the month is kept as a regular column as well).
average_contraband_by_month = (
    train_df[train_df['VehicleSearchedIndicator'] == True]
    .groupby("InterventionDateMonth")["ContrabandIndicator"]
    .mean()
    .reset_index()
    .set_index("InterventionDateMonth", drop=False)
)
# Horizontal bar per month.
average_contraband_by_month["ContrabandIndicator"].plot(kind='barh', label="ContrabandIndicator")
plt.xlabel('ContrabandIndicator')
axis = plt.gca()

Section Business Question Analysis and Business questions technical support and Conclusions and Recommendations

In [None]:
def _group_precision(column, code):
    """Contraband records divided by searched vehicles among rows where ``column`` equals ``code``."""
    in_group = train_df[column] == code
    contraband_found = train_df.loc[in_group, 'ContrabandIndicator'].sum()
    vehicles_searched = train_df.loc[in_group, 'VehicleSearchedIndicator'].sum()
    # NOTE(review): contraband is counted over every row of the group, not only the
    # searched ones — confirm this is the intended definition of precision.
    return contraband_found / vehicles_searched


# Sex codes.
precision_male = _group_precision('SubjectSexCode', "M")
precision_female = _group_precision('SubjectSexCode', "F")
# Ethnicity codes.
precision_hispanic = _group_precision('SubjectEthnicityCode', "H")
precision_middle_east = _group_precision('SubjectEthnicityCode', "M")
precision_not_applicable_ethnicity = _group_precision('SubjectEthnicityCode', "N")
# Race codes.
precision_white = _group_precision('SubjectRaceCode', "W")
precision_black = _group_precision('SubjectRaceCode', "B")
precision_asian = _group_precision('SubjectRaceCode', "A")
precision_indian = _group_precision('SubjectRaceCode', "I")

print(precision_male, precision_female)
print(precision_hispanic, precision_middle_east, precision_not_applicable_ethnicity)
print(precision_white, precision_black, precision_asian, precision_indian)

In [None]:
from itertools import combinations  # kept so later cells that call combinations without importing it keep working

# Demographic columns compared within each department, with the codes that are compared.
GROUP_CODES = {
    'SubjectSexCode': ['M', 'F'],
    'SubjectEthnicityCode': ['H', 'M', 'N'],
    'SubjectRaceCode': ['W', 'B', 'I', 'A'],
}
# A department is printed when, in every demographic column, no two groups differ by more than this.
MAX_PRECISION_GAP = 0.05


def max_precision_gaps(stops, unit_column):
    """Largest precision gap between demographic groups, per unit and demographic column.

    Precision of a group is its number of contraband records divided by its
    number of searched vehicles. Groups without any searched vehicle have no
    defined precision (0/0) and are left out of the comparison, so the result
    does not depend on the order in which groups are listed.

    Args:
        stops (pd.DataFrame): Rows with ``unit_column``, the ``GROUP_CODES``
            columns, ``ContrabandIndicator`` and ``VehicleSearchedIndicator``.
        unit_column (str): Column identifying the unit whose groups are compared.

    Returns:
        pd.DataFrame: One row per unit and one column per ``GROUP_CODES`` key,
        holding the highest minus the lowest group precision (NaN when fewer
        than two groups can be compared).
    """
    gaps = {}
    for column, codes in GROUP_CODES.items():
        counts = (
            stops[stops[column].isin(codes)]
            .groupby([unit_column, column])[['ContrabandIndicator', 'VehicleSearchedIndicator']]
            .sum()
        )
        counts = counts[counts['VehicleSearchedIndicator'] > 0]
        # NOTE(review): contraband is counted over every row of the group, not only the
        # searched ones, so this ratio can exceed 1 — confirm the intended definition.
        precision = counts['ContrabandIndicator'] / counts['VehicleSearchedIndicator']
        by_unit = precision.groupby(level=0)
        # The largest pairwise absolute difference equals max - min.
        gaps[column] = (by_unit.max() - by_unit.min()).where(by_unit.size() >= 2)
    return pd.DataFrame(gaps)


department_gaps = max_precision_gaps(train_df, 'Department Name')
# NaN gaps compare as False, so a department is printed only when all three
# demographic columns could be compared and each gap is within the tolerance.
for department in department_gaps.index[(department_gaps <= MAX_PRECISION_GAP).all(axis=1)]:
    print(department)

In [None]:
# Demographic columns compared within each department, with the codes that are compared.
GROUP_CODES = {
    'SubjectSexCode': ['M', 'F'],
    'SubjectEthnicityCode': ['H', 'M', 'N'],
    'SubjectRaceCode': ['W', 'B', 'I', 'A'],
}
# A department is flagged when any demographic column shows a gap above this value.
LARGE_PRECISION_GAP = 0.9


def max_precision_gaps(stops, unit_column):
    """Largest precision gap between demographic groups, per unit and demographic column.

    Precision of a group is its number of contraband records divided by its
    number of searched vehicles. Groups without any searched vehicle have no
    defined precision (0/0) and are left out of the comparison, so the result
    does not depend on the order in which groups are listed.

    Args:
        stops (pd.DataFrame): Rows with ``unit_column``, the ``GROUP_CODES``
            columns, ``ContrabandIndicator`` and ``VehicleSearchedIndicator``.
        unit_column (str): Column identifying the unit whose groups are compared.

    Returns:
        pd.DataFrame: One row per unit and one column per ``GROUP_CODES`` key,
        holding the highest minus the lowest group precision (NaN when fewer
        than two groups can be compared).
    """
    gaps = {}
    for column, codes in GROUP_CODES.items():
        counts = (
            stops[stops[column].isin(codes)]
            .groupby([unit_column, column])[['ContrabandIndicator', 'VehicleSearchedIndicator']]
            .sum()
        )
        counts = counts[counts['VehicleSearchedIndicator'] > 0]
        # NOTE(review): contraband is counted over every row of the group, not only the
        # searched ones, so this ratio can exceed 1 — confirm the intended definition.
        precision = counts['ContrabandIndicator'] / counts['VehicleSearchedIndicator']
        by_unit = precision.groupby(level=0)
        # The largest pairwise absolute difference equals max - min.
        gaps[column] = (by_unit.max() - by_unit.min()).where(by_unit.size() >= 2)
    return pd.DataFrame(gaps)


department_gaps = max_precision_gaps(train_df, 'Department Name')
# NaN gaps compare as False, so only gaps that could actually be measured can flag a department.
differences_of_bigger_than_0dot9 = list(
    department_gaps.index[(department_gaps > LARGE_PRECISION_GAP).any(axis=1)]
)
print(differences_of_bigger_than_0dot9)

In [None]:
from itertools import combinations  # not needed by this cell; kept in case later cells rely on the import

# Demographic columns compared for each officer, with the codes that are compared.
GROUP_CODES = {
    'SubjectSexCode': ['M', 'F'],
    'SubjectEthnicityCode': ['H', 'M', 'N'],
    'SubjectRaceCode': ['W', 'B', 'I', 'A'],
}
OFFICER_COLUMN = 'ReportingOfficerIdentificationID'
# Only officers with more searched vehicles than this are analysed.
MIN_SEARCHED_VEHICLES = 100
# An officer is flagged when a gap lies in (GAP_LOWER_BOUND, GAP_UPPER_BOUND].
GAP_LOWER_BOUND = 0.9
GAP_UPPER_BOUND = 1


def max_precision_gaps(stops, unit_column):
    """Largest precision gap between demographic groups, per unit and demographic column.

    Precision of a group is its number of contraband records divided by its
    number of searched vehicles. Groups without any searched vehicle have no
    defined precision (0/0) and are left out of the comparison, so the result
    does not depend on the order in which groups are listed.

    Args:
        stops (pd.DataFrame): Rows with ``unit_column``, the ``GROUP_CODES``
            columns, ``ContrabandIndicator`` and ``VehicleSearchedIndicator``.
        unit_column (str): Column identifying the unit whose groups are compared.

    Returns:
        pd.DataFrame: One row per unit and one column per ``GROUP_CODES`` key,
        holding the highest minus the lowest group precision (NaN when fewer
        than two groups can be compared).
    """
    gaps = {}
    for column, codes in GROUP_CODES.items():
        counts = (
            stops[stops[column].isin(codes)]
            .groupby([unit_column, column])[['ContrabandIndicator', 'VehicleSearchedIndicator']]
            .sum()
        )
        counts = counts[counts['VehicleSearchedIndicator'] > 0]
        # NOTE(review): contraband is counted over every row of the group, not only the
        # searched ones, so this ratio can exceed 1 — confirm the intended definition.
        precision = counts['ContrabandIndicator'] / counts['VehicleSearchedIndicator']
        by_unit = precision.groupby(level=0)
        # The largest pairwise absolute difference equals max - min.
        gaps[column] = (by_unit.max() - by_unit.min()).where(by_unit.size() >= 2)
    return pd.DataFrame(gaps)


# Work on the needed columns only and compare officer IDs as strings on both sides:
# matching stringified IDs against the raw column would silently miss every ID that
# is not stored as a string. Rows without an officer ID are dropped.
officer_stops = train_df.dropna(subset=[OFFICER_COLUMN])[
    [OFFICER_COLUMN, 'ContrabandIndicator', 'VehicleSearchedIndicator', *GROUP_CODES]
]
officer_stops = officer_stops.assign(**{OFFICER_COLUMN: officer_stops[OFFICER_COLUMN].astype(str)})

searched_per_officer = officer_stops.groupby(OFFICER_COLUMN)['VehicleSearchedIndicator'].sum()
busy_officers = searched_per_officer.index[searched_per_officer > MIN_SEARCHED_VEHICLES]

officer_gaps = max_precision_gaps(
    officer_stops[officer_stops[OFFICER_COLUMN].isin(busy_officers)],
    OFFICER_COLUMN,
)
# NaN gaps compare as False, so only gaps that could actually be measured can flag an officer.
gap_in_range = (officer_gaps > GAP_LOWER_BOUND) & (officer_gaps <= GAP_UPPER_BOUND)
differences_of_bigger_than_0dot9_officer = list(officer_gaps.index[gap_in_range.any(axis=1)])
print(differences_of_bigger_than_0dot9_officer)

In [None]:
# Overall ratio of contraband records to searched vehicles across the whole training set.
precision_global = (
    train_df['ContrabandIndicator'].sum()
    / train_df['VehicleSearchedIndicator'].sum()
)
precision_global

Section Model expected outcomes overview

In [None]:
# Ethnicity code counts for the stops made by the 'WCSU' department.
train_df.loc[train_df['Department Name'] == 'WCSU', 'SubjectEthnicityCode'].value_counts()

Section Model specifications.
Regarding this section, the code is in the notebook that contains the pipeline and in transformers.py. Please note that some statements (e.g., that the binary encoder gave a small improvement in roc_auc compared to the ordinal encoder) can't be confirmed from the notebook, because both sources of code contain only the final version. You will have to trust me on those claims :)

Section Analysis of expected outcomes based on training set. The code is in the notebook that contains the pipeline

Section Alternatives considered. As described earlier, there is no code to prove this section

Section Model Dataset technical analysis

In [None]:
# Column dtypes of the training frame as it stands at this point of the notebook
# (earlier cells converted InterventionDateTime and added InterventionDateMonth).
train_df.dtypes

In [None]:
# Percentiles reported by describe(); the later describe cells reuse this list.
perc = [0.20, 0.40, 0.60, 0.80]

# Restrict the summary to float64 columns.
include = ['float64']

# Summary statistics for the selected columns.
desc = train_df.describe(include=include, percentiles=perc)
desc

In [None]:
# Sorted distinct values found in SubjectAge.
np.unique(train_df['SubjectAge'].values)

In [None]:
# Restrict the summary to object (string) columns; `perc` comes from the float64 describe cell.
include = ['object']

# Summary statistics for the selected columns.
desc = train_df.describe(include=include, percentiles=perc)
desc

In [None]:
# Restrict the summary to boolean columns; `perc` comes from the float64 describe cell.
include = ['bool']

# Summary statistics for the selected columns.
desc = train_df.describe(include=include, percentiles=perc)
desc

In [None]:
# How many stops did / did not end in a vehicle search.
train_df['VehicleSearchedIndicator'].value_counts()

In [None]:
# Share of stops in which the vehicle was searched, computed from train_df instead of
# hand-typed counts so the figure always matches the loaded data.
train_df['VehicleSearchedIndicator'].mean()

In [None]:
# How many stops did / did not record contraband.
train_df['ContrabandIndicator'].value_counts()

In [None]:
# Share of stops with contraband recorded, computed from train_df instead of
# hand-typed counts so the figure always matches the loaded data.
train_df['ContrabandIndicator'].mean()

In [None]:
# Contraband outcome counts restricted to stops where the vehicle was searched.
train_df.loc[train_df['VehicleSearchedIndicator'] == True, 'ContrabandIndicator'].value_counts()

In [None]:
# Share of searched vehicles in which contraband was recorded, computed from train_df
# instead of hand-typed counts so the figure always matches the loaded data.
train_df.loc[train_df['VehicleSearchedIndicator'] == True, 'ContrabandIndicator'].mean()

In [None]:
# Number of missing values in each column.
train_df.isna().sum()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Rows that repeat an earlier row across all columns; the shape gives their count.
duplicate_rows_df = train_df.loc[train_df.duplicated()]
duplicate_rows_df.shape

In [None]:
# (rows, columns) of the full training frame, shown next to the duplicate-row count above.
train_df.shape

In [None]:
# Share of rows that repeat an earlier row, computed from train_df instead of
# hand-typed counts so the figure always matches the loaded data.
train_df.duplicated().mean()

In [None]:
# Square 8x8 inch figures from here on (this changes the global matplotlib default).
inches_wide = inches_high = 8
plt.rcParams["figure.figsize"] = [inches_wide, inches_high]

# Histogram of SubjectAge limited to 0-100, with an extra tick at 16.
ax = train_df['SubjectAge'].plot.hist(
    bins=100,
    xlim=(0, 100),
    xticks=[0, 10, 16, 20, 40, 60, 80, 100],
    color='red',
    title="Age Distribution",
)
ax.set_xlabel("Age")
axis = plt.gca()

In [None]:
# Pairwise correlations between the numeric and boolean columns. The columns are
# selected explicitly because DataFrame.corr() on pandas 2.x raises when the frame
# also holds string or datetime columns instead of silently skipping them.
corr = train_df.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Contraband outcome counts restricted to stops where the vehicle was NOT searched.
train_df.loc[train_df['VehicleSearchedIndicator'] == False, 'ContrabandIndicator'].value_counts()

In [None]:
# describe() of the non-numeric columns, transposed so each column becomes a row.
cardinality = train_df.describe(exclude=np.number).transpose()
cardinality

In [None]:
# Last rows whose ethnicity code is 'M'.
train_df.loc[train_df['SubjectEthnicityCode'] == 'M'].tail()

Section Model technical analysis. Code is in the notebook that contains the pipeline