In [27]:
# Import packages

import pandas as pd
import sklearn
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

## Import: Terry Stops

In [2]:
# import Terry_Stops.csv

df_ts = pd.read_csv('data/Terry_Stops.csv')
df_ts.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,...,00:41:00,-,-,-,,N,N,-,-,-


In [3]:
df_ts["Stop Resolution"].value_counts() # significant / life-altering outcome - all without field, citation

Field Contact               19048
Offense Report              15657
Arrest                      11685
Referred for Prosecution      728
Citation / Infraction         179
Name: Stop Resolution, dtype: int64

In [4]:
df_ts["Stop Resolution"].value_counts().to_dict()

{'Field Contact': 19048,
 'Offense Report': 15657,
 'Arrest': 11685,
 'Referred for Prosecution': 728,
 'Citation / Infraction': 179}

In [5]:
# map to column

df_ts['Stop Resolution'] = df_ts['Stop Resolution'].map({'Field Contact': 0,
                                                   'Offense Report': 1,
                                                   'Arrest': 1,
                                                   'Referred for Prosecution': 1,
                                                   'Citation / Infraction': 0})


In [6]:
df_ts.iloc[[38667,38712]]

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
38667,36 - 45,17590179604,20210000038872,20880421872,1,"Personal Weapons (hands, feet, etc.)",8637,1990,F,White,...,01:21:54,BURG - IP/JO - COMM BURG (INCLUDES SCHOOLS),--BURGLARY - NON RESIDENTIAL/COMMERCIAL,911,WEST PCT 3RD W - QUEEN,Y,Y,West,Q,Q3
38712,36 - 45,19715133364,20210000009404,19715073116,0,"Personal Weapons (hands, feet, etc.)",8516,1988,M,White,...,14:30:29,"WEAPN-IP/JO-GUN,DEADLY WPN (NO THRT/ASLT/DIST)",--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON,911,WEST PCT 2ND W - MARY,N,Y,-,-,-


In [7]:
# predict *whether an arrest was made after a Terry Stop*
# Stop Resolution

# given information about the presence of weapons, the time of day of the call
# investigate subject demographics, associated precinct that made stop
# dataset also includes _information about gender and race

# What does discretion mean in this context?

In [8]:
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47297 entries, 0 to 47296
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         47297 non-null  object
 1   Subject ID                47297 non-null  int64 
 2   GO / SC Num               47297 non-null  int64 
 3   Terry Stop ID             47297 non-null  int64 
 4   Stop Resolution           47297 non-null  int64 
 5   Weapon Type               47297 non-null  object
 6   Officer ID                47297 non-null  object
 7   Officer YOB               47297 non-null  int64 
 8   Officer Gender            47297 non-null  object
 9   Officer Race              47297 non-null  object
 10  Subject Perceived Race    47297 non-null  object
 11  Subject Perceived Gender  47297 non-null  object
 12  Reported Date             47297 non-null  object
 13  Reported Time             47297 non-null  object
 14  Initial Call Type     

## Clean Data

In [9]:
# Classification, dealing with null values
# Train test
# Data cleaning
# Convert training data to numeric information
# FSM - first simple model (throw data in and see if it works)

In [10]:
df_ts.iloc[17].head()
df_ts.describe()

Unnamed: 0,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Officer YOB
count,47297.0,47297.0,47297.0,47297.0,47297.0
mean,2567544000.0,20175620000000.0,3663859000.0,0.593484,1982.969766
std,4608972000.0,94402950000.0,6368353000.0,0.491188,9.083073
min,-1.0,-1.0,28020.0,0.0,1900.0
25%,-1.0,20160000000000.0,182146.0,0.0,1978.0
50%,-1.0,20180000000000.0,405037.0,1.0,1985.0
75%,7725995000.0,20190000000000.0,8333543000.0,1.0,1990.0
max,23459400000.0,20210000000000.0,23465240000.0,1.0,1998.0


In [11]:
# delete column with null values

df_ts.drop(columns=["Officer Squad"], inplace=True)

In [12]:
# drop rows 

df_ts.dropna(inplace=True)

In [13]:
df_ts.isna().sum()

Subject Age Group           0
Subject ID                  0
GO / SC Num                 0
Terry Stop ID               0
Stop Resolution             0
Weapon Type                 0
Officer ID                  0
Officer YOB                 0
Officer Gender              0
Officer Race                0
Subject Perceived Race      0
Subject Perceived Gender    0
Reported Date               0
Reported Time               0
Initial Call Type           0
Final Call Type             0
Call Type                   0
Arrest Flag                 0
Frisk Flag                  0
Precinct                    0
Sector                      0
Beat                        0
dtype: int64

In [14]:
df_ts['Arrest Flag'].value_counts()

N    43820
Y     3477
Name: Arrest Flag, dtype: int64

In [15]:
df_ts['Arrest Flag'] = df_ts['Arrest Flag'].replace('Y', 1)
df_ts['Arrest Flag'] = df_ts['Arrest Flag'].replace('N', 0)
df_ts['Arrest Flag'].value_counts()

# replace w sklearn labelencoder

0    43820
1     3477
Name: Arrest Flag, dtype: int64

In [16]:
df_ts['Weapon Type'].value_counts()

None                                    32565
-                                       11935
Lethal Cutting Instrument                1482
Knife/Cutting/Stabbing Instrument         636
Handgun                                   291
Firearm Other                             100
Blunt Object/Striking Implement            86
Club, Blackjack, Brass Knuckles            49
Firearm                                    38
Mace/Pepper Spray                          28
Other Firearm                              23
Firearm (unk type)                         15
Taser/Stun Gun                             10
Club                                        9
Fire/Incendiary Device                      7
None/Not Applicable                         7
Rifle                                       7
Shotgun                                     3
Automatic Handgun                           2
Personal Weapons (hands, feet, etc.)        2
Blackjack                                   1
Brass Knuckles                    

In [17]:
df_ts['Weapon Type'] = df_ts['Weapon Type'].map({'Lethal Cutting Instrument': 'Non-Firearm', 
                                           'Knife/Cutting/Stabbing Instrument': 'Non-Firearm',
                                           'Club, Blackjack, Brass Knuckles': 'Non-Firearm',
                                           'Blunt Object/Striking Implement': 'Non-Firearm',
                                           'Mace/Pepper Spray': 'Non-Firearm', 'Club':'Non-Firearm',
                                           'Taser/Stun Gun':'Non-Firearm', 'Blackjack':'Non-Firearm',
                                           'Brass Knuckles':'Non-Firearm', 'Fire/Incendiary Device':'Non-Firearm',
                                           'Handgun':'Firearm','Firearm Other':'Firearm',
                                           'Firearm (unk type)':'Firearm','Firearm':'Firearm',
                                           'Other Firearm':'Firearm', 'Rifle':'Firearm','Shotgun':'Firearm',
                                           'Automatic Handgun':'Firearm','None':'None','-':'None',
                                           'None/Not Applicable':'None',
                                           'Personal Weapons (hands, feet, etc.)':'Non-Firearm'})


In [18]:
df_ts[df_ts['Weapon Type'].isna()]

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Date,Reported Time,Initial Call Type,Final Call Type,Call Type,Arrest Flag,Frisk Flag,Precinct,Sector,Beat


In [19]:
# change column datatypes

In [20]:
# create new dataframe with only relevant columns

# relevant_cols = df_ts[['Arrest Flag','Weapon Type','Frisk Flag','Precinct','Officer Gender','Subject Perceived Gender','Subject Perceived Race']]

In [21]:
# how else do we need to modify these columns: 'Weapon Type','Frisk Flag','Precinct','Officer Gender','Subject Perceived Gender','Subject Perceived Race'

In [22]:
df_ts.isnull().sum().sum()
display(df_ts['Weapon Type'].isnull().values.any())
display(df_ts['Frisk Flag'].isnull().values.any())
display(df_ts['Precinct'].isnull().values.any())
display(df_ts['Officer Gender'].isnull().values.any())
display(df_ts['Subject Perceived Gender'].isnull().values.any())
display(df_ts['Arrest Flag'].isnull().values.any())
df_ts['Subject Perceived Race'].isnull().values.any()

False

False

False

False

False

False

False

In [23]:
df_ts.drop(columns=['Subject ID','GO / SC Num','Terry Stop ID','Officer ID','Officer YOB','Officer Race',
                'Reported Date','Reported Time','Initial Call Type','Final Call Type','Call Type','Sector','Beat'], axis=1, inplace=True)

## Train Test Split

In [24]:
X = df_ts.loc[:, ['Weapon Type','Frisk Flag','Precinct','Officer Gender','Subject Perceived Gender',
                  'Subject Perceived Race']]
y = df_ts.loc[:, 'Stop Resolution'] #see Smote

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42)

In [25]:
# use ohe on training data
ohe = OneHotEncoder()

ohe.fit(X_train)
X_train_ohe = ohe.transform(X_train).toarray()
X_test_ohe = ohe.transform(X_test).toarray()

# create dataframe with training and testing data
ohe_df1 = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names(X_train.columns))
ohe_df2 = pd.DataFrame(X_test_ohe, columns=ohe.get_feature_names(X_test.columns))
ohe_df = pd.concat([ohe_df1,ohe_df2])

## Create Logistic Regression Model

In [26]:
lr = LogisticRegression()
lr.fit(X_train_ohe, y_train)
y_pred = lr.predict(X_test_ohe)

# accuracy check - (tp + tn) / (p + n)
display('Logistic Regression Accuracy: ', accuracy_score(y_test, y_pred)*100,'%')

# precision - tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)

# recall - tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)

# f1 - 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

# classification report
# display(classification_report(y_test, y_pred))

# true test is confusion matrix due to 0 across



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


'Logistic Regression Accuracy: '

78.61874559548978

'%'

Precision: 0.748813
Recall: 0.960029
F1 score: 0.841368


## Create K Nearest Neighbor Model

In [None]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    '''This function will determine which k value is optimal to use for knn classification by selecting the k value
    with the highest overall accuracy score.
    Inputs:
    X_train, y_train, X_test, y_test: tran and test set values
    min_k: minimum value to try for k
    max_k: maximum value to try for k
        '''
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1,2):
        knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        accuracy = accuracy_score(y_test, preds)
        if accuracy > best_score:
            best_k = k
            best_score = accuracy
    
    print("Best Value for k: {}".format(best_k))
    print("Accuracy Score: {}".format(best_score))
    
print(find_best_k(X_train_ohe, y_train, X_test_ohe, y_test))

In [30]:
model = KNeighborsClassifier(n_neighbors=3, p=1)
cross_val_score(model, X_train, y_train, cv=5)

Traceback (most recent call last):
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/neighbors/_base.py", line 1132, in fit
    multi_output=True)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/utils/validation.py", line 802, in check_X_y
    estimator=estimator)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/envs/learn-env/lib/python3.6/site-packages/s

array([nan, nan, nan, nan, nan])

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'n_neighbors':np.arange(1,10,2), 'p':[1, 2]}

model = knn
grid_search = GridSearchCV(model, parameters, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

In [None]:
best_model.score(X_test, y_test)

## Create Decision Tree Model

In [None]:
dt = DecisionTreeClassifier()
