#### Problem Statement: Predicting the probability of crimes in District of Columbia.

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.ensemble import RandomForestClassifier
import lightgbm
import mlflow

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import f1_score


##### Feature Engineering & Selection

In [58]:
df = pd.read_csv('cleaned_dc_crime.csv')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24535 entries, 0 to 24534
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   x                     24535 non-null  float64
 1   y                     24535 non-null  float64
 2   ccn                   24535 non-null  int64  
 3   report_dat            24535 non-null  object 
 4   shift                 24535 non-null  object 
 5   method                24535 non-null  object 
 6   offense               24535 non-null  object 
 7   block                 24535 non-null  object 
 8   xblock                24535 non-null  float64
 9   yblock                24535 non-null  float64
 10  ward                  24535 non-null  float64
 11  anc                   24535 non-null  object 
 12  district              24535 non-null  float64
 13  psa                   24535 non-null  float64
 14  neighborhood_cluster  24535 non-null  object 
 15  block_group        

In [60]:
num_types = ['int64', 'float64']

In [61]:
df['anc'].unique()
#shift, method,offense,block,anc

array(['5A', '7F', '2E', '8C', '2C', '3E', '5F', '5C', '5D', '1B', '7D',
       '8E', '8D', '1A', '8F', '1E', '3C', '2A', '4A', '4B', '4E', '6B',
       '8A', '7B', '7C', '2B', '6D', '6E', '3A', '1C', '3F', '2F', '2G',
       '7E', '6A', '5E', '5B', '3/4G', '4C', '6C', '8B', '4D', '1D', '2D',
       '3B', '3D'], dtype=object)

In [62]:
features = ['ward','district','shift','method','block','anc','offense']
new_df = df[features]

In [63]:
new_df.head()

Unnamed: 0,ward,district,shift,method,block,anc,offense
0,5.0,4.0,DAY,OTHERS,700 - 742 BLOCK OF DELAFIELD STREET NE,5A,THEFT F/AUTO
1,7.0,6.0,EVENING,OTHERS,4500 - 4599 BLOCK OF EADS PLACE NE,7F,THEFT/OTHER
2,7.0,6.0,MIDNIGHT,GUN,4400 - 4499 BLOCK OF TEXAS AVENUE SE,7F,ROBBERY
3,2.0,2.0,EVENING,OTHERS,1851 - 2008 BLOCK OF WISCONSIN AVENUE NW,2E,THEFT/OTHER
4,8.0,7.0,EVENING,OTHERS,710 - 798 BLOCK OF ALABAMA AVENUE SE,8C,THEFT/OTHER


In [64]:
new_df['offense'].value_counts()

offense
THEFT/OTHER                   10874
THEFT F/AUTO                   5541
MOTOR VEHICLE THEFT            4247
ROBBERY                        1821
ASSAULT W/DANGEROUS WEAPON      887
BURGLARY                        860
HOMICIDE                        168
SEX ABUSE                       132
ARSON                             5
Name: count, dtype: int64

In [65]:
# Ordinal Encoding is used to encode the value of offense
#8.0 => THEFT/OTHER 
#7.0 => THEFT F/AUTO 
#4.0 => MOTOR VEHICLE THEFT 
#5.0 => ROBBERY 
#1.0 => ASSAULT W/DANGEROUS WEAPON 
#2.0 => BURGLARY 
#3.0 => HOMICIDE 
#6.0 => SEX ABUSE
#0.0 => ARSON
encoder = OrdinalEncoder()
new_df['offense'] = encoder.fit_transform(df[['offense']])
new_df['offense'].value_counts(normalize=True)

offense
8.0    0.443204
7.0    0.225841
4.0    0.173100
5.0    0.074221
1.0    0.036152
2.0    0.035052
3.0    0.006847
6.0    0.005380
0.0    0.000204
Name: proportion, dtype: float64

In [66]:
SEED = 42
df_full_train, df_test = train_test_split(new_df, test_size=0.2, random_state=SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)
len(df_full_train), len(df_test), len(df_train)+len(df_val), len(df_train), len(df_val)

(19628, 4907, 19628, 14721, 4907)

In [67]:
# split the customer_status feature from the rest of the dataframe
y_train = df_train['offense'].values
y_val = df_val['offense'].values
y_test = df_test['offense'].values

In [68]:
del df_train['offense']
del df_val['offense']
del df_test['offense']

In [69]:
df_full_train = df_full_train.reset_index(drop=True)

In [70]:
df_train = df_train.reset_index(drop=True)
df_val =  df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [71]:
total_offense = df_full_train.offense.value_counts()[0.0]
total_offense_rate = total_offense/df_full_train.offense.count()

In [72]:
total_offense_rate = total_offense_rate
total_offense_rate

0.00025473812920317915

In [73]:
num_types = ['int64', 'float64']
numeric = list(new_df.select_dtypes(include=num_types))

In [74]:
categorical = list(df_full_train.select_dtypes(include=['O']))

In [75]:
categorical

['shift', 'method', 'block', 'anc']

In [76]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).offense.agg(['mean', 'count'])
    df_group['diff'] = total_offense_rate - df_group['mean']
    df_group['risk_ratio'] =  df_group['mean'] / total_offense_rate 
    display(df_group)
    print()
    print()

shift


Unnamed: 0_level_0,mean,count,diff,risk_ratio
shift,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DAY,6.561651,7729,-6.561396,25758.416872
EVENING,6.558448,7836,-6.558193,25745.844206
MIDNIGHT,5.545656,4063,-5.545401,21770.026877




method


Unnamed: 0_level_0,mean,count,diff,risk_ratio
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GUN,3.702044,1517,-3.701789,14532.741991
KNIFE,3.183824,272,-3.183569,12498.417647
OTHERS,6.623521,17839,-6.623267,26001.295992




block


Unnamed: 0_level_0,mean,count,diff,risk_ratio
block,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0 - 0 BLOCK OF BEACH DRIVE NW,7.0,1,-6.999745,27479.2
1 - 1 BLOCK OF CHEVY CHASE CIRCLE NW,7.5,2,-7.499745,29442.0
1 - 1 BLOCK OF THOMAS CIRCLE NW,8.0,33,-7.999745,31404.8
1 - 10 BLOCK OF KENNEDY STREET NW,4.0,1,-3.999745,15702.4
1 - 10 BLOCK OF RIDGE ROAD SE,1.0,1,-0.999745,3925.6
...,...,...,...,...
WHITEHAVEN PARKWAY NW AND WISCONSIN AVENUE NW,7.0,1,-6.999745,27479.2
WHITNEY M YOUNG BRIDGE (BRIDGE),1.0,1,-0.999745,3925.6
WISCONSIN AVENUE NW AND FESSENDEN STREET NW,4.0,1,-3.999745,15702.4
WOODLEY PLACE NW AND WOODLEY ROAD NW,7.0,1,-6.999745,27479.2




anc


Unnamed: 0_level_0,mean,count,diff,risk_ratio
anc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1A,6.697436,780,-6.697181,26291.454359
1B,6.714148,1039,-6.713893,26357.06025
1C,6.983368,481,-6.983113,27413.909356
1D,5.942857,210,-5.942602,23329.28
1E,6.326733,404,-6.326478,24836.221782
2A,7.417544,285,-7.417289,29118.310175
2B,7.163842,531,-7.163587,28122.377401
2C,7.322702,1066,-7.322447,28745.997749
2D,7.142857,35,-7.142602,28040.0
2E,7.391924,421,-7.391669,29017.736817






In [77]:
def mutual_info_offense_rate(series):
    return mutual_info_score(series, df_full_train.offense)

In [78]:
mutual_info = df_full_train[categorical].apply(mutual_info_offense_rate)
mutual_info.sort_values(ascending=False)

block     0.852577
method    0.203896
anc       0.093815
shift     0.045702
dtype: float64

In [79]:
df_full_train[numeric].corrwith(df_full_train.offense)

ward       -0.252866
district   -0.255469
offense     1.000000
dtype: float64

In [80]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("dc-crime-experiment")

<Experiment: artifact_location='file:///d:/code/dc_crime/notebook/mlruns/1', creation_time=1733341301047, experiment_id='1', last_update_time=1733341301047, lifecycle_stage='active', name='dc-crime-experiment', tags={}>

In [81]:
def train(data, y,model):
    dicts = data[categorical + numeric].to_dict(orient='records')

    dv = DictVectorizer(sparse=True)
    X_train = dv.fit_transform(dicts)

    m = model.fit(X_train, y)

    return dv, m

def predict(data, dv, model):
    dicts = data[categorical + numeric].to_dict(orient='records')

    X = dv.transform(dicts)

    y_pred = model.predict(X)
    y_pred_prob = model.predict_proba(X)

    return y_pred,y_pred_prob

In [84]:
rf = RandomForestClassifier(
    max_depth=None,
    n_estimators=100,
    min_samples_leaf=1,
    random_state=1,
    n_jobs=-1,
        )