In [None]:
# %%

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# %%

data_raw = pd.read_csv('df_clean.csv')
data_raw

  data_raw = pd.read_csv('df_clean.csv')


Unnamed: 0,year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,...,addrpct,sector,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm,height
0,2012,40,17,1012012,115,1,O,Neither,2,ROBBERY,...,40,C,*,,1008031,233036,CM,1,85,69
1,2012,23,691,1012012,310,1,I,Neither,2,M,...,23,D,,12,1000852,228179,CM,1,9,67
2,2012,81,3714,1012012,2000,1,O,Neither,1,ROBBERY,...,81,C,3,,1001869,190702,CM,1,85,69
3,2012,81,633,1022012,1245,1,O,Neither,3,ROBBERY,...,81,J,*,,1005306,186668,CM,1,85,65
4,2012,66,36,1042012,2220,A,O,Neither,2,FELONY,...,66,J,,,986887,173599,CM,1,46,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527410,2012,123,1949,11042012,1930,A,O,Neither,1,FELONY,...,123,B,,,916798,125363,CM,1,14,70
527411,2012,123,1950,11042012,1930,A,O,Neither,1,FELONY,...,123,B,,,916798,125363,CM,1,14,64
527412,2012,123,1951,11042012,2305,A,O,Neither,2,FELONY,...,123,C,,,928562,126358,CM,1,14,65
527413,2012,123,1952,11042012,2305,A,O,Neither,2,FELONY,...,123,C,,,928562,126358,CM,1,14,68


In [None]:
# %%

# Check the content and missing values for the identified weapon-related columns
selected_weapon_columns = ['pistol', 'riflshot', 'asltweap', 'knifcuti', 'machgun', 'othrweap']
weapon_info = data_raw[selected_weapon_columns].describe(include='all')

# Display the summary of weapon-related columns
weapon_info

Unnamed: 0,pistol,riflshot,asltweap,knifcuti,machgun,othrweap
count,527415,527415,527415,527415,527415,527415
unique,2,2,2,2,2,2
top,N,N,N,N,N,N
freq,526754,527402,527364,522754,527412,526105


In [None]:
# %%

# Create a binary target variable 'armed'
# Assign 1 if any weapon-related column has 'Y', otherwise 0
data_raw['armed'] = ((data_raw['pistol'] == 'Y') | (data_raw['riflshot'] == 'Y') | (data_raw['asltweap'] == 'Y') | (data_raw['knifcuti'] == 'Y') | (data_raw['machgun'] == 'Y') | (data_raw['othrweap'] == 'Y')).astype(int)

# Check the distribution of the new 'armed' column
armed_distribution = data_raw['armed'].value_counts()

# Display the distribution of the 'armed' target variable
armed_distribution

armed
0    521033
1      6382
Name: count, dtype: int64

In [None]:
# %%

# Feature selection based on relevance and initial analysis
selected_features = ['trhsloc', 'perobs', 'frisked', 'searched', 'contrabn', 'inout', 'sex', 'race', 'height', 'build']

# Extract the selected features and target variable
feature_data = data_raw[selected_features + ['armed']]

# Check for missing values in the selected features
missing_values = feature_data.isnull().sum()

# Display the missing values information
missing_values

trhsloc     0
perobs      0
frisked     0
searched    0
contrabn    0
inout       0
sex         0
race        0
height      0
build       0
armed       0
dtype: int64

In [None]:
# %%

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical, numerical, binary columns
categorical_columns = ['trhsloc', 'inout', 'sex', 'race', 'build']
numerical_columns = ['perobs', 'height'] 
binary_columns = ['frisked', 'searched', 'contrabn']

for col in binary_columns:
    data_raw[col] = (data_raw[col] == 'Y').astype(int)

In [None]:
# %%

# Create transformers for categorical and numerical data
categorical_transformer = OneHotEncoder(drop='first')  # Use drop='first' to avoid dummy variable trap
numerical_transformer = StandardScaler()

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Apply transformations to the selected features
X = feature_data.drop('armed', axis=1)  # Features
y = feature_data['armed']  # Target variable

# Create a preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_transformed = pipeline.fit_transform(X)

# Display the shape of the transformed feature matrix to confirm the changes
X_transformed.shape

(527415, 16)

In [None]:
# %%

X_transformed

array([[-0.11584972,  0.12620042,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.11584972, -0.50599435,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.39943181,  0.12620042,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.11584972, -1.13818913,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.11584972, -0.18989697,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.39943181,  0.44229781,  1.        , ...,  1.        ,
         0.        ,  0.        ]])

In [None]:
# %%

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the model: Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Define the cross-validation strategy: StratifiedKFold to maintain the proportion of the target class
cv_strategy = StratifiedKFold(n_splits=5)

# Define scoring metrics
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
}

# Perform cross-validation using multiple scoring metrics
cv_results = cross_validate(rf_classifier, X_transformed, y, cv=cv_strategy, scoring=scoring_metrics)

# Collect average scores across all cross-validation folds for each metric
average_scores = {metric: scores.mean() for metric, scores in cv_results.items() if 'test_' in metric}

# Display the average scores
average_scores

{'test_accuracy': 0.6943848771840012,
 'test_precision': 0.016940385458758908,
 'test_recall': 0.42478538797092524,
 'test_f1': 0.03257700583733002}

In [None]:
# %%

# Re-select features for correlation analysis including the target 'armed'
selected_features_for_correlation = ['trhsloc', 'perobs', 'frisked', 'searched', 'contrabn', 'inout', 'sex', 'race', 'height', 'build', 'armed']

# Extract these features from the dataset
correlation_data = data_raw[selected_features_for_correlation]

# Explicitly convert binary and categorical variables for correlation analysis
categorical_to_convert = ['frisked', 'searched', 'contrabn', 'sex', 'race', 'inout', 'trhsloc', 'build']

# Convert 'Y'/'N' binary features to 0/1 and other categorical features to numerical codes
for col in categorical_to_convert:
    # If the feature is known to be binary and stored as 'Y'/'N'
    if data_raw[col].dtype == object and sorted(data_raw[col].unique()) == ['N', 'Y']:
        data_raw[col] = (data_raw[col] == 'Y').astype(int)
    else:
        # Convert other categorical features to category codes
        data_raw[col] = data_raw[col].astype('category').cat.codes

# Now recalculate the correlation matrix with these adjustments
correlation_matrix = data_raw[categorical_to_convert + ['armed']].corr()

# Display the updated correlation matrix
correlation_matrix

Unnamed: 0,frisked,searched,contrabn,sex,race,inout,trhsloc,build,armed
frisked,1.0,0.240968,0.078869,0.130021,-0.013849,0.138681,-0.014846,0.014751,0.080365
searched,0.240968,1.0,0.273535,0.011989,0.01159,-0.001536,-0.00787,0.004497,0.243112
contrabn,0.078869,0.273535,1.0,-0.009198,0.005614,0.003809,-0.033605,0.00034,0.073159
sex,0.130021,0.011989,-0.009198,1.0,0.003588,0.08784,0.005456,-0.020446,0.007211
race,-0.013849,0.01159,0.005614,0.003588,1.0,0.043612,0.06769,-0.028573,0.018387
inout,0.138681,-0.001536,0.003809,0.08784,0.043612,1.0,-0.131959,-0.010237,-0.013674
trhsloc,-0.014846,-0.00787,-0.033605,0.005456,0.06769,-0.131959,1.0,-0.002517,0.042146
build,0.014751,0.004497,0.00034,-0.020446,-0.028573,-0.010237,-0.002517,1.0,-0.003249
armed,0.080365,0.243112,0.073159,0.007211,0.018387,-0.013674,0.042146,-0.003249,1.0


In [None]:
# %%

# Check value counts for each binary and categorical variable
value_counts = {col: data_raw[col].value_counts() for col in categorical_to_convert}
value_counts

{'frisked': frisked
 1    294240
 0    233175
 Name: count, dtype: int64,
 'searched': searched
 0    483607
 1     43808
 Name: count, dtype: int64,
 'contrabn': contrabn
 0    518302
 1      9113
 Name: count, dtype: int64,
 'sex': sex
 1    482321
 0     37557
 2      7537
 Name: count, dtype: int64,
 'race': race
 2    281294
 6    128171
 5     49871
 3     35358
 1     16952
 4     13532
 0      2237
 Name: count, dtype: int64,
 'inout': inout
 1    413416
 0    113999
 Name: count, dtype: int64,
 'trhsloc': trhsloc
 1    409788
 0     76895
 2     40732
 Name: count, dtype: int64,
 'build': build
 1    308540
 3    173523
 0     43093
 2      2259
 Name: count, dtype: int64}

In [None]:
# %%

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Prepare the features (X) and target (y)
X = data_raw[selected_features]  # Assuming 'selected_features' includes all the features you want to use
y = data_raw['armed']            # The target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

# Extract feature importance
feature_importances = rf.feature_importances_
features = X_train.columns
importance_dict = dict(zip(features, feature_importances))

# Display sorted features by their importance
sorted_importance = sorted(importance_dict.items(), key=lambda item: item[1], reverse=True)
sorted_importance

[('searched', 0.45338695349301583),
 ('height', 0.1476364546901742),
 ('frisked', 0.14310551184762763),
 ('perobs', 0.0702087664916942),
 ('race', 0.052632854670990686),
 ('trhsloc', 0.04894988955398196),
 ('build', 0.033421733890513144),
 ('contrabn', 0.024843098501893524),
 ('sex', 0.013336509876415133),
 ('inout', 0.012478226983693576)]

In [None]:
# %%

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Make predictions on the test data
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Display the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

Accuracy: 0.9204421565560327
Precision: 0.07973068745570518
Recall: 0.5289968652037618
F1 Score: 0.13857524122356804
ROC-AUC: 0.7549855366848353
