# Pre-Processing

In [1]:
#import Libraries

# Libraries to connect to database
from urllib.parse import quote
from sqlalchemy import create_engine
import psycopg2
from config import passwordAWS

# Libraries used to build supervised model
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import datetime as dt
from sklearn.metrics import accuracy_score

In [2]:
# Create a database connection string.
# The password is percent-encoded first so that reserved URL characters in it
# (for example "@", ":" or "/") cannot be misread as URL delimiters.
encoded_password = quote(passwordAWS, safe="")
db_string = f"postgresql://postgres:{encoded_password}@crime-time.ce0zc4gjswru.us-east-2.rds.amazonaws.com:5432/postgres"

# Create a database engine
engine = create_engine(db_string)

In [3]:
# Load every row of the "crime_merged" table into a DataFrame and preview it
crime_df = pd.read_sql_query(sql='SELECT * FROM "crime_merged"', con=engine)
crime_df.head()

Unnamed: 0,id_no,primary_type,location_description,arrest,domestic,crime_date,crime_time,block,district,latitude,longitude
0,10224740,NARCOTICS,SIDEWALK,True,False,2015-09-05,12:45:00,035XX W BARRY AVE,14,41.937406,-87.71665
1,10224789,WEAPONS VIOLATION,STREET,True,False,2015-09-05,15:12:00,052XX W QUINCY ST,15,41.877553,-87.75613
2,10224860,THEFT,RESIDENCE,False,False,2015-09-05,16:20:00,016XX N PARKSIDE AVE,25,41.910285,-87.766772
3,10224914,OTHER OFFENSE,SMALL RETAIL STORE,False,False,2015-09-05,19:10:00,032XX W LAWRENCE AVE,17,41.968435,-87.710088
4,10224944,PUBLIC PEACE VIOLATION,APARTMENT,True,False,2015-09-05,13:21:00,046XX S TALMAN AVE,9,41.809611,-87.690579


In [4]:
# Count missing values per column to confirm there are none
crime_df.isna().sum()

id_no                   0
primary_type            0
location_description    0
arrest                  0
domestic                0
crime_date              0
crime_time              0
block                   0
district                0
latitude                0
longitude               0
dtype: int64

In [5]:
# Number of non-null entries in each column
crime_df.count(axis="index")

id_no                   1048575
primary_type            1048575
location_description    1048575
arrest                  1048575
domestic                1048575
crime_date              1048575
crime_time              1048575
block                   1048575
district                1048575
latitude                1048575
longitude               1048575
dtype: int64

In [6]:
# Drop the columns that are not used as model features
cleaned_crime_df = crime_df.drop(['id_no', 'block', 'domestic'], axis="columns")

In [7]:
# Derive the hour of day from crime_time and the month from crime_date.
# The source columns stay in the frame here.
crime_times = pd.to_datetime(cleaned_crime_df["crime_time"])
crime_dates = pd.to_datetime(cleaned_crime_df["crime_date"])
cleaned_crime_df["Hour"] = crime_times.dt.hour
cleaned_crime_df["Month"] = crime_dates.dt.month

In [8]:
# Bucket the hour of day into six 4-hour windows labelled 1-6
# (1 = hours 0-3, 2 = 4-7, 3 = 8-11, 4 = 12-15, 5 = 16-19, 6 = 20-23)
bins = [-1, 3, 7, 11, 15, 19, 24]
cleaned_crime_df['Time_binned'] = pd.cut(
    cleaned_crime_df['Hour'], bins=bins, labels=list(range(1, 7))
)

In [9]:
# Hour is superseded by Time_binned and the raw date/time columns are no longer
# needed; drop all three and preview the resulting frame
cleaned_crime_df = cleaned_crime_df.drop(['Hour', 'crime_date', 'crime_time'], axis="columns")
cleaned_crime_df.head()

Unnamed: 0,primary_type,location_description,arrest,district,latitude,longitude,Month,Time_binned
0,NARCOTICS,SIDEWALK,True,14,41.937406,-87.71665,9,4
1,WEAPONS VIOLATION,STREET,True,15,41.877553,-87.75613,9,4
2,THEFT,RESIDENCE,False,25,41.910285,-87.766772,9,5
3,OTHER OFFENSE,SMALL RETAIL STORE,False,17,41.968435,-87.710088,9,5
4,PUBLIC PEACE VIOLATION,APARTMENT,True,9,41.809611,-87.690579,9,4


In [10]:
# Label-encode the categorical text columns (crime type and location) as integers.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the integer
# codes can be mapped back to the original strings later via
# label_encoders[column].inverse_transform(...). Re-fitting one shared encoder
# on the second column would discard the primary_type mapping.
label_encoders = {}
for column in ['primary_type', 'location_description']:
    le = LabelEncoder()
    cleaned_crime_df[column] = le.fit_transform(cleaned_crime_df[column])
    label_encoders[column] = le
# After the loop `le` is the location_description encoder, as it was before.
cleaned_crime_df.head(10)

Unnamed: 0,primary_type,location_description,arrest,district,latitude,longitude,Month,Time_binned
0,18,120,True,14,41.937406,-87.71665,9,4
1,33,124,True,15,41.877553,-87.75613,9,4
2,32,103,False,25,41.910285,-87.766772,9,5
3,25,121,False,17,41.968435,-87.710088,9,5
4,28,17,True,9,41.809611,-87.690579,9,4
5,2,17,False,9,41.797308,-87.644017,9,1
6,6,17,False,20,41.973401,-87.672158,9,6
7,32,124,False,12,41.886324,-87.660787,9,4
8,3,110,False,14,41.906697,-87.671705,9,1
9,32,44,False,1,41.879112,-87.626111,9,6


# RandomForest Model

In [11]:
# Features: every column except the "arrest" target (drop returns a new frame)
X = cleaned_crime_df.drop(columns=["arrest"])
X.head(10)

Unnamed: 0,primary_type,location_description,district,latitude,longitude,Month,Time_binned
0,18,120,14,41.937406,-87.71665,9,4
1,33,124,15,41.877553,-87.75613,9,4
2,32,103,25,41.910285,-87.766772,9,5
3,25,121,17,41.968435,-87.710088,9,5
4,28,17,9,41.809611,-87.690579,9,4
5,2,17,9,41.797308,-87.644017,9,1
6,6,17,20,41.973401,-87.672158,9,6
7,32,124,12,41.886324,-87.660787,9,4
8,3,110,14,41.906697,-87.671705,9,1
9,32,44,1,41.879112,-87.626111,9,6


In [12]:
# Define the target: the "arrest" column as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() is the documented
# replacement for converting a Series to a NumPy array.
y = cleaned_crime_df["arrest"].to_numpy()
y[:5]

array([ True,  True, False, False,  True])

In [13]:
# Split the features and target into train and test partitions (seeded).
# NOTE(review): train_size=0.25 trains on a quarter of the rows and evaluates on
# the remaining three quarters; confirm this is intended rather than test_size=0.25.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.25,
    random_state=78,
)

In [14]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both the training and the test split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [15]:
# WARNING: COMPUTATIONALLY HEAVY
# Create a random forest classifier with 70 trees and a fixed seed.
# n_jobs=-1 lets scikit-learn use all available CPU cores for fitting and
# predicting instead of the single-core default, which shortens this cell.
rf_model = RandomForestClassifier(n_estimators=70, random_state=78, n_jobs=-1)

# Fit the model on the scaled training split
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# Rank the features from most to least important as (importance, name) pairs
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.3167493765406706, 'primary_type'),
 (0.19823192252803026, 'latitude'),
 (0.19474515759861025, 'longitude'),
 (0.1122103688419249, 'location_description'),
 (0.09510691433409076, 'Month'),
 (0.04711283702669818, 'Time_binned'),
 (0.03584342312997507, 'district')]

In [17]:
# Predict the arrest outcome for every row of the scaled test split
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([False, False, False, ..., False,  True, False])

In [18]:
# Confusion matrix of actual vs. predicted arrest outcomes
cm = confusion_matrix(y_test, predictions)

# Wrap it in a labelled DataFrame: rows are actual classes, columns are predicted
row_labels = ["Actual Not Arrested", "Actual Arrested"]
column_labels = ["Predicted Not Arrested", "Predicted Arrested"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted Not Arrested,Predicted Arrested
Actual Not Arrested,605191,18597
Actual Arrested,79411,83233


In [19]:
# Per-class precision, recall and F1 score on the test split
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

       False       0.88      0.97      0.93    623788
        True       0.82      0.51      0.63    162644

    accuracy                           0.88    786432
   macro avg       0.85      0.74      0.78    786432
weighted avg       0.87      0.88      0.86    786432



In [20]:
# Overall fraction of test rows whose arrest outcome was predicted correctly
accuracy_score(y_true=y_test, y_pred=predictions)

0.8753763834635416