# Pre-Processing

In [4]:
#import

import datetime as dt
import io
from datetime import datetime
from urllib.parse import quote

import pandas as pd
import psycopg2
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import passwordAWS

In [5]:
# Create a database connection string.
# The password is percent-encoded so that reserved URL characters in it
# (e.g. "@", ":", "/", "#") cannot be misread as URL delimiters.
# NOTE(review): assumes passwordAWS holds the raw (not already percent-encoded) password — confirm.
db_string = f"postgresql://postgres:{quote(passwordAWS, safe='')}@crime-time.ce0zc4gjswru.us-east-2.rds.amazonaws.com:5432/postgres"

# Create the SQLAlchemy engine used to query the database.
engine = create_engine(db_string)

In [7]:
# Load every row of the "crime_merged" table into a DataFrame and preview it.
crime_df = pd.read_sql_query(
    sql='SELECT * FROM "crime_merged"',
    con=engine,
)
crime_df.head()

Unnamed: 0,id_no,primary_type,location_description,arrest,domestic,crime_date,crime_time,block,district,latitude,longitude
0,10224740,NARCOTICS,SIDEWALK,True,False,2015-09-05,12:45:00,035XX W BARRY AVE,14,41.937406,-87.71665
1,10224789,WEAPONS VIOLATION,STREET,True,False,2015-09-05,15:12:00,052XX W QUINCY ST,15,41.877553,-87.75613
2,10224860,THEFT,RESIDENCE,False,False,2015-09-05,16:20:00,016XX N PARKSIDE AVE,25,41.910285,-87.766772
3,10224914,OTHER OFFENSE,SMALL RETAIL STORE,False,False,2015-09-05,19:10:00,032XX W LAWRENCE AVE,17,41.968435,-87.710088
4,10224944,PUBLIC PEACE VIOLATION,APARTMENT,True,False,2015-09-05,13:21:00,046XX S TALMAN AVE,9,41.809611,-87.690579


In [8]:
# Taking Count: number of non-null values in each column (before dropping NA rows)
crime_df.count()

id_no                   1048575
primary_type            1048575
location_description    1048575
arrest                  1048575
domestic                1048575
crime_date              1048575
crime_time              1048575
block                   1048575
district                1048575
latitude                1048575
longitude               1048575
dtype: int64

In [9]:
# Discard every row that has at least one missing value.
crime_df = crime_df.dropna(axis=0, how="any")

In [10]:
# Validate the NA drop: the per-column non-null counts should now all be equal
crime_df.count()

id_no                   1048575
primary_type            1048575
location_description    1048575
arrest                  1048575
domestic                1048575
crime_date              1048575
crime_time              1048575
block                   1048575
district                1048575
latitude                1048575
longitude               1048575
dtype: int64

In [14]:
# Drop unnecessary columns (arbitrary placeholders).
cleaned_crime_df = crime_df.drop(
    labels=['id_no', 'block', 'crime_date'],
    axis=1,
)

In [33]:
## Derive the hour of day from the crime_time column, then drop crime_time itself.
cleaned_crime_df = (
    cleaned_crime_df
    .assign(Hour=lambda frame: pd.to_datetime(frame["crime_time"]).dt.hour)
    .drop("crime_time", axis=1)
)

In [34]:
# Preview the first rows after deriving the Hour column
cleaned_crime_df.head()

Unnamed: 0,primary_type,location_description,arrest,domestic,district,latitude,longitude,Time_binned,Hour
0,18,120,True,False,14,41.937406,-87.71665,4,12
1,33,124,True,False,15,41.877553,-87.75613,4,15
2,32,103,False,False,25,41.910285,-87.766772,5,16
3,25,121,False,False,17,41.968435,-87.710088,5,19
4,28,17,True,False,9,41.809611,-87.690579,4,13


In [35]:
# Bin the hour of day into six 4-hour buckets labelled 1-6.
# Intervals are right-inclusive: (-1, 3] -> 1, (3, 7] -> 2, ..., (19, 24] -> 6.
bins = [-1, 3, 7, 11, 15, 19, 24]
cleaned_crime_df['Time_binned'] = pd.cut(
    x=cleaned_crime_df['Hour'],
    bins=bins,
    labels=list(range(1, len(bins))),
)

In [36]:
# Keep only the binned representation: remove the raw Hour column.
cleaned_crime_df = cleaned_crime_df.drop("Hour", axis=1)

In [37]:
# Preview the first rows after replacing Hour with Time_binned
cleaned_crime_df.head()

Unnamed: 0,primary_type,location_description,arrest,domestic,district,latitude,longitude,Time_binned
0,18,120,True,False,14,41.937406,-87.71665,4
1,33,124,True,False,15,41.877553,-87.75613,4
2,32,103,False,False,25,41.910285,-87.766772,5
3,25,121,False,False,17,41.968435,-87.710088,5
4,28,17,True,False,9,41.809611,-87.690579,4


In [38]:
# Label-encode the categorical text columns (primary crime type and location description).
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so the
# integer codes of either column can be mapped back with inverse_transform.
# (Refitting one shared encoder would retain only the last column's classes.)
label_encoders = {}
for column in ('primary_type', 'location_description'):
    encoder = LabelEncoder()
    cleaned_crime_df[column] = encoder.fit_transform(cleaned_crime_df[column])
    label_encoders[column] = encoder

# Backward compatibility: `le` still refers to the encoder fitted last (location_description).
le = label_encoders['location_description']

In [39]:
# Preview the first 10 rows with the label-encoded categorical columns
cleaned_crime_df.head(10)

Unnamed: 0,primary_type,location_description,arrest,domestic,district,latitude,longitude,Time_binned
0,18,120,True,False,14,41.937406,-87.71665,4
1,33,124,True,False,15,41.877553,-87.75613,4
2,32,103,False,False,25,41.910285,-87.766772,5
3,25,121,False,False,17,41.968435,-87.710088,5
4,28,17,True,False,9,41.809611,-87.690579,4
5,2,17,False,True,9,41.797308,-87.644017,1
6,6,17,False,False,20,41.973401,-87.672158,6
7,32,124,False,False,12,41.886324,-87.660787,4
8,3,110,False,False,14,41.906697,-87.671705,1
9,32,44,False,False,1,41.879112,-87.626111,6


# RandomForest Model

In [40]:
# Feature set: every column except the "arrest" target.
X = cleaned_crime_df.drop(columns=["arrest"])
X.head(10)

Unnamed: 0,primary_type,location_description,domestic,district,latitude,longitude,Time_binned
0,18,120,False,14,41.937406,-87.71665,4
1,33,124,False,15,41.877553,-87.75613,4
2,32,103,False,25,41.910285,-87.766772,5
3,25,121,False,17,41.968435,-87.710088,5
4,28,17,False,9,41.809611,-87.690579,4
5,2,17,True,9,41.797308,-87.644017,1
6,6,17,False,20,41.973401,-87.672158,6
7,32,124,False,12,41.886324,-87.660787,4
8,3,110,False,14,41.906697,-87.671705,1
9,32,44,False,1,41.879112,-87.626111,6


In [41]:
# Define the target set: the "arrest" column as a 1-D NumPy array.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# in recent pandas releases; the column is already one-dimensional.
y = cleaned_crime_df["arrest"].to_numpy()
y[:5]

array([ True,  True, False, False,  True])

In [42]:
# Split features and target into train and test partitions (seeded for reproducibility).
# NOTE(review): train_size=0.25 trains on 25% of the rows and holds out the other 75% —
# confirm this is intended rather than test_size=0.25.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.25,
    random_state=78,
)

In [43]:
# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [44]:
# WARNING: COMPUTATIONALLY HEAVY
# Create a random forest classifier: 70 trees, seeded for reproducibility.
# n_jobs=-1 builds the trees in parallel on all available CPU cores to shorten
# the fit; the fitted trees are still determined by random_state.
rf_model = RandomForestClassifier(n_estimators=70, random_state=78, n_jobs=-1)

# Fit the model on the scaled training split.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [45]:
# Rank the features from most to least important as (importance, column name) pairs.
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.3300773575972211, 'primary_type'),
 (0.23212550070943466, 'latitude'),
 (0.22763115083989696, 'longitude'),
 (0.11561846741482655, 'location_description'),
 (0.044789295003890436, 'Time_binned'),
 (0.03672375999658509, 'district'),
 (0.013034468438145246, 'domestic')]

In [46]:
# Making predictions using X_test_scaled (the scaled test split)
predictions = rf_model.predict(X_test_scaled)
# Display the predicted labels
predictions

array([False, False, False, ..., False, False, False])

In [47]:
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame for display
# (rows are actual classes, columns are predicted classes).
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,598454,25334
Actual 1,75590,87054


In [48]:
# Per-class precision, recall and F1-score on the test split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

       False       0.89      0.96      0.92    623788
        True       0.77      0.54      0.63    162644

    accuracy                           0.87    786432
   macro avg       0.83      0.75      0.78    786432
weighted avg       0.86      0.87      0.86    786432



In [49]:
# Overall accuracy: fraction of test samples whose predicted label matches the true label
accuracy_score(y_test, predictions)

0.8716684977213541