In [1]:
# Imports for machine learning model

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [9]:
#Skeleton for our model

# Load the data
file_path = Path('./Resources/Machine Learning Features.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()


In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,AnalysisNeighborhood,AvgHomeCost,CS Score,the_geom,TRACTCE10,PTrans_Sco,Trans_Sco,VCrim_Rate,LivAl_Per,EldLivAl_Per,Viol_Rate,Emp_per,Ec_Score,Pov_Per,PopDens,DayPopDens,Res_Score
0,0,Bayview,349602.14105,1.7384,MULTIPOLYGON (((-122.38903499954088 37.7329189...,23200,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0
1,1,Bayview,349602.14105,1.7384,MULTIPOLYGON (((-122.38155800020716 37.7381230...,23102,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0
2,2,Bayview,349602.14105,1.7384,MULTIPOLYGON (((-122.39262499954708 37.7292780...,23300,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0
3,3,Bayview,349602.14105,1.7384,MULTIPOLYGON (((-122.38451700015729 37.7228620...,23400,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0
4,4,Bayview,349602.14105,1.7384,MULTIPOLYGON (((-122.39594499999093 37.7377840...,23001,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0


In [11]:
#drop columns that wont be used
df = df.drop(columns = ['the_geom', 'TRACTCE10'])

#drop duplicate neighborhoods
df = df.drop_duplicates(subset=['AnalysisNeighborhood'])

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,AnalysisNeighborhood,AvgHomeCost,CS Score,PTrans_Sco,Trans_Sco,VCrim_Rate,LivAl_Per,EldLivAl_Per,Viol_Rate,Emp_per,Ec_Score,Pov_Per,PopDens,DayPopDens,Res_Score
0,0,Bayview,349602.14105,1.7384,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0
11,11,Bernal Heights,481784.205633,1.788646,28.6832,3.0,43.7684,0.24307,0.050114,6.9,0.927948,3.0,0.242756,22066.8,16503.8,3.0
17,17,Castro/Upper Market,803432.976477,1.791669,36.8504,4.0,64.1234,0.428756,0.077243,9.34,0.930183,4.0,0.161876,23023.0,18531.0,5.0
23,23,Chinatown,945772.065002,1.565266,89.9066,5.0,50.8277,0.473996,0.247639,10.59,0.842658,1.0,0.657467,70416.6,278476.0,1.0
27,27,Excelsior,344472.019289,1.705869,23.0295,2.0,34.5181,0.177211,0.082659,4.33,0.913833,2.0,0.294074,23767.5,21733.0,2.0


In [27]:
df['VCrim_Rate'].mean()

45.83977666666668

In [28]:
df['CS Score'].mean()

1.7583995177741463

In [31]:
# Create categories for low crime and high crime 

col1= 'CS Score'
col2= 'VCrim_Rate'

conditions = [ (df[col1] >= 1.75) | (df[col2] >= 70), (df[col1] < 1.74) | (df[col2] < 69)]

choices = [ "more crime", 'less crime']

df["CS"] = np.select(conditions, choices, default=np.nan)


In [32]:
df

Unnamed: 0.1,Unnamed: 0,AnalysisNeighborhood,AvgHomeCost,CS Score,PTrans_Sco,Trans_Sco,VCrim_Rate,LivAl_Per,EldLivAl_Per,Viol_Rate,Emp_per,Ec_Score,Pov_Per,PopDens,DayPopDens,Res_Score,CS
0,0,Bayview,349602.1,1.7384,13.758,1.0,105.799,0.224654,0.073887,7.78,0.838265,1.0,0.413469,6944.77,9850.25,1.0,more crime
11,11,Bernal Heights,481784.2,1.788646,28.6832,3.0,43.7684,0.24307,0.050114,6.9,0.927948,3.0,0.242756,22066.8,16503.8,3.0,more crime
17,17,Castro/Upper Market,803433.0,1.791669,36.8504,4.0,64.1234,0.428756,0.077243,9.34,0.930183,4.0,0.161876,23023.0,18531.0,5.0,more crime
23,23,Chinatown,945772.1,1.565266,89.9066,5.0,50.8277,0.473996,0.247639,10.59,0.842658,1.0,0.657467,70416.6,278476.0,1.0,less crime
27,27,Excelsior,344472.0,1.705869,23.0295,2.0,34.5181,0.177211,0.082659,4.33,0.913833,2.0,0.294074,23767.5,21733.0,2.0,less crime
34,34,Crocker Amazon,344472.0,1.705869,18.5395,1.0,20.6687,0.174093,0.064249,3.83,0.884298,1.0,0.28779,28187.1,26867.7,1.0,less crime
38,38,Diamond Heights/Glen Park,577944.8,1.857835,24.6857,2.0,20.3355,0.376537,0.101817,5.47,0.937884,4.0,0.18451,12163.5,9734.72,5.0,more crime
41,41,Haight Ashbury,828805.9,1.781447,34.6076,4.0,29.4977,0.401279,0.062053,8.46,0.933325,4.0,0.2149,27822.6,21149.5,5.0,more crime
45,45,Downtown/Civic Center,801655.4,1.813581,83.2956,5.0,177.47,0.67612,0.144177,9.06,0.893339,1.0,0.561498,65411.7,101040.0,1.0,more crime
50,50,Inner Richmond,694302.9,1.708699,26.2058,3.0,17.3871,0.333675,0.071877,6.08,0.928048,3.0,0.256033,26842.1,30631.3,4.0,less crime


## Split into Train and Test

In [33]:
# Create our features
X = df.drop("CS", axis=1)

X = pd.get_dummies(X)

# Create our target
y = df["CS"]

In [34]:
X.describe()

Unnamed: 0.1,Unnamed: 0,AvgHomeCost,CS Score,PTrans_Sco,Trans_Sco,VCrim_Rate,LivAl_Per,EldLivAl_Per,Viol_Rate,Emp_per,...,AnalysisNeighborhood_Outer Sunset,AnalysisNeighborhood_Pacific Heights,AnalysisNeighborhood_Parkside,AnalysisNeighborhood_Potrero Hill,AnalysisNeighborhood_Russian Hill,AnalysisNeighborhood_South of Market,AnalysisNeighborhood_Twin Peaks,AnalysisNeighborhood_Visitacion Valley,AnalysisNeighborhood_West of Twin Peaks,AnalysisNeighborhood_Western Addition
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,83.833333,722405.8,1.7584,34.452245,2.966667,45.839777,0.370484,0.100716,6.259667,0.917259,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333
std,53.763905,457783.1,0.080385,22.422704,1.519604,42.82643,0.142839,0.040534,2.326066,0.031937,...,0.182574,0.182574,0.182574,0.182574,0.182574,0.182574,0.182574,0.182574,0.182574,0.182574
min,0.0,312501.1,1.565266,7.97374,1.0,13.1755,0.174093,0.042092,2.18,0.838265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,441765.9,1.698709,18.6982,2.0,18.1242,0.222506,0.074401,4.3,0.906547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,67.5,645603.4,1.751611,26.09665,3.0,29.70215,0.371407,0.087374,5.655,0.927877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,130.0,832869.1,1.822479,45.68435,4.0,52.8989,0.508801,0.116617,8.07,0.933269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,181.0,2803594.0,1.926499,89.9066,5.0,177.47,0.67612,0.247639,10.59,0.965727,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# Check the balance of our target values
y.value_counts()

more crime    18
less crime    12
Name: CS, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Balanced Random Forest Classifier

In [37]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [38]:
# Calculated the balanced accuracy score

from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8333333333333333

In [39]:
# Display the confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[2, 1],
       [0, 5]], dtype=int64)

In [40]:
# Print the imbalanced classification report

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

 less crime       1.00      0.67      1.00      0.80      0.82      0.64         3
 more crime       0.83      1.00      0.67      0.91      0.82      0.69         5

avg / total       0.90      0.88      0.79      0.87      0.82      0.67         8



In [41]:
# List the features sorted in descending order by feature importance

feature_names = X.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.1504486870395704, 'CS Score'),
 (0.1124685223211627, 'Pov_Per'),
 (0.08754124713210847, 'LivAl_Per'),
 (0.07813791980078745, 'AvgHomeCost'),
 (0.07364877574348727, 'EldLivAl_Per'),
 (0.06940872273469677, 'Emp_per'),
 (0.05728978607177137, 'Res_Score'),
 (0.05271268283507295, 'PopDens'),
 (0.04779618329655094, 'VCrim_Rate'),
 (0.04601457946815089, 'Unnamed: 0'),
 (0.04570901598401599, 'DayPopDens'),
 (0.04177635667903525, 'Trans_Sco'),
 (0.031717419281055655, 'PTrans_Sco'),
 (0.02682199675324675, 'Ec_Score'),
 (0.02525007284382284, 'Viol_Rate'),
 (0.010037516436554896, 'AnalysisNeighborhood_Outer Mission'),
 (0.006519909502262448, 'AnalysisNeighborhood_Chinatown'),
 (0.006075892857142857, 'AnalysisNeighborhood_Bayview'),
 (0.005061824178707296, 'AnalysisNeighborhood_Western Addition'),
 (0.003348214285714286, 'AnalysisNeighborhood_Outer Richmond'),
 (0.002777004813368448, 'AnalysisNeighborhood_Haight Ashbury'),
 (0.0024937908496732012, 'AnalysisNeighborhood_Excelsior'),
 (0.00248854