In [1]:
import pandas as pd
import numpy as np
crime = pd.read_csv("crime.csv")

In [2]:
#Check for the missing data
crime.isnull().sum()

TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR             54362
MINUTE           54362
HUNDRED_BLOCK       13
NEIGHBOURHOOD    56624
X                    0
Y                    0
Latitude             0
Longitude            0
dtype: int64

In [3]:
#Column MINUTE can be deleted as we don't need to go to the minute level. 
crime.drop(['MINUTE'],axis = 1,inplace=True)

In [4]:
#We fill the missing values
crime['HOUR'].fillna(0,inplace = True)
crime['HUNDRED_BLOCK'].fillna('N/A',inplace=True)
crime['NEIGHBOURHOOD'].fillna('N/A',inplace = True)

In [5]:
#Let's take a look into our data to check for missing values and data types
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530652 entries, 0 to 530651
Data columns (total 11 columns):
TYPE             530652 non-null object
YEAR             530652 non-null int64
MONTH            530652 non-null int64
DAY              530652 non-null int64
HOUR             530652 non-null float64
HUNDRED_BLOCK    530652 non-null object
NEIGHBOURHOOD    530652 non-null object
X                530652 non-null float64
Y                530652 non-null float64
Latitude         530652 non-null float64
Longitude        530652 non-null float64
dtypes: float64(5), int64(3), object(3)
memory usage: 44.5+ MB


In [6]:
crime['TYPE'].value_counts()

Theft from Vehicle                                        172700
Mischief                                                   70413
Break and Enter Residential/Other                          60862
Offence Against a Person                                   54142
Other Theft                                                52167
Theft of Vehicle                                           38418
Break and Enter Commercial                                 33845
Theft of Bicycle                                           25730
Vehicle Collision or Pedestrian Struck (with Injury)       21901
Vehicle Collision or Pedestrian Struck (with Fatality)       254
Homicide                                                     220
Name: TYPE, dtype: int64

In [7]:
#Converting text labels to numbers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
crime['TYPE'] = encoder.fit_transform(crime['TYPE'])

In [8]:
#can check it
encoder.classes_

array(['Break and Enter Commercial', 'Break and Enter Residential/Other',
       'Homicide', 'Mischief', 'Offence Against a Person', 'Other Theft',
       'Theft from Vehicle', 'Theft of Bicycle', 'Theft of Vehicle',
       'Vehicle Collision or Pedestrian Struck (with Fatality)',
       'Vehicle Collision or Pedestrian Struck (with Injury)'],
      dtype=object)

In [9]:
#Thus we can relate each type
a = encoder.transform(['Break and Enter Commercial', 'Break and Enter Residential/Other',
       'Homicide', 'Mischief', 'Offence Against a Person', 'Other Theft',
       'Theft from Vehicle', 'Theft of Bicycle', 'Theft of Vehicle',
       'Vehicle Collision or Pedestrian Struck (with Fatality)',
       'Vehicle Collision or Pedestrian Struck (with Injury)'])
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [10]:
#In similar way we convert HUNDRED_BLOCK and NEIGHBOURHOOD
crime['HUNDRED_BLOCK'] = encoder.fit_transform(crime['HUNDRED_BLOCK'])
crime['NEIGHBOURHOOD'] = encoder.fit_transform(crime['NEIGHBOURHOOD'])

In [11]:
X = crime.iloc[:,1:].values
y = crime.iloc[:,:1].values

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.25,random_state = 1)

In [13]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
from sklearn.metrics import accuracy_score
y_predict = classifier.predict(X_test)
print(accuracy_score(y_test,y_predict)*100)

54.422107143664775


In [15]:
from sklearn.naive_bayes import GaussianNB
classifier1 = GaussianNB()
classifier1.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
y_predict=classifier1.predict(X_test)
print(accuracy_score(y_test,y_predict)*100)

42.00191462578112


In [17]:
#By comparision Random Forest gives better accuracy than Naive Bayes
classifier.predict([[2003,5,12,16.0,14370,20,493906.5,5457452.47,49.269802,-123.083763]])

array([5])

In [18]:
y[0]

array([5])

In [19]:
#Naive Bayes gave incorrect result
classifier1.predict([[2003,5,12,16.0,14370,20,493906.5,5457452.47,49.269802,-123.083763]])

array([6])

In [20]:
y[0]

array([5])