# San Francisco Crime Classification
*Predict the category of crimes that occurred in the city by the bay*

[Kaggle Project Link](https://www.kaggle.com/c/sf-crime)

From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.

We're also encouraging you to explore the dataset visually. What can we learn about the city through visualizations like this Top Crimes Map? The top most up-voted scripts from this competition will receive official Kaggle swag as prizes. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
import csv
import copy
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

np.set_printoptions( linewidth=1000 )

%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 

from sklearn.datasets import make_gaussian_quantiles

# For producing decision tree diagrams.
from IPython.core.display import Image, display
from sklearn.externals.six import StringIO

In [2]:
train = pd.read_csv('C:/MIDS/W207/w207_final/train.csv.zip', parse_dates=['Dates'])
test = pd.read_csv('C:/MIDS/W207/w207_final//test.csv.zip', parse_dates=['Dates'])
#print(train[:10])
#print(train['Dates'])
#train = pd.read_csv('c:/Users/sconner/Desktop/Github/w207/W207_Final/data/train.csv.zip', parse_dates=['Dates'])
#test = pd.read_csv('c:/Users/sconner/Desktop/Github/w207/W207_Final/data/test.csv.zip', parse_dates=['Dates'])

In [3]:
train.describe()
#copying train data
 
train_labels =  copy.deepcopy(train['Category'])



#test_labels =  copy.deepcopy(test['Category'])

#test

In [19]:
#train_data['Time'] =  pd.DatetimeIndex(train ['Dates']).

train_data = copy.deepcopy(train[ 'X'])
train_data  = pd.concat([train_data,train['Y']], axis = 1)
months  = pd.DataFrame(pd.DatetimeIndex(train ['Dates']).month.T,columns = ['Month'])
train_data = pd.concat([train_data, months ], axis =1)

days  = pd.DataFrame(pd.DatetimeIndex(train ['Dates']).day.T, columns = ['Day'])
train_data = pd.concat([train_data, days], axis =1)

years = pd.DataFrame(pd.DatetimeIndex(train ['Dates']).year.T ,columns = ['Year'])
train_data = pd.concat([train_data, years], axis =1)

#creating dummy variables for the PdDistrict category
PdDistricts =  pd.get_dummies(train['PdDistrict'], dummy_na=False)
train_data = pd.concat([train_data,PdDistricts], axis=1)
 
#creating dummy variables for the day of week category
DaysOfWeek =pd.get_dummies(train['DayOfWeek'], dummy_na = False)
train_data = pd.concat([train_data,DaysOfWeek ], axis = 1)    



test_data = copy.deepcopy(test[ 'X'])
test_data  = pd.concat([test_data,test['Y']], axis = 1)
months  = pd.DataFrame(pd.DatetimeIndex(test ['Dates']).month.T,columns = ['Month'])
test_data = pd.concat([test_data, months ], axis =1)

days  = pd.DataFrame(pd.DatetimeIndex(test ['Dates']).day.T, columns = ['Day'])
test_data = pd.concat([test_data, days], axis =1)

years = pd.DataFrame(pd.DatetimeIndex(test ['Dates']).year.T ,columns = ['Year'])
test_data = pd.concat([test_data, years], axis =1)

#creating dummy variables for the PdDistrict category
PdDistricts =  pd.get_dummies(test['PdDistrict'], dummy_na=False)
test_data = pd.concat([test_data,PdDistricts], axis=1)
 
#creating dummy variables for the day of week category
DaysOfWeek =pd.get_dummies(test['DayOfWeek'], dummy_na = False)
test_data = pd.concat([test_data,DaysOfWeek ], axis = 1)    

In [22]:
print(train_data[:10])
print('\n')
print(test_data[:10])

            X          Y  Month  Day  Year  BAYVIEW  CENTRAL  INGLESIDE  \
0 -122.425892  37.774599      5   13  2015      0.0      0.0        0.0   
1 -122.425892  37.774599      5   13  2015      0.0      0.0        0.0   
2 -122.424363  37.800414      5   13  2015      0.0      0.0        0.0   
3 -122.426995  37.800873      5   13  2015      0.0      0.0        0.0   
4 -122.438738  37.771541      5   13  2015      0.0      0.0        0.0   
5 -122.403252  37.713431      5   13  2015      0.0      0.0        1.0   
6 -122.423327  37.725138      5   13  2015      0.0      0.0        1.0   
7 -122.371274  37.727564      5   13  2015      1.0      0.0        0.0   
8 -122.508194  37.776601      5   13  2015      0.0      0.0        0.0   
9 -122.419088  37.807802      5   13  2015      0.0      1.0        0.0   

   MISSION  NORTHERN    ...      SOUTHERN  TARAVAL  TENDERLOIN  Friday  \
0      0.0       1.0    ...           0.0      0.0         0.0     0.0   
1      0.0       1.0    ..

In [37]:
np.random.seed(1)

# a single iteration of tree bagging
B = 1 #only one iteration for sample submission - this could be 500 later
n = train_data.shape[0]
#print(train_data.shape)
sn = int((n*2.0)/3.0)   # nr of training data in subset for each tree

nf = train_data.shape[1]
 
#print(nf)
all_preds = np.chararray((B,test_data.shape[0]),  itemsize=30 )
 

for b in range(B):   #only one iteration for sample submission - this could be 500 later
    bs_sample_index = tuple(np.random.choice(range(n), size=sn, replace=True))
    #print(bs_sample_index)
    
    bs_data = train_data[train_data.index.isin(bs_sample_index)]
   
    #print(bs_data)
    bs_labels = train_labels[train_labels.index.isin(bs_sample_index)]
    bs_test_data = test_data
    
    bs_sample_index_features = tuple(np.random.choice(range(nf), size=int(np.sqrt(nf)), replace=False))
    #print(bs_sample_index_features )
    bs_data = bs_data.iloc[:, bs_sample_index_features ]
    bs_test_data = bs_test_data.iloc[:,bs_sample_index_features ]
    
    bs_tree = DecisionTreeClassifier(criterion="entropy", splitter="best")
    bs_tree.fit(bs_data, bs_labels)
    
    bs_tree_preds = bs_tree.predict(bs_test_data)
    print(bs_tree_preds.shape)
    all_preds[b,:] = bs_tree_preds

    
#print(all_preds)
#using the only prediction for now...need to vote and select best, but this is good for sample submission
submission = pd.get_dummies(bs_tree_preds.T, dummy_na=False)
submission.index.name = 'ID'
#voting = np.sum(all_preds,axis=0) / B
#voting = [int(x >= 0.5) for x in voting]
#np.mean(voting==test_labels)

(884262,)


In [39]:
print(submission)
submission.to_csv(path_or_buf = 'C:/MIDS/W207/w207_final/submission.csv') 

        ARSON  ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
ID                                                                          
0         0.0      0.0         0.0      0.0       0.0                 0.0   
1         0.0      0.0         0.0      0.0       0.0                 0.0   
2         0.0      0.0         0.0      0.0       0.0                 0.0   
3         0.0      1.0         0.0      0.0       0.0                 0.0   
4         0.0      1.0         0.0      0.0       0.0                 0.0   
5         0.0      0.0         0.0      0.0       0.0                 0.0   
6         0.0      0.0         0.0      0.0       0.0                 0.0   
7         0.0      0.0         0.0      0.0       0.0                 0.0   
8         0.0      0.0         0.0      0.0       0.0                 0.0   
9         0.0      0.0         0.0      0.0       0.0                 0.0   
10        0.0      1.0         0.0      0.0       0.0                 0.0   