In [2]:
%matplotlib inline

# General Libraries
import re
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [5]:
crime_data = []
crime_labels = []
with open("train.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row) if x != 1]
        name = [y for x, y in enumerate(row) if x == 1]
        
        if first_row:
            first_row = False
            feature_names = data
            print feature_names
        else:
            crime_data.append(data)
            crime_labels.append(name)

['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']


In [6]:
print "Size of total train data: ", len(crime_data)
print "Size of total train labels: ", len(crime_labels)


Size of total train data:  878049
Size of total train labels:  878049


In [7]:
# Divide the Crime training set into train data, test data, dev data
train_data, train_labels = crime_data[:500000], crime_labels[:500000]
crime_test, crime_test_labels = crime_data[500000:], crime_labels[500000:]
num_test = len(crime_test)
print num_test
dev_data, dev_labels = crime_test[:num_test/2], crime_test_labels[:num_test/2]
test_data, test_labels = crime_test[num_test/2:], crime_test_labels[num_test/2:]
mini_train_data, mini_train_labels = crime_data[200000:300000], crime_labels[200000:300000]

print "Size of train data: ", len(train_data)
print "Size of dev data: ", len(dev_data)
print "Size of test data: ", len(test_data)

378049
Size of train data:  500000
Size of dev data:  189024
Size of test data:  189025


In [8]:
train_data, train_labels = crime_data, crime_labels

In [9]:
def find_unique(train_data, index):
    output_list = []
    for i in range(len(train_data)):
        if train_data[i][index] not in output_list:
            output_list.append(train_data[i][index])
    
    return output_list

In [10]:
descript = find_unique(train_data, 1)
print len(descript)
print descript

879


In [11]:
pddistrict = find_unique(train_data, 3)
print len(pddistrict)
print pddistrict

10
['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN']


In [303]:
print np.unique(train_labels)
print feature_names
print train_data[:1]

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS' 'EMBEZZLEMENT'
 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING' 'FRAUD' 'GAMBLING'
 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING' 'MISSING PERSON'
 'NON-CRIMINAL' 'OTHER OFFENSES' 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION'
 'RECOVERED VEHICLE' 'ROBBERY' 'RUNAWAY' 'SECONDARY CODES'
 'SEX OFFENSES FORCIBLE' 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY'
 'SUICIDE' 'SUSPICIOUS OCC' 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT'
 'WARRANTS' 'WEAPON LAWS']
['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
[['2015-05-13 23:53:00', 'WARRANT ARREST', 'Wednesday', 'NORTHERN', 'ARREST, BOOKED', 'OAK ST / LAGUNA ST', '-122.425891675136', '37.7745985956747']]


In [12]:
days = {'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6}
pdistrict = {'NORTHERN':1, 'PARK':2, 'INGLESIDE':3, 'BAYVIEW':4, 'RICHMOND':5, 'CENTRAL':6, 'TARAVAL':7, 'TENDERLOIN':8, 'MISSION':9, 'SOUTHERN':10}
districts = ['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN']
day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

In [118]:
longitude = find_unique(train_data, 6)
print len(longitude)


27077


In [119]:
data = []
for row in train_data:
    data.append(float(row[6]))

longitude = np.array(data, dtype=np.float32)
                                 

In [13]:
def find_mean_std(train_data, index):
    data = []
    for row in train_data:
        data.append(float(row[index]))
    
    data_arr = np.array(data, dtype=np.float32)
    print np.mean(data_arr)
    data_dict = {}
    data_dict['mean'] = np.mean(data_arr)
    data_dict['std'] = np.std(data_arr)
    return data_dict


In [15]:
long_dict = find_mean_std(train_data, 6)
print long_dict
print abs(long_dict['mean'])
print long_dict['std']
lat_dict = find_mean_std(train_data, 7)
print abs(lat_dict['mean'])
print lat_dict

-122.423
{'std': 0.030353628, 'mean': -122.42262}
122.423
0.0303536
37.771
37.771
{'std': 0.45689282, 'mean': 37.771027}


In [18]:
year_data = []
mon_data = []
time_data = []
for row in train_data:
    date_list = convert_date(row[0])
    year_data.append(float(date_list[0]))
    mon_data.append(float(date_list[0]))
    time_data.append(date_list[0])

year_arr = np.array(year_data, dtype=np.float32)
mon_arr = np.array(mon_data, dtype=np.float32)
time_arr = np.array(time_data, dtype=np.float32)

date_dict = {}
date_dict['mean_year'] = np.mean(year_arr)
date_dict['std_year'] = np.std(year_arr)
date_dict['mean_mon'] = np.mean(mon_arr)
date_dict['std_mon'] = np.std(mon_arr)
date_dict['mean_time'] = np.mean(time_arr)
date_dict['std_time'] = np.std(time_arr)
    

In [17]:
import time
# format_data[<YYYY>, <MM>, <DD>, <HH>, <MM>, <DayOfWeek>, <PdDistrict>, <Address>, <Longitude>, <Latitude>]
def convert_date(date):
    date_list = []
    date_val = time.strptime(date, '%Y-%m-%d %H:%M:%S')
    date_list.append(date_val.tm_year)
    date_list.append(date_val.tm_mon)
    date_list.append(date_val.tm_mday)
    time_in_min = date_val.tm_hour * 60.0 + date_val.tm_min
    date_list.append(time_in_min)
    return date_list

print train_data[0][0]
print convert_date(train_data[0][0])

2015-05-13 23:53:00
[2015, 5, 13, 1433.0]


In [19]:

def get_formatted_data(train_data):
    format_data = []
    count = 0
    
    for row in train_data:
        count += 1
        data = []
        #date_list = convert_date(row[0])
        data.extend(convert_date(row[0])) # Might require improvement
        '''            
        year_val = (float(date_list[0]) - date_dict['mean_year'])/date_dict['std_year']
        data.append(year_val)
        mon_val = (float(date_list[1]) - date_dict['mean_mon'])/date_dict['std_mon']
        data.append(mon_val)
        time_val = (float(date_list[2]) - date_dict['mean_time'])/date_dict['std_time']
        data.append(time_val)
        '''
        
        data.append(day.index(row[2])/len(day)) # Normalized
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts] # Normalized
        data.extend(pddistrict)
        longitude = float(row[6])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std'] # Normalized
        latitude = float(row[7])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std'] # Normalized
        data.append(long_norm)
        data.append(lat_norm)
        address = row[5].lower()
        addr = [1 if 'block' in address else 0] # Requires major improvement
        data.extend(addr)
        # Explore external map source
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    

trainX = np.array(get_formatted_data(train_data), dtype=np.float32)
testX = np.array(get_formatted_data(test_data), dtype=np.float32)
devX = np.array(get_formatted_data(dev_data), dtype=np.float32)
mini_trainX = np.array(get_formatted_data(mini_train_data), dtype=np.float32)
print trainX[:1]

[[  2.01500000e+03   5.00000000e+00   1.30000000e+01   1.43300000e+03
    0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.07948340e-01
    7.81799201e-03   0.00000000e+00]]


In [310]:
print trainX[:1]

[[  2.01500000e+03   5.00000000e+00   1.30000000e+01   1.43300000e+03
    0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.07948340e-01
    7.81799201e-03   0.00000000e+00]]


In [311]:
print "TrainX Data shape: ", trainX.shape
print "TrainX Label shape: ", np.array(train_labels).shape
print "TestX Data shape: ", testX.shape
print "DevX Data shape: ", devX.shape
print "Mini TrainX Data shape: ", mini_trainX.shape

TrainX Data shape:  (878049L, 18L)
TrainX Label shape:  (878049L, 1L)
TestX Data shape:  (189025L, 18L)
DevX Data shape:  (189024L, 18L)
Mini TrainX Data shape:  (100000L, 18L)


In [312]:
train_labels = np.array(train_labels).ravel()
test_labels = np.array(test_labels).ravel()
print train_labels.shape
print test_labels.shape

(878049L,)
(189025L,)


In [313]:
lr = LogisticRegression()
lr.fit(trainX, train_labels)
print "Completed the training"

Completed the training


In [271]:
mini_testX, mini_test_labels = testX[:10000], test_labels[:10000]
f1_score = metrics.f1_score(lr.predict(mini_testX), mini_test_labels)
print f1_score

0.316109107021


In [237]:
print f1_score

0.325946042028


In [238]:
sub_test_data = []
with open("../datasets/sfocrime/test.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row)]
        if first_row:
            first_row = False
        else:
            sub_test_data.append(data)
        

In [239]:
print sub_test_data[:1]
print train_data[:1]

[['0', '2015-05-10 23:59:00', 'Sunday', 'BAYVIEW', '2000 Block of THOMAS AV', '-122.39958770418998', '37.7350510103906']]
[['2015-05-13 23:53:00', 'WARRANT ARREST', 'Wednesday', 'NORTHERN', 'ARREST, BOOKED', 'OAK ST / LAGUNA ST', '-122.425891675136', '37.7745985956747']]


In [314]:

def get_formatted_test_data(test_data):
    format_data = []
    count = 0
    
    for row in test_data:
        count += 1
        data = []
        data.extend(convert_date(row[1]))
        data.append(day.index(row[2])/len(day))
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts]
        data.extend(pddistrict)
        longitude = float(row[5])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std']
        latitude = float(row[6])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std']
        data.append(long_norm)
        data.append(lat_norm)
        
        if 'block' in row[4].lower():
            address = 1
        else:
            address = 0
        data.append(address)
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    

testX = np.array(get_formatted_test_data(sub_test_data), dtype=np.float32)


In [315]:
print testX.shape
print testX[:1]

(884262L, 18L)
[[  2.01500000e+03   5.00000000e+00   1.00000000e+01   1.43900000e+03
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -7.58635759e-01
   -7.87396953e-02   1.00000000e+00]]


In [316]:
probs = lr.predict_proba(testX)

In [317]:
print probs.shape
print probs[:1]

(884262L, 39L)
[[  2.80049257e-03   1.06875822e-01   2.50339219e-04   6.70757115e-04
    5.54191564e-02   1.75340640e-03   6.12800942e-04   4.86088814e-02
    2.23931020e-03   6.90514758e-04   1.19558349e-04   9.86513077e-04
    6.92417107e-03   8.58040632e-03   3.48231011e-04   3.05989649e-03
    1.40020887e-01   1.70874120e-03   2.01029074e-04   7.80028972e-02
    8.18985751e-02   1.32025505e-01   1.63283804e-05   5.20977385e-04
    6.24080998e-03   1.99855868e-02   4.73441365e-03   2.10462344e-02
    3.18559924e-03   2.01952935e-04   4.29828341e-03   4.66797174e-04
    4.18579374e-02   1.42609217e-05   6.79717029e-03   6.85434829e-02
    8.44117250e-02   4.85004834e-02   1.53800654e-02]]


In [318]:
import gzip
with gzip.open('submission-matrix.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(crime_labels)))
    
    for i, prob in enumerate(probs):
        out.writerow([i] + list(prob))
print "Job completed"         

Job completed


In [68]:
print np.unique(crime_labels)

['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS' 'EMBEZZLEMENT'
 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING' 'FRAUD' 'GAMBLING'
 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING' 'MISSING PERSON'
 'NON-CRIMINAL' 'OTHER OFFENSES' 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION'
 'RECOVERED VEHICLE' 'ROBBERY' 'RUNAWAY' 'SECONDARY CODES'
 'SEX OFFENSES FORCIBLE' 'SEX OFFENSES NON FORCIBLE' 'STOLEN PROPERTY'
 'SUICIDE' 'SUSPICIOUS OCC' 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT'
 'WARRANTS' 'WEAPON LAWS']
