In [1]:
%matplotlib inline

# General Libraries
import re
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

Load the Crime data.

In [5]:
crime_data = []
crime_labels = []
with open("../train.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row) if x != 1]
        name = [y for x, y in enumerate(row) if x == 1]
        
        if first_row:
            first_row = False
            feature_names = data
            print feature_names
        else:
            crime_data.append(data)
            crime_labels.append(name)

['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']


In [6]:
# Divide the Crime training set into train data, test data, dev data
train_data, train_labels = crime_data[:500000], crime_labels[:500000]
crime_test, crime_test_labels = crime_data[500000:], crime_labels[500000:]
num_test = len(crime_test)
print num_test
dev_data, dev_labels = crime_test[:num_test/2], crime_test_labels[:num_test/2]
test_data, test_labels = crime_test[num_test/2:], crime_test_labels[num_test/2:]
mini_train_data, mini_train_labels = crime_data[200000:300000], crime_labels[200000:300000]

print "Size of train data: ", len(train_data)
print "Size of dev data: ", len(dev_data)
print "Size of test data: ", len(test_data)

378049
Size of train data:  500000
Size of dev data:  189024
Size of test data:  189025


In [8]:
import time
def convert_date(date):
    date_list = []
    date_val = time.strptime(date, '%Y-%m-%d %H:%M:%S')
    date_list.append(date_val.tm_year)
    date_list.append(date_val.tm_mon)
    date_list.append(date_val.tm_mday)
    time_in_min = date_val.tm_hour * 60.0 + date_val.tm_min
    date_list.append(time_in_min)
    return date_list

print train_data[0][0]
print convert_date(train_data[0][0])

def find_mean_std(train_data, index):
    data = []
    for row in train_data:
        data.append(float(row[index]))
    
    data_arr = np.array(data, dtype=np.float32)
    print np.mean(data_arr)
    data_dict = {}
    data_dict['mean'] = np.mean(data_arr)
    data_dict['std'] = np.std(data_arr)
    return data_dict

2015-05-13 23:53:00
[2015, 5, 13, 1433.0]


In [9]:
districts = ['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN']
day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

Feature Engineering - Section I:

1) Crime Date - Normalization

2) Longitude & Latitude - Normalization

In [17]:
''' Crime Date - Normalization '''

year_data = []
mon_data = []
time_data = []
for row in train_data:
    date_list = convert_date(row[0])
    year_data.append(float(date_list[0]))
    mon_data.append(float(date_list[0]))
    time_data.append(date_list[0])

year_arr = np.array(year_data, dtype=np.float32)
mon_arr = np.array(mon_data, dtype=np.float32)
time_arr = np.array(time_data, dtype=np.float32)

date_dict = {}
date_dict['mean_year'] = np.mean(year_arr)
date_dict['std_year'] = np.std(year_arr)                               
date_dict['mean_mon'] = np.mean(mon_arr)
date_dict['std_mon'] = np.std(mon_arr)
date_dict['mean_time'] = np.mean(time_arr)
date_dict['std_time'] = np.std(time_arr)

''' Longitude & Latitude - Normalization '''
long_dict = find_mean_std(train_data, 6)
lat_dict = find_mean_std(train_data, 7)

-122.423
37.7675


Preparing the Data

In [18]:

def get_formatted_data(train_data):
    format_data = []
    count = 0
    
    for row in train_data:
        count += 1
        data = []
        
        ''' Data Preparation - Crime date '''
        
        #date_list = convert_date(row[0])
        
        data.extend(convert_date(row[0])) # Might require improvement
        
        #year_val = (float(date_list[0]) - date_dict['mean_year'])/date_dict['std_year']
        #data.append(year_val)
        #mon_val = (float(date_list[1]) - date_dict['mean_mon'])/date_dict['std_mon']
        #data.append(mon_val)
        #time_val = (float(date_list[2]) - date_dict['mean_time'])/date_dict['std_time']
        #data.append(time_val)
        
        ''' Data Preparation - Day of the Week '''
        week_day = row[2]
        crime_day = [1 if week_day == d else 0 for d in day] 
        data.extend(crime_day) # Normalized
        
        ''' Data Preparation - District '''
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts] # Normalized
        data.extend(pddistrict)
        
        ''' Data Preparation - Longitude & Latitude '''
        longitude = float(row[6])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std'] # Normalized
        latitude = float(row[7])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std'] # Normalized
        data.append(long_norm)
        data.append(lat_norm)
        
        ''' Data Preparation - Address '''
        
        address = row[5].lower()
        addr = [1 if 'block' in address else 0] # Requires major improvement
        data.extend(addr)
        
        # Explore external map source
        
        ''' Quantify the data '''
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    
crimeX = np.array(get_formatted_data(crime_data), dtype=np.float32)
trainX = np.array(get_formatted_data(train_data), dtype=np.float32)
testX = np.array(get_formatted_data(test_data), dtype=np.float32)
devX = np.array(get_formatted_data(dev_data), dtype=np.float32)
mini_trainX = np.array(get_formatted_data(mini_train_data), dtype=np.float32)
print trainX[:1]

[[  2.01500000e+03   5.00000000e+00   1.30000000e+01   1.43300000e+03
    0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   1.29296839e-01   2.97121316e-01   0.00000000e+00]]


In [23]:
print "CrimeX Data shape: ", crimeX.shape
print "CrimeX Label shape: ", np.array(crime_labels).shape
print "testX Data shape: ", testX.shape
print "testX Label shape:", np.array(test_labels).shape
print "Mini TrainX Data shape: ", mini_trainX.shape
print "Mini TrainX Label shape: ", np.array(mini_train_labels).shape

CrimeX Data shape:  (878049L, 24L)
CrimeX Label shape:  (878049L, 1L)
testX Data shape:  (189025L, 24L)
testX Label shape: (189025L, 1L)
Mini TrainX Data shape:  (100000L, 24L)
Mini TrainX Label shape:  (100000L, 1L)


In [24]:
train_labels = np.array(train_labels).ravel()
test_labels = np.array(test_labels).ravel()
crime_labels = np.array(crime_labels).ravel()

In [25]:
lr = LogisticRegression()
lr.fit(crimeX, crime_labels)
print "Completed the training"

Completed the training


In [26]:
mini_testX, mini_test_labels = testX[:10000], test_labels[:10000]
f1_score = metrics.f1_score(lr.predict(mini_testX), mini_test_labels)
print f1_score

0.314998194696


  'recall', 'true', average, warn_for)


In [27]:
print "hello"

hello


In [28]:
sub_test_data = []
with open("../test.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row)]
        if first_row:
            first_row = False
        else:
            sub_test_data.append(data)
            

def get_formatted_test_data(test_data):
    format_data = []
    count = 0
    
    for row in test_data:
        count += 1
        data = []
        data.extend(convert_date(row[1]))
        week_day = row[2]
        crime_day = [1 if week_day == d else 0 for d in day] 
        data.extend(crime_day)
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts]
        data.extend(pddistrict)
        longitude = float(row[5])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std']
        latitude = float(row[6])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std']
        data.append(long_norm)
        data.append(lat_norm)
        
        if 'block' in row[4].lower():
            address = 1
        else:
            address = 0
        data.append(address)
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    

testX = np.array(get_formatted_test_data(sub_test_data), dtype=np.float32)

        

In [34]:
print probs.shape
print probs[:1]

(884262L, 39L)
[[  3.19594531e-03   1.11893378e-01   9.99398335e-05   6.87935482e-04
    4.84346270e-02   1.60246933e-03   8.95964469e-04   4.53638414e-02
    4.03224385e-03   4.32879921e-04   1.37376523e-04   8.49872448e-04
    4.40179386e-03   8.40223308e-03   2.44692320e-04   3.79233163e-03
    1.51675058e-01   1.70899243e-03   1.82237828e-04   6.56120507e-02
    8.98320326e-02   1.22974919e-01   1.66476422e-05   2.12681594e-04
    4.86397578e-03   2.04967091e-02   4.48285127e-03   2.00269473e-02
    3.74452528e-03   1.79955227e-04   3.96028544e-03   4.46248133e-04
    3.48269309e-02   8.51272773e-06   6.29219886e-03   6.90577879e-02
    1.00957971e-01   4.42163318e-02   1.97566245e-02]]


In [31]:
probs = lr.predict_proba(testX)

In [33]:
import gzip
with gzip.open('submission-matrix.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(crime_labels)))
    
    for i, prob in enumerate(probs):
        out.writerow([i] + list(prob))
print "Job completed"     

Job completed
