In [1]:
%matplotlib inline

# General Libraries
import re
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

Load the Crime data.

In [2]:
crime_data = []
crime_labels = []
with open("../train.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row) if x != 1]
        name = [y for x, y in enumerate(row) if x == 1]
        
        if first_row:
            first_row = False
            feature_names = data
            print feature_names
        else:
            crime_data.append(data)
            crime_labels.append(name)

['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']


In [3]:
# Divide the Crime training set into train data, test data, dev data
train_data, train_labels = crime_data[:500000], crime_labels[:500000]
crime_test, crime_test_labels = crime_data[500000:], crime_labels[500000:]
num_test = len(crime_test)
print num_test
dev_data, dev_labels = crime_test[:num_test/2], crime_test_labels[:num_test/2]
test_data, test_labels = crime_test[num_test/2:], crime_test_labels[num_test/2:]
mini_train_data, mini_train_labels = crime_data[200000:300000], crime_labels[200000:300000]

print "Size of train data: ", len(train_data)
print "Size of dev data: ", len(dev_data)
print "Size of test data: ", len(test_data)

378049
Size of train data:  500000
Size of dev data:  189024
Size of test data:  189025


In [4]:
from datetime import datetime
def convert_date(date_val):
    date_list = []
    date, time = date_val.split()
    date_list.append(int(date.split('-')[0]))
    date_list.append(int(date.split('-')[1]))
    date_list.append(int(date.split('-')[2]))
    time_in_min = int(time.split(':')[0]) * 60.0 + int(time.split(':')[1])
    date_list.append(time_in_min)
    date_list.append(int(time.split(':')[0]))
    time_hour = int(time.split(':')[0])
    if time_hour < 6:
        time_of_day = 'Twilight'
    elif time_hour < 12:
        time_of_day = 'Morning'
    elif time_hour < 18:
        time_of_day = 'Afternoon'
    else:
        time_of_day = 'Night'
    date_list.append(time_of_day)
    return date_list

print train_data[0][0]
print convert_date(train_data[0][0])

def find_mean_std(train_data, index):
    data = []
    for row in train_data:
        data.append(float(row[index]))
    
    data_arr = np.array(data, dtype=np.float32)
    print np.mean(data_arr)
    data_dict = {}
    data_dict['mean'] = np.mean(data_arr)
    data_dict['std'] = np.std(data_arr)
    return data_dict

2015-05-13 23:53:00
[2015, 5, 13, 1433.0, 23, 'Night']


In [5]:
districts = ['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN']
day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daytime = ['Twilight', 'Morning', 'Afternoon', 'Night']

Feature Engineering - Section I:

1) Crime Date - Normalization

2) Longitude & Latitude - Normalization

In [6]:
''' Crime Date - Normalization '''

year_data = []
mon_data = []
day_data = []
time_data = []
time_of_day_data = []
for row in crime_data:
    date_arr = convert_date(row[0])
    year_data.append(float(date_arr[0]))
    mon_data.append(float(date_arr[1]))
    day_data.append(float(date_arr[2]))
    time_data.append(float(date_arr[3]))
    time_of_day_data.append(date_arr[5])
                     

year_arr = np.array(year_data, dtype=np.float32)
mon_arr = np.array(mon_data, dtype=np.float32)
day_arr = np.array(day_data, dtype=np.float32)
time_arr = np.array(time_data, dtype=np.float32)
time_of_day_arr = np.array(time_of_day_data)

date_dict = {}
date_dict['mean_year'] = np.mean(year_arr)
date_dict['std_year'] = np.std(year_arr)                               
date_dict['mean_mon'] = np.mean(mon_arr)
date_dict['std_mon'] = np.std(mon_arr)
date_dict['mean_day'] = np.mean(day_arr)
date_dict['std_day'] = np.std(day_arr)
date_dict['mean_time'] = np.mean(time_arr)
date_dict['std_time'] = np.std(time_arr)


''' Longitude & Latitude - Normalization '''
long_dict = find_mean_std(train_data, 6)
lat_dict = find_mean_std(train_data, 7)

-122.423
37.7675


Preparing the Data

In [7]:

def get_formatted_data(train_data):
    format_data = []
    count = 0
    
    for row in train_data:
        count += 1
        data = []
        
        ''' Data Preparation - Crime date '''
        
        #date_list = convert_date(row[0])
        date_arr = convert_date(row[0])
        
        data.extend(date_arr[:3]) # Normalized date
        
        # Time of the day preparation
        time_of_day = date_arr[5]
        day_time = [1 if time_of_day == t else 0 for t in daytime]
        data.extend(day_time)
        
        #year_val = (float(date_list[0]) - date_dict['mean_year'])/date_dict['std_year']
        #data.append(year_val)
        #mon_val = (float(date_list[1]) - date_dict['mean_mon'])/date_dict['std_mon']
        #data.append(mon_val)
        #day_val = (float(date_list[2]) - date_dict['mean_day'])/date_dict['std_day']
        #data.append(day_val)
        #time_val = (float(date_list[3]) - date_dict['mean_time'])/date_dict['std_time']
        #data.append(time_val)
        
        ''' Data Preparation - Day of the Week '''
        week_day = row[2]
        crime_day = [1 if week_day == d else 0 for d in day] 
        data.extend(crime_day) # Normalized
        
        ''' Data Preparation - District '''
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts] # Normalized
        data.extend(pddistrict)
        
        ''' Data Preparation - Longitude & Latitude '''
        longitude = float(row[6])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std'] # Normalized
        latitude = float(row[7])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std'] # Normalized
        data.append(long_norm)
        data.append(lat_norm)
        
        ''' Data Preparation - Address '''
        
        address = row[5].lower()
        addr = [1 if 'block' in address else 0] # Requires major improvement
        data.extend(addr)
        
        # Explore external map source
        
        ''' Quantify the data '''
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    
crimeX = np.array(get_formatted_data(crime_data), dtype=np.float32)
trainX = np.array(get_formatted_data(train_data), dtype=np.float32)
testX = np.array(get_formatted_data(test_data), dtype=np.float32)
devX = np.array(get_formatted_data(dev_data), dtype=np.float32)
mini_trainX = np.array(get_formatted_data(mini_train_data), dtype=np.float32)
print trainX[:1]

[[  2.01500000e+03   5.00000000e+00   1.30000000e+01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.29296839e-01   2.97121316e-01   0.00000000e+00]]


In [8]:
print "CrimeX Data shape: ", crimeX.shape
print "CrimeX Label shape: ", np.array(crime_labels).shape
print "testX Data shape: ", testX.shape
print "testX Label shape:", np.array(test_labels).shape
print "Mini TrainX Data shape: ", mini_trainX.shape
print "Mini TrainX Label shape: ", np.array(mini_train_labels).shape

CrimeX Data shape:  (878049L, 27L)
CrimeX Label shape:  (878049L, 1L)
testX Data shape:  (189025L, 27L)
testX Label shape: (189025L, 1L)
Mini TrainX Data shape:  (100000L, 27L)
Mini TrainX Label shape:  (100000L, 1L)


In [9]:
train_labels = np.array(train_labels).ravel()
test_labels = np.array(test_labels).ravel()
crime_labels = np.array(crime_labels).ravel()

In [80]:
lr = LogisticRegression()
lr.fit(crimeX, crime_labels)
print "Completed the training"

Completed the training


In [82]:
mini_testX, mini_test_labels = testX[:10000], test_labels[:10000]
f1_score = metrics.f1_score(lr.predict(mini_testX), mini_test_labels)
print f1_score

0.306096502629


In [1]:
print f1_score

NameError: name 'f1_score' is not defined

In [14]:
sub_test_data = []
with open("../test.csv", "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    first_row = True
    for row in reader:
        data = [y for x, y in enumerate(row)]
        if first_row:
            first_row = False
        else:
            sub_test_data.append(data)
            

def get_formatted_test_data(test_data):
    format_data = []
    count = 0
    
    for row in test_data:
        count += 1
        data = []
        
        date_arr = convert_date(row[1])
        data.extend(date_arr[:3]) # Normalized date
        time_of_day = date_arr[5]
        day_time = [1 if time_of_day == t else 0 for t in daytime]
        data.extend(day_time)
        
        #year_val = (float(date_arr[0]) - date_dict['mean_year'])/date_dict['std_year']
        #data.append(year_val)
        #mon_val = (float(date_arr[1]) - date_dict['mean_mon'])/date_dict['std_mon']
        #data.append(mon_val)
        #mon_val = (float(date_arr[2]) - date_dict['mean_day'])/date_dict['std_day']
        #data.append(day_val)
        #time_val = (float(date_arr[3]) - date_dict['mean_time'])/date_dict['std_time']
        #data.append(time_val)
        
        week_day = row[2]
        crime_day = [1 if week_day == d else 0 for d in day] 
        data.extend(crime_day)
        
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts]
        data.extend(pddistrict)
        
        longitude = float(row[5])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std']
        latitude = float(row[6])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std']
        data.append(long_norm)
        data.append(lat_norm)
        
        if 'block' in row[4].lower():
            address = 1
        else:
            address = 0
        data.append(address)
        format_data.append(np.array(data, dtype=np.float32))           
        
    return format_data
    
    

testX = np.array(get_formatted_test_data(sub_test_data), dtype=np.float32)

        

In [75]:
print probs.shape
print probs[:1]

(884262L, 39L)
[[  6.72197218e-03   1.12152462e-01   3.38357114e-05   7.28435745e-04
    5.05806552e-02   1.83534645e-03   1.64564062e-03   3.74130614e-02
    4.83681426e-03   2.18599441e-04   1.10502521e-04   6.62701942e-04
    1.96106086e-03   5.16737908e-03   1.34772426e-04   4.60024743e-03
    1.50101637e-01   1.74168133e-03   1.91504377e-04   5.79770180e-02
    8.23265381e-02   1.19509862e-01   1.47748012e-05   4.39417475e-04
    4.35949869e-03   2.69940631e-02   2.93741822e-03   2.68803424e-02
    4.88154113e-03   1.21387722e-04   4.88781522e-03   4.56089763e-04
    3.27736717e-02   8.49554605e-06   5.93062189e-03   8.17195143e-02
    1.08533927e-01   3.80170909e-02   2.03926025e-02]]


In [17]:
print "hello"

hello


In [74]:
probs = lr.predict_proba(testX)

In [76]:
import gzip
with gzip.open('submission-matrix.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(crime_labels)))
    
    for i, prob in enumerate(probs):
        out.writerow([i] + list(prob))
print "Job completed"     

Job completed


In [10]:
clf = RandomForestClassifier(n_estimators=25)
clf.fit(crimeX, crime_labels)
print "Random Forest Training Completed"


Random Forest Training Completed


In [12]:
mini_testX, mini_test_labels = testX[:10000], test_labels[:10000]
f1_score = metrics.f1_score(clf.predict(mini_testX), mini_test_labels)
print f1_score

0.882141746499


  sample_weight=sample_weight)


In [16]:
clf_probs = clf.predict_proba(testX)
print clf_probs.shape
print clf_probs[:1]

(884262L, 39L)
[[ 0.          0.18        0.          0.          0.01333333  0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.48        0.          0.          0.03        0.07
   0.02        0.          0.          0.          0.02        0.          0.
   0.          0.          0.          0.          0.04        0.          0.
   0.04        0.08        0.02666667  0.        ]]


In [18]:
import gzip
with gzip.open('submission-matrix.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(crime_labels)))
    
    for i, prob in enumerate(clf_probs):
        out.writerow([i] + list(prob))
print "Job completed" 

Job completed


In [19]:
print "hello"

hello
