In [1]:
%matplotlib inline

# General Libraries
import re
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss



In [2]:
data_df = pd.read_csv("../train.csv")

v_datetime = pd.to_datetime(data_df['Dates'])
v_date = v_datetime.dt.date
v_date_diff = (v_date - v_date.min())/np.timedelta64(1, 'D')
data_df['DateDiff'] = v_date_diff

crime_data = np.array(data_df[['Dates', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'DateDiff']].values)
crime_labels = np.array(data_df[['Category']].values.ravel())
print crime_data[:1]

[['2015-05-13 23:53:00' 'WARRANT ARREST' 'Wednesday' 'NORTHERN'
  'ARREST, BOOKED' 'OAK ST / LAGUNA ST' -122.425891675136 37.7745985956747
  4510.0]]


In [3]:
test_data_df = pd.read_csv("../test.csv")

v_datetime = pd.to_datetime(test_data_df['Dates'])
v_date = v_datetime.dt.date
v_date_diff = (v_date - v_date.min())/np.timedelta64(1, 'D')
test_data_df['DateDiff'] = v_date_diff

test_data = np.array(test_data_df.values)
print test_data[:1]
test_data_df.columns.values

[[0L '2015-05-10 23:59:00' 'Sunday' 'BAYVIEW' '2000 Block of THOMAS AV'
  -122.39958770418998 37.7350510103906 4512.0]]


array(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y',
       'DateDiff'], dtype=object)

In [4]:
from datetime import datetime
def convert_date(date_val):
    date_list = []
    date, time = date_val.split()
    date_list.append(int(date.split('-')[0]))
    date_list.append(int(date.split('-')[1]))
    date_list.append(int(date.split('-')[2]))
    time_in_min = int(time.split(':')[0]) * 60.0 + int(time.split(':')[1])
    date_list.append(time_in_min)
    date_list.append(int(time.split(':')[0]))
    time_hour = int(time.split(':')[0])
    if time_hour < 6:
        time_of_day = 'Twilight'
    elif time_hour < 12:
        time_of_day = 'Morning'
    elif time_hour < 18:
        time_of_day = 'Afternoon'
    else:
        time_of_day = 'Night'
    date_list.append(time_of_day)
    return date_list

print crime_data[0][0]
print convert_date(crime_data[0][0])

def find_mean_std(train_data, index):
    data = []
    for row in train_data:
        data.append(float(row[index]))
    
    data_arr = np.array(data, dtype=np.float32)
    print np.mean(data_arr)
    data_dict = {}
    data_dict['mean'] = np.mean(data_arr)
    data_dict['std'] = np.std(data_arr)
    return data_dict

2015-05-13 23:53:00
[2015, 5, 13, 1433.0, 23, 'Night']


In [5]:
districts = ['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL', 'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN']
week_day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daytime = ['Twilight', 'Morning', 'Afternoon', 'Night']
year_range = ['2003-2006', '2006-2009', '2009-2012', 'After 2015']
crime_year = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
crime_month = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
crime_day = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 ,30 ,31]
crime_hour = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]

In [6]:
''' Longitude & Latitude - Normalization '''
long_dict = find_mean_std(crime_data, 6)
lat_dict = find_mean_std(crime_data, 7)

-122.423
37.771


In [7]:

def get_formatted_data(train_data):
    format_data = []
    count = 0
    
    for i in range(len(train_data)):
        count += 1
        data = []
        row = train_data[i]
        
        ''' Data Preparation - Crime date '''
        
        #date_list = convert_date(row[0])
        # [2015, 5, 13, 1433.0, 23, 'Night']
        
        date_arr = convert_date(row[0])
        year, month, day, time_in_min, hour, time_of_day = date_arr
        
        norm_year = [(year < 2010)*1, (year>=2010)*1]
        data.extend(norm_year)
        
        norm_month = [1 if month == m else 0 for m in crime_month]
        data.extend(norm_month)
        
        norm_day = [1 if day == d else 0 for d in crime_day]
        data.extend(norm_day)
        
        norm_hour = [1 if hour == h else 0 for h in crime_hour]
        data.extend(norm_hour)
        
        # Time of the day preparation
        day_time = [1 if time_of_day == td else 0 for td in daytime]
        data.extend(day_time)
        
        
        ''' Data Preparation - Day of the Week '''
        wk_day = row[2]
        
        crime_week_day = [1 if wk_day == d else 0 for d in week_day] 
        data.extend(crime_week_day) # Normalized
        
        ''' Data Preparation - District '''
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts] # Normalized
        data.extend(pddistrict)
        
        ''' Data Preparation - Longitude & Latitude '''
        '''
        longitude = float(row[6])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std'] # Normalized
        latitude = float(row[7])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std'] # Normalized
        '''
        
        long_norm = round(row[6],3)
        lat_norm = round(row[7], 3)
        
        #long_norm = round(row[6],3)
        #lat_norm = round(float(row[7]), 3)
        data.append(long_norm)
        data.append(lat_norm)
        
        #cluster_label = cl_index[i]
        #data.append(cluster_label)
        
        ''' Data Preparation - Address '''
        
        #address = row[5].lower()
        #addr = [1 if 'block' in address else 0] # Requires major improvement
        #data.extend(addr)
        
        date_diff = round(row[6], 4)
        data.append(date_diff)
        
        ''' Quantify the data '''
        format_data.append(np.array(data))           
        
    return format_data
    
    
crimeX = np.array(get_formatted_data(crime_data))
print "CrimeX Data shape: ", crimeX.shape
print "CrimeX Label shape: ", np.array(crime_labels).shape
print crimeX[:1]

CrimeX Data shape:  (878049L, 93L)
CrimeX Label shape:  (878049L,)
[[   0.        1.        0.        0.        0.        0.        1.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        1.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        0.        0.        0.        0.
     0.        0.        0.        0.        1.        0.        0.        0.
     1.        0.        0.        1.        0.        0.        0.        0.
     1.        0.        0.        0.        0.        0.        0.        0.
     0.        0.     -122.426    37.775  -122.4259]]


In [8]:
long_norm = -122.4260025
print round(long_norm, 3)

-122.426


In [None]:
def shuffle(X, y, seed=1337):
  np.random.seed(seed)
  shuffle = np.arange(len(y))
  np.random.shuffle(shuffle)
  X = X[shuffle]
  y = y[shuffle]
  return X, y

crimeX, crime_labels = shuffle(crimeX, crime_labels)

In [9]:
mini_trainX, mini_train_labels = crimeX[:200000], crime_labels[:200000]
mini_testX, mini_test_labels = crimeX[200000:400000], crime_labels[200000:400000]

1) Logistic Regression

In [10]:
params = {'C':[0.001, 0.01, 0.1, 0.2, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(penalty='l2'), params)
clf.fit(mini_trainX[:10000], mini_train_labels[:10000])
print "Best Params for Logistic Regression: ", clf.best_params_

Best Params for Logistic Regression:  {'C': 0.01}




In [11]:
clf = LogisticRegression(penalty='l2', C=0.01, tol=0.01)
clf.fit(mini_trainX, mini_train_labels)
print "Completed training with best C"
clf_accuracy = clf.score(mini_testX, mini_test_labels)
clf_probs = clf.predict_proba(mini_testX)
clf_log_loss = log_loss(mini_test_labels, clf_probs)
f1_score = metrics.f1_score(clf.predict(mini_testX), mini_test_labels)

print "Logistic Regression Results"
print "--------------------"
print "Accuracy: ", clf_accuracy
print "Log Loss: ", clf_log_loss
print "F1 Score: ", f1_score

  sample_weight=sample_weight)
  'recall', 'true', average, warn_for)


Completed training with best C
Logistic Regression Results
--------------------
Accuracy:  0.19537
Log Loss:  2.70455706744
F1 Score:  0.32687787045


In [14]:
clf = RandomForestClassifier(n_estimators=39)
clf.fit(mini_trainX, mini_train_labels)
print "Completed training with best C"
clf_accuracy = clf.score(mini_testX, mini_test_labels)
clf_probs = clf.predict_proba(mini_testX)
clf_log_loss = log_loss(mini_test_labels, clf_probs)
f1_score = metrics.f1_score(clf.predict(mini_testX), mini_test_labels)

print "Random Forest Results"
print "--------------------"
print "Accuracy: ", clf_accuracy
print "Log Loss: ", clf_log_loss
print "F1 Score: ", f1_score

Completed training with best C
Logistic Regression Results
--------------------
Accuracy:  0.225155
Log Loss:  7.35380556788
F1 Score:  0.278253872726


  sample_weight=sample_weight)


In [16]:
clf = BernoulliNB()
clf.fit(mini_trainX, mini_train_labels)
print "Completed training with best C"
clf_accuracy = clf.score(mini_testX, mini_test_labels)
clf_probs = clf.predict_proba(mini_testX)
clf_log_loss = log_loss(mini_test_labels, clf_probs)
f1_score = metrics.f1_score(clf.predict(mini_testX), mini_test_labels)

print "BernoulliNB Results"
print "--------------------"
print "Accuracy: ", clf_accuracy
print "Log Loss: ", clf_log_loss
print "F1 Score: ", f1_score

Completed training with best C
BernoulliNB Results
--------------------
Accuracy:  0.19025
Log Loss:  3.03587210942
F1 Score:  0.236026844657


  sample_weight=sample_weight)


Submission code

In [24]:
def get_formatted_test_data(test_data):
    format_data = []
    count = 0
    
    for i in range(len(test_data)):
        count += 1
        data = []
        
        row = test_data[i]
        
        ''' Data Preparation - Date '''
        date_arr = convert_date(row[1])
        year, month, day, time_in_min, hour, time_of_day = date_arr
        
        #norm_year = [1 if year == y else 0 for y in crime_year]
        norm_year = [(year < 2010)*1, (year>=2010)*1]
        data.extend(norm_year)
        
        norm_month = [1 if month == m else 0 for m in crime_month]
        data.extend(norm_month)
        
        norm_day = [1 if day == d else 0 for d in crime_day]
        data.extend(norm_day)
        
        norm_hour = [1 if hour == h else 0 for h in crime_hour]
        data.extend(norm_hour)
        
        # Time of the day preparation
        day_time = [1 if time_of_day == td else 0 for td in daytime]
        data.extend(day_time)
        
        ''' Data Preparation - Day of the Week '''
        wk_day = row[2]
        
        crime_week_day = [1 if wk_day == d else 0 for d in week_day] 
        data.extend(crime_week_day) # Normalized
        
        pdd = row[3]
        pddistrict = [1 if pdd == d else 0 for d in districts]
        data.extend(pddistrict)
        
        longitude = float(row[5])
        long_norm = (abs(longitude) - abs(long_dict['mean']))/long_dict['std']
        latitude = float(row[6])
        lat_norm = (abs(latitude) - abs(lat_dict['mean']))/lat_dict['std']
        data.append(long_norm)
        data.append(lat_norm)
        
        #test_cluster_label = clt_index[i]
        #data.append(test_cluster_label)
        
        address = row[4].lower()
        addr = [1 if 'block' in address else 0]
        data.extend(addr)
        
        date_diff = row[5]
        data.append(date_diff)
        
        format_data.append(np.array(data, dtype=np.float32))    
        
        
        
    return format_data
    
    

testX = np.array(get_formatted_test_data(test_data), dtype=np.float32)


In [2]:
lr = LogisticRegression(penalty='l2', C=0.01, tol=0.01)
lr.fit(crimeX, crime_labels)
print "Completed the training"

NameError: name 'LogisticRegression' is not defined

In [1]:
probs = lr.predict_proba(testX)
print probs.shape
print probs[:1]

NameError: name 'lr' is not defined

In [26]:
import gzip
with gzip.open('submission-matrix-5.csv.gz', 'wb') as f:
    out = csv.writer(f, lineterminator='\n')
    out.writerow(['Id'] + list(np.unique(crime_labels)))
    
    for i, prob in enumerate(probs):
        out.writerow([i] + list(prob))
print "Job completed"    

Job completed


In [31]:
def build_features(data):
    data['DateTime'] = pd.to_datetime(data['Dates'])
    date_vector = data['DateTime'].dt.date
    data['DateDiff'] = (date_vector - date_vector.min()) / np.timedelta64(1, 'D')
    data['Year'] = pd.DatetimeIndex(data['DateTime']).year
    data['Month'] = pd.DatetimeIndex(data['DateTime']).month
    data['Day'] = pd.DatetimeIndex(data['DateTime']).day
    data['Hour'] = pd.DatetimeIndex(data['DateTime']).hour
    data['SecondsDelta'] = (data.DateTime - pd.Timestamp('2013-01-01')) / np.timedelta64(1,'s')
    data['Weekend'] = (data.DayOfWeek == "Saturday") | (data.DayOfWeek == "Sunday")
    years = pd.get_dummies(data.Year)
    years.columns = ['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
    months = pd.get_dummies(data.Month)
    months.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    days = pd.get_dummies(data.Day)
    days.columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
    daysofweek = pd.get_dummies(data.DayOfWeek)
    hours = pd.get_dummies(data.Hour)
    hours.columns = ['12AM', '1AM', '2AM', '3AM', '4AM', '5AM',
                     '6AM', '7AM', '8AM', '9AM', '10AM', '11AM',
                     '12PM', '1PM', '2PM', '3PM', '4PM', '5PM',
                     '6PM', '7PM', '8PM', '9PM', '10PM', '11PM']
    districts = pd.get_dummies(data.PdDistrict)
    new_data = pd.concat([data, years, months, days, daysofweek, hours, districts], axis=1)
    return new_data

data = build_features(data_df)
test = build_features(test_data_df)

In [32]:
# Generate location-based dummies
XR3 = data['X'].round(decimals=3).apply(str)
YR3 = data['Y'].round(decimals=3).apply(str)
data_XR3s = pd.get_dummies(XR3)
data_YR3s = pd.get_dummies(YR3)    
XR3 = test['X'].round(decimals=3).apply(str)
YR3 = test['Y'].round(decimals=3).apply(str)
test_XR3s = pd.get_dummies(XR3)
test_YR3s = pd.get_dummies(YR3)    

#Subset the test to only include features that exist in training set
test_XR3s = test_XR3s[list(data_XR3s)]
test_YR3s = test_YR3s[list(data_YR3s)]

print data_XR3s.shape
print data_YR3s.shape
print
print test_XR3s.shape
print test_YR3s.shape

(878049, 147)
(878049, 107)

(884262, 147)
(884262, 107)


In [33]:
# Separate labels
train_labels = data.Category

# Create integer labels
panda_labels = pd.Categorical(data.Category).codes
train_labels_int = np.array(panda_labels).astype(np.int32)

# Drop Category, Descript and Resolution columns since they are not in the test set.
# Drop non-numerics too - they are accounted for as dummy variables.
train_data = data.drop(['Category', 'Descript', 'Resolution', 'DateTime', 'Dates', 'PdDistrict', 'Address', 'DayOfWeek'], axis=1)
train_data.Weekend = train_data.Weekend * 1
train_names = train_data.columns.values.tolist()

test_data = test.drop(['DateTime', 'Dates', 'PdDistrict', 'Address', 'DayOfWeek'], axis=1)
test_data.Weekend = test_data.Weekend * 1
test_names = test_data.columns.values.tolist()

In [35]:
features = ['Jan','Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', '12AM', '1AM', '2AM', '3AM', '4AM', '5AM', '6AM', '7AM', '8AM', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', '5PM', '6PM', '7PM', '8PM', '9PM', '10PM', '11PM', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
np_train_data = np.array(pd.concat([train_data[features], (data.Year < 2006) * 1, (data.Year < 2008) * 1, (data.Year < 2010) * 1, data_XR3s, data_YR3s], axis=1))

MemoryError: 

In [None]:
np_test_data = np.array(pd.concat([test_data[features], (test.Year < 2006) * 1, (test.Year < 2008) * 1, (test.Year < 2010) * 1, test_XR3s[list(data_XR3s)], test_YR3s[list(data_YR3s)]], axis=1))