# Baseline script of San Francisco Crime Classification

## Goal
  - Make baseline script. Hope to help all mentees.
  - SVN: 너무 오래걸려서 포기. [Link](https://datascience.stackexchange.com/questions/989/svm-using-scikit-learn-runs-endlessly-and-never-completes-execution)
  - Logistic Regression: 2.66
  - KNN(k=500): 2.77

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
train = pd.read_csv("../data/train.csv")
train.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414


In [3]:
test = pd.read_csv("../data/test.csv")
test.head(3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212


## Preprocess

In [4]:
from sklearn.utils import shuffle

train = shuffle(train, random_state=0)

In [5]:
feature_names = ["X", "Y"]
label_name = "Category"

train_X = train[feature_names]
test_X = test[feature_names]

train_y = train[label_name]

In [6]:
from sklearn import preprocessing

train_X = preprocessing.scale(train_X)
test_X = preprocessing.scale(test_X)

## Cross Validation Scoring

In [7]:
from sklearn.cross_validation import cross_val_score

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gaussian_score = cross_val_score(GaussianNB(), train_X, train_y, scoring='log_loss', cv=5).mean()
bernoulli_score = cross_val_score(BernoulliNB(), train_X, train_y, scoring='log_loss', cv=5).mean()

print("GaussianNB = {0:.6f}".format(gaussian_score))
# print("MultinomialNB = {0:.6f}".format(multimonial_score))
print("BernoulliNB = {0:.6f}".format(bernoulli_score))

### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', multi_class='ovr', n_jobs=-1, verbose=1)

logistic_score = cross_val_score(model, train_X, train_y, scoring='log_loss', cv=2).mean()
print("LogisticRegression = {0:.6f}".format(-1.0 * logistic_score))

[LibLinear]convergence after 705 epochs took 252 seconds
[LibLinear]LogisticRegression = 2.666529


### SVM

In [None]:
from sklearn import svm

model = svm.SVC(probability=True, verbose=True)
svm_score = cross_val_score(model, train_X, train_y, scoring='log_loss', cv=2).mean()
print("SVM = {0:.6f}".format(-1.0 * svm_score))

### KNearestNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#n_neighbors를 몇으로 할지가 확실하지 않으므로 몇개 해본다 (메모리 너무 먹어서 포기)
#weights는 uniform이 좋다.

k_list = [500, 1000, 2000]
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k, weights='uniform', n_jobs=-1)
    knn_score = cross_val_score(model, train_X, train_y, scoring='log_loss', cv=2).mean()
    print("### k={0:d}, KNearestNeighbors = {2:.6f}".format(k, -1.0 * knn_score))

In [None]:
#위의 결과를 보고 최적의 값으로 model을 만든다
model = KNeighborsClassifier(n_neighbors=500, weights='uniform', n_jobs=-1)

## Prediction

In [10]:
model.fit(train_X, train_y)
prediction = model.predict_proba(test_X)

[LibLinear]

## Submission

In [11]:
sample = pd.read_csv("../data/sampleSubmission.csv", index_col="Id")
sample.head(3)

Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
submission = pd.DataFrame(prediction, index=sample.index)
submission.columns = sample.columns
submission.head(1)

Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.002655,0.092147,0.000431,0.000498,0.034749,0.005213,0.001855,0.07084,0.004864,0.001368,...,0.000195,0.005204,0.000424,0.031639,2.3e-05,0.009476,0.047395,0.045973,0.054855,0.015533


In [13]:
from datetime import datetime

current_time = datetime.now().strftime("%Y.%m.%d %H:%M:%S")
description = "baseline script"

filename = "../submission/{0} {1}.csv".format(current_time, description)

submission.to_csv(filename)