# San Francisco Crime Data Test Model

In [1]:
# Base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Tools
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale
# Metrics
from sklearn.metrics import log_loss
# Classifiers
from sklearn.ensemble import RandomForestClassifier

In [7]:
rs = 42 # random_state

In [2]:
data = pd.read_csv('./data/sf_crime/train.csv')

In [4]:
def transform_data(data):
    data['Dates'] = pd.to_datetime(data['Dates'])
    data['Month'] = data['Dates'].apply(lambda i: i.month)
    data['Year'] = data['Dates'].apply(lambda i: i.year)
    data['Hour'] = data['Dates'].apply(lambda i: i.hour)
    data['Minutes'] = data['Dates'].apply(lambda i: \
                                          i.hour*60 + i.minute)
#     data['Night'] = (data['Hour'] > 19) | (data['Hour'] < 7)

    categorycal_cols = ['Category', 'Descript', 'DayOfWeek', 'PdDistrict',
                        'Resolution', 'Address', 'Night']
    for category in categorycal_cols:
        if category in data:
            data[category] = data[category].astype('category')
    
    return data

In [5]:
data = transform_data(data)

In [220]:
data.head(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Month,Year,Hour,Minutes
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,2015,23,1433
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,2015,23,1433


In [60]:
def split_data(X, y):
    return train_test_split(X, y, test_size=0.3, random_state=42)

### Random forest

In [221]:
y = data['Category']#.cat.codes
X = data[['X', 'Y', 'Year', 'Minutes']]
X_train, X_test, y_train, y_test = split_data(X, y)


clf = RandomForestClassifier(n_estimators=70, max_depth=15, n_jobs=4,
                             random_state=rs, verbose=True)

clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)
loss = log_loss(y_test, proba)

print(loss)

#“gini” for the Gini impurity and “entropy”

# 2.36056042616

[Parallel(n_jobs=4)]: Done   1 out of  70 | elapsed:    1.9s remaining:  2.3min
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:   36.8s finished
[Parallel(n_jobs=4)]: Done   1 out of  70 | elapsed:    0.1s remaining:   13.1s
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:    2.9s finished


2.36056042616


# Отчет

'X', 'Y', 'Month', 'Year',
n_estimators=30, max_depth=10:
2.45973964768

'X', 'Y', 'Month', 'Year', 'Hour'
n_estimators=30, max_depth=10:
2.45612199334

'X', 'Y', 'Month', 'Year', 'Hour'
n_estimators=70, max_depth=15:
2.3770777449

Внезапно, если выкинуть месяц - результат улучшается.
'X', 'Y', 'Month', 'Year', 'Hour'
n_estimators=70, max_depth=15:
2.37622850483

Если выкинуть год но оставить месяц - результат ухудшается.
'X', 'Y', 'Month', 'Year', 'Hour'
n_estimators=70, max_depth=15:
2.39490375842

Если использовать только координаты и час - результат так же ухудшается.
'X', 'Y', 'Hour'
n_estimators=70, max_depth=15:
2.41821809425

Добавление фичи Minutes позволило улучшить результат
'X', 'Y', 'Year', 'Hour', 'Minutes'
n_estimators=70, max_depth=15
2.36959679856

Выкидываем часы и получаем очередно улучшение модели
'X', 'Y', 'Year', 'Minutes'
n_estimators=70, max_depth=15
2.36056042616 - *Best*

Нормализация по координатам ухудшает модель
2.361

(используются 'X', 'Y', 'Month', 'Year', 'Hour')
При обучении max_depath=10, при количестве деревьев до 70 качество log_loss улучгается (вплоть до 2.4517821589206741), после 70 деревьев показатели ухудшаются.

При обучении n_estimators=30, качество log_loss улучшается до 15, деревьев,  (2.4033074972863218). После 15 деревьев показатели резко ухудшаются.

Совмещая два вышеуказанных показателя (n_estimators=70, max_depth=15) удалось получить: 2.3770777449

Введение переменной Night ухудшает модель (2.39742976255)

Рассчет на 300 деревьях ухудшает результат (2.38157899729)

Попытка построить модель отдельно для Night и отдельно для Day ухудшает модель (2.42175093186 для дня, 2.5 для ночи. Под ночью понималось время > 19 и < 7)

In [179]:
# Попробуем отдельно попредсказывать для лета и зимы
summer_data = data[(data['Month']==6) |
                   (data['Month']==7) |
                   (data['Month']==8)]
y = summer_data['Category']
X = summer_data[['X', 'Y', 'Year', 'Minutes']]
X_train, X_test, y_train, y_test = split_data(X, y)


clf = RandomForestClassifier(n_estimators=300, max_depth=14, n_jobs=4,
                             random_state=rs, verbose=True)
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)
loss = log_loss(y_test, proba)
print(d, loss)

# 14 деревьев - оптимально
# Для лета (month=6,7,8)     
# n=300, max_debth=14
# 300, 2.3849

# Для зимы (month=12,1,2)
# n=300, max_debth=14, 2.396

[Parallel(n_jobs=4)]: Done   1 out of 300 | elapsed:    0.1s remaining:  1.1min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:   17.0s finished
[Parallel(n_jobs=4)]: Done   1 out of 300 | elapsed:    0.0s remaining:   12.5s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.6s finished


(20, 2.3854960849457894)


In [148]:
night_data = data[data['Night'] == 0]
y = night_data['Category']
X = night_data[['X', 'Y', 'Month', 'Year', 'Hour']]
X_train, X_test, y_train, y_test = split_data(X, y)

clf = RandomForestClassifier(n_estimators=30, max_depth=10, n_jobs=4,
                             random_state=rs, verbose=True)
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)
loss = log_loss(y_test, proba)
print(loss)


[Parallel(n_jobs=4)]: Done   1 out of  30 | elapsed:    0.6s remaining:   19.3s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Done   1 out of  30 | elapsed:    0.0s remaining:    2.7s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.5s finished


2.42175093186


In [8]:
# Submit

test_data = pd.read_csv('./data/sf_crime/test.csv')
test_data = transform_data(test_data)
test_data = test_data[['X', 'Y', 'Year', 'Hour']]

X = data[['X', 'Y', 'Year', 'Hour']]
y = data['Category']
clf = RandomForestClassifier(n_estimators=70, max_depth=15, n_jobs=4,
                             random_state=rs, verbose=True)
clf.fit(X, y)

[Parallel(n_jobs=4)]: Done   1 out of  70 | elapsed:    1.9s remaining:  2.3min
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:   37.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=4,
            oob_score=False, random_state=42, verbose=True,
            warm_start=False)

In [9]:
proba = clf.predict_proba(test_data)

[Parallel(n_jobs=4)]: Done   1 out of  70 | elapsed:    0.5s remaining:   38.6s
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:   47.9s finished


In [23]:
ans = []

for id in xrange(len(proba)):
    row = list(proba[id])
    row = [id] + row
    ans.append(row)

ans = pd.DataFrame(ans)
ans.columns = ['Id'] + list(clf.classes_)

print(ans.head(2))
# item = list(proba[0])
# print len(item)

# max_index = item.index(max_item)
# item = [0 for i in xrange(len(item))]
# item[max_index] = 1
# print max_item, max_index

# print item

# print(len(y.unique()))

# print()

   Id     ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0  0.002240  0.118271    0.000033  0.000446  0.021406            0.001727   
1   1  0.001321  0.094248    0.000057  0.001204  0.012745            0.002711   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                      0.00566       0.019943     0.000659     ...        
1                      0.00298       0.042819     0.003727     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY   SUICIDE  SUSPICIOUS OCC  \
0                   0.000153         0.004651  0.001098        0.044409   
1                   0.000236         0.002473  0.001038        0.036614   

       TREA  TRESPASS  VANDALISM  VEHICLE THEFT  WARRANTS  WEAPON LAWS  
0  0.000000  0.004337   0.073504       0.163536  0.023281     0.021432  
1  0.000118  0.002719   0.032138       0.086312  0.072499     0.027582  

[2 rows x 40 columns]


In [24]:
ans.to_csv('./data/sf_crime/res.csv', index=False)

In [21]:
result_predict = clf.predict(test_data)

[Parallel(n_jobs=4)]: Done   1 out of  70 | elapsed:    0.6s remaining:   51.3s
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:   55.0s finished


KeyboardInterrupt: 