In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import compose
from sklearn import pipeline
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn import tree
from sklearn import svm
from sklearn import neighbors

In [3]:
df_train = pd.read_csv(
    'data/train.csv', 
    usecols='Dates PdDistrict X Y Category'.split(),
    parse_dates=['Dates']
)
df_train.head()

Unnamed: 0,Dates,Category,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,PARK,-122.438738,37.771541


In [4]:
cats = df_train.Category.value_counts()
cats_pct = cats / cats.sum()
keep_cats = cats_pct[cats_pct.cumsum() <= 0.995].index
df_train = df_train[df_train.Category.isin(keep_cats)]

In [5]:
df_train.head()

Unnamed: 0,Dates,Category,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,PARK,-122.438738,37.771541


# Remove outlier coords

In [6]:
rows = df_train.Y == 90
df_train.loc[rows, 'Y'] = np.nan
df_train.loc[rows, 'X'] = np.nan
df_train = df_train.dropna()

# Join weather data

In [7]:
df_train['date'] = pd.to_datetime(df_train.Dates.dt.date)
df_train.head()

Unnamed: 0,Dates,Category,PdDistrict,X,Y,date
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599,2015-05-13
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599,2015-05-13
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414,2015-05-13
3,2015-05-13 23:30:00,LARCENY/THEFT,NORTHERN,-122.426995,37.800873,2015-05-13
4,2015-05-13 23:30:00,LARCENY/THEFT,PARK,-122.438738,37.771541,2015-05-13


In [8]:
weather = pd.read_csv(
    'data/weather.csv', 
    usecols='DATE PRCP TMIN TMAX'.split(),
    parse_dates=['DATE'], 
    index_col=['DATE']
)
weather.head()

Unnamed: 0_level_0,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003-01-01,0.0,59,46
2003-01-02,0.0,57,48
2003-01-03,0.0,63,47
2003-01-04,0.0,66,50
2003-01-05,0.0,69,49


In [9]:
df_train = df_train.join(weather, on='date')
df_train.head()

Unnamed: 0,Dates,Category,PdDistrict,X,Y,date,PRCP,TMAX,TMIN
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599,2015-05-13,0.0,61,50
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599,2015-05-13,0.0,61,50
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414,2015-05-13,0.0,61,50
3,2015-05-13 23:30:00,LARCENY/THEFT,NORTHERN,-122.426995,37.800873,2015-05-13,0.0,61,50
4,2015-05-13 23:30:00,LARCENY/THEFT,PARK,-122.438738,37.771541,2015-05-13,0.0,61,50


# Feature engineering on dates

In [10]:
df_train['weekday'] = df_train.Dates.dt.weekday
df_train['month'] = df_train.Dates.dt.month
df_train['year'] = df_train.Dates.dt.year
df_train['hour'] = df_train.Dates.dt.hour

# Extract features

In [11]:
df_train.head()

Unnamed: 0,Dates,Category,PdDistrict,X,Y,date,PRCP,TMAX,TMIN,weekday,month,year,hour
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599,2015-05-13,0.0,61,50,2,5,2015,23
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599,2015-05-13,0.0,61,50,2,5,2015,23
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414,2015-05-13,0.0,61,50,2,5,2015,23
3,2015-05-13 23:30:00,LARCENY/THEFT,NORTHERN,-122.426995,37.800873,2015-05-13,0.0,61,50,2,5,2015,23
4,2015-05-13 23:30:00,LARCENY/THEFT,PARK,-122.438738,37.771541,2015-05-13,0.0,61,50,2,5,2015,23


In [12]:
prep_coords = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    decomposition.PCA()
)

mapper = DataFrameMapper([
    ('X Y'.split(), prep_coords),
    ('Category', preprocessing.LabelEncoder()),
    ('PRCP TMAX PdDistrict year month weekday hour'.split(), preprocessing.FunctionTransformer(validate=False)),
#     ('year month weekday hour'.split(), preprocessing.OneHotEncoder(categories='auto')),
])

In [13]:
df_mapped = pd.DataFrame(
    mapper.fit_transform(df_train),
    columns='X Y Category PRCP TMAX PdDistrict year month weekday hour'.split(),
)

In [14]:
df_mapped.head()

Unnamed: 0,X,Y,Category,PRCP,TMAX,PdDistrict,year,month,weekday,hour
0,-0.134123,0.308631,26,0,61,NORTHERN,2015,5,2,23
1,-0.134123,0.308631,14,0,61,NORTHERN,2015,5,2,23
2,-0.932278,1.02129,14,0,61,NORTHERN,2015,5,2,23
3,-0.872079,1.10831,10,0,61,NORTHERN,2015,5,2,23
4,0.314564,0.578388,10,0,61,PARK,2015,5,2,23


In [15]:
df_mapped_dummies = pd.get_dummies(df_mapped, columns='PdDistrict year month weekday hour'.split())
df_mapped_dummies.head()

Unnamed: 0,X,Y,Category,PRCP,TMAX,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,-0.134123,0.308631,26,0,61,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,-0.134123,0.308631,14,0,61,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,-0.932278,1.02129,14,0,61,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,-0.872079,1.10831,10,0,61,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0.314564,0.578388,10,0,61,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
df_train['PdDistrict'].value_counts()

SOUTHERN      156182
MISSION       119183
NORTHERN      104760
BAYVIEW        89003
CENTRAL        84948
TENDERLOIN     81376
INGLESIDE      78449
TARAVAL        65254
PARK           49155
RICHMOND       45009
Name: PdDistrict, dtype: int64

In [17]:
X = df_mapped_dummies.sample(100_000, random_state=123)
y = X.pop('Category').astype(int)

# Train a quick GB

In [18]:
clf = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=1, subsample=0.5)
clf.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [19]:
ypred = clf.predict_proba(X)

In [20]:
from sklearn.metrics import make_scorer, log_loss
log_loss(y, ypred)

2.5265332252894663

In [21]:
fis = pd.DataFrame(
    {'importance': clf.feature_importances_}, 
    index=X.columns
)
fis.sort_values('importance', ascending=False)

Unnamed: 0,importance
X,0.297638
Y,0.221516
PdDistrict_TENDERLOIN,0.150724
PdDistrict_NORTHERN,0.030004
year_2005,0.024084
year_2004,0.021113
year_2014,0.017019
year_2003,0.016899
year_2013,0.016342
hour_2,0.015755


In [22]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [23]:
X.head()

Unnamed: 0,X,Y,PRCP,TMAX,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
152109,0.242267,-1.48904,0.0,73,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
628096,0.253093,-1.66903,0.0,68,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87836,1.12935,0.100991,0.0,69,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
260523,-1.46336,0.175253,0.0,73,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
595237,1.0534,1.37572,0.02,63,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [117]:
y.head()

152109    14
628096     7
87836     22
260523    14
595237     5
Name: Category, dtype: int64

In [120]:
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, log_loss

def cross_val(clf, X, y):
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
    scores = []
    for train, test in kf.split(X, y):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])
        score = log_loss(y[test], y_pred, labels=y[train])
        print(score)
        scores.append(score)
    return np.array(scores)
        
estimators = {
    'gb': ensemble.GradientBoostingClassifier(n_estimators=50, max_depth=2),
    'k10': neighbors.KNeighborsClassifier(n_neighbors=10),
    'k20': neighbors.KNeighborsClassifier(n_neighbors=10),
#     'rfc': ensemble.RandomForestClassifier(n_estimators=100),
    'sgd': linear_model.SGDClassifier(loss='log'),
    'logreg': linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'),
    'svc': svm.SVC(gamma='auto', probability=True),
#     'tree': tree.DecisionTreeClassifier(),
}

for name, clf in estimators.items():
    print(f'Check {name}')
    scores = cross_val(clf, X.values, y.values)
    print(f' ==> {scores.mean()}')

Check gb
2.59946368588776
2.5734645319563976
2.5550269134482706
 ==> 2.5759850437641427
Check k10
15.205664500413405
15.007207715887347
15.228184585611082
 ==> 15.147018933970612
Check k20
15.205664500413405
15.007207715887347
15.228184585611082
 ==> 15.147018933970612
Check sgd


  np.exp(prob, prob)


29.638001593442524


  np.exp(prob, prob)


28.691696207554262


  np.exp(prob, prob)


29.97618137044815
 ==> 29.43529305714831
Check logreg




2.6244739548726703




2.6131539950223455




2.598507866395622
 ==> 2.612045272096879
Check svc
2.6002707860572656




2.600849890532593




2.594039763068964
 ==> 2.5983868132196077


In [28]:
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [29]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [1, 2],
    'subsample': [0.1, 0.5, 1.0],
}

In [30]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
clf = GridSearchCV(
    ensemble.GradientBoostingClassifier(),
    param_grid,
    cv=cv,
    scoring='neg_log_loss',
    n_jobs=-1,
)
clf.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=123, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 200], 'max_depth': [1, 2], 'subsample': [0.1, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [31]:
clf.best_params_

{'max_depth': 2, 'n_estimators': 200, 'subsample': 1.0}

In [32]:
clf.best_score_

-2.495816516212204

In [None]:
clf = estimators['logreg']
clf.fit(X, y)
y_pred = clf.predict_proba(X)
log_loss(y, y_pred, labels=y)

In [None]:
26.81 * 2 * 2 + 51

In [None]:
7987 / 2013

In [None]:
scores = cross_val_score(ml_pipe, X, y, cv=kf, scoring='neg_log_loss') # make_scorer(log_loss, needs_proba=True, labels=y))
scores.mean()

In [None]:
ml_pipe.fit(df_train, df_train.enc_cat)

In [None]:
clf.feature_importances_

In [None]:
tree.export_graphviz(ml_pipe.steps[-1][1], 'tree.dot')
!dot -Tpng tree.dot -o tree.png

In [None]:
!open tree.png

In [None]:
enc.fit(df_train, df_train.y)

In [None]:
pred = enc.predict(df_train)

In [None]:
from sklearn import metrics

In [None]:
metrics.balanced_accuracy_score(df_train.y, pred)

In [None]:
metrics??

In [None]:
print(metrics.classification_report(df_train.y, pred))