In [1]:
# Objetivo: Encontrar categorias de crimes

In [15]:
import numpy as np
import pandas as pd
import itertools
from datetime import datetime, time
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier
from sklearn.cross_validation import KFold, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
import json

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
from datetime import datetime
import gc
%matplotlib inline

## Import e tratamentos

In [3]:
def trata_vars(df, test=False):
    """Add calendar, period-of-day and address-derived feature columns to ``df``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Dates``, ``DayOfWeek``, ``PdDistrict`` and
        ``Address``. ``Dates`` may already be datetime-typed or may hold
        strings that ``pd.to_datetime`` can parse; the column itself is left
        untouched.
    test : bool, optional
        Accepted for backward compatibility and ignored. It used to guard a
        ``df.sort_values(by='Category')`` call whose result was discarded, so
        it never influenced the output.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added (in this order):
        ``Hour`` (hour as string), ``Periodo`` ('noite' / 'manha' / 'tarde'),
        ``Weekend`` (0/1), ``PeriodoDistrict``, ``HourDistrict``, ``Month``,
        ``Year``, ``Day`` (integers), ``YearMonth``, ``YearDistrict``,
        ``esquina`` (1 if the address contains '/'), ``Block`` (1 if the
        address contains 'Block'), ``MonthDistrict``, ``hour`` (integer hour)
        and ``Time`` (minutes since midnight).
    """
    dates = pd.to_datetime(df.Dates)
    year = dates.dt.year
    month = dates.dt.month
    hour = dates.dt.hour
    minute = dates.dt.minute

    df['Hour'] = hour.astype('str')

    # Period of day. Rows whose hour is missing match no mask and stay NaN.
    hour_values = hour.values
    periodo = np.full(len(df), np.nan, dtype=object)
    periodo[(hour_values >= 18) | (hour_values < 6)] = 'noite'
    periodo[(hour_values >= 6) & (hour_values < 12)] = 'manha'
    periodo[(hour_values >= 12) & (hour_values < 18)] = 'tarde'
    df['Periodo'] = periodo

    df['Weekend'] = df.DayOfWeek.isin(['Saturday', 'Sunday']).astype(int)

    # Interaction features are plain string concatenations.
    df['PeriodoDistrict'] = df.Periodo + df.PdDistrict
    df['HourDistrict'] = df.Hour + df.PdDistrict

    df['Month'] = month
    df['Year'] = year
    df['Day'] = dates.dt.day
    df['YearMonth'] = year.astype('str') + month.astype('str')
    df['YearDistrict'] = year.astype('str') + df.PdDistrict
    df['esquina'] = df.Address.str.contains('/').astype('int')
    df['Block'] = df['Address'].str.contains('Block').astype('int')
    df['MonthDistrict'] = month.astype('str') + df.PdDistrict

    df['hour'] = hour
    df['Time'] = hour * 60 + minute

    return df

In [4]:
def get_data(cols_x, cols_y):
    """Load ``train.csv.gz`` and build the feature matrix and label vector.

    Parameters
    ----------
    cols_x : list of str
        Columns of the feature-engineered frame to feed into
        ``pd.get_dummies``; non-numeric columns are one-hot encoded.
    cols_y : list of str
        Currently unused: the label is always the ``Category`` column. Kept so
        existing callers keep working.

    Returns
    -------
    X : numpy.ndarray
        One-hot encoded feature matrix, one row per de-duplicated record.
    Y : numpy.ndarray
        ``Category`` label for each row of ``X``.
    df : pandas.DataFrame
        De-duplicated, feature-engineered frame with an extra ``IndCat``
        column holding the position of the row's category in ``cats``.
    cats : numpy.ndarray
        Sorted unique category labels.
    """
    # DataFrame.from_csv no longer exists in pandas; read_csv with
    # index_col=0 and parse_dates=True is its documented equivalent (first
    # column becomes a date-parsed index, which reset_index turns back into
    # a regular column).
    df = pd.read_csv('train.csv.gz', index_col=0, parse_dates=True).reset_index()
    df = trata_vars(df)

    df = df.drop_duplicates()

    cats = np.sort(df.Category.unique())
    df_cat = pd.DataFrame({'Category': cats, 'IndCat': np.arange(len(cats))})

    # The merge also leaves df with a fresh 0..n-1 index, closing the gaps
    # that drop_duplicates left behind.
    df = df.merge(df_cat, how='left', on='Category')

    X = pd.get_dummies(df[cols_x]).values
    Y = df['Category'].values

    gc.collect()

    return X, Y, df, cats

In [5]:
# Feature columns passed to pd.get_dummies, and the label column.
cols_x = [
    'PdDistrict',
    'esquina',
    'Block',
    'Year',
    'Month',
    'Day',
    'hour',
    'Time',
    'X',
    'Y',
    'DayOfWeek',
]
cols_y = ['Category']

In [6]:
# Load the training set: feature matrix, labels, full frame and sorted categories.
X, Y, df, cats = get_data(cols_x=cols_x, cols_y=cols_y)

In [7]:
# df.alterX = (np.round(df.X * 10) / 10).astype(str)
# df.alterY = (np.round(df.Y * 10) / 10).astype(str)

In [8]:
# df.location = df.alterX + df.alterY + df.Category

In [9]:
# .value_counts().plot(kind='bar')

In [10]:
# Hold-out split: use the first of two stratified folds as train / test.
kf = StratifiedKFold(df.Category, n_folds=2, shuffle=True, random_state=500)
# Take the first fold once instead of materialising every fold twice.
train, test = next(iter(kf))

X_train, X_test = X[train], X[test]
Y_train, Y_test = Y[train], Y[test]
# .loc is the label-based replacement for the removed .ix indexer.
# NOTE(review): assumes df carries the 0..n-1 index produced by the merge in
# get_data, so the positional fold indices are also valid labels.
df_train = df.loc[train]
# df_train = df

In [11]:
# qtde = X_train.shape[0]
# # qtde = 200000

# if qtde != X_train.shape[0]:
#     X = np.array(X_train.iloc[:qtde])
#     Y = np.array(Y_train.iloc[:qtde])
# else:
#     X = np.asfortranarray(X_train)
#     Y = np.asfortranarray(Y_train)

In [12]:
# n_estimators=20,
#                         random_state=500,
#                         n_jobs=4,
# #                         criterion='entropy',
# #                             max_leaf_nodes=30,
#                         warm_start=True,
# #                                 min_samples_leaf=10,
# #                         max_features=None

In [16]:
def train_forest(train_x, train_y, test_x=None, test_y=None):
    """Fit a gradient-boosting classifier with 3-fold stratified CV reporting.

    For each fold the model is fitted on the fold's training part and the
    log loss on the fold's validation part is printed. If a hold-out set is
    supplied, its log loss is printed as well. The elapsed wall time is
    printed at the end.

    Parameters
    ----------
    train_x : numpy.ndarray
        Feature matrix; indexed positionally with the fold indices.
    train_y : numpy.ndarray
        Labels aligned with ``train_x``; also used to stratify the folds.
    test_x, test_y : numpy.ndarray, optional
        Hold-out features and labels. Scored only when ``test_x`` is given.

    Returns
    -------
    GradientBoostingClassifier
        The estimator as left by the last ``fit`` call.
        NOTE(review): no fit on the full ``train_x`` happens after the loop,
        so the returned model has presumably seen only the last fold's
        training part (warm_start is not enabled) -- confirm this is intended.
    """
    ini = datetime.now()

    forest = GradientBoostingClassifier(min_samples_split=1000)

    # Stratify on the labels that were passed in rather than on the global
    # df_train['IndCat']: IndCat is a one-to-one encoding of Category, so the
    # strata are the same, and the function now also works for inputs that
    # are not aligned with df_train (e.g. train_forest(X, Y)).
    kf = StratifiedKFold(np.asarray(train_y), n_folds=3, shuffle=True, random_state=500)
    print('Training forest...')
    for k, (train, test) in enumerate(kf, start=1):
        forest.fit(train_x[train], train_y[train])
        pred = forest.predict_proba(train_x[test])
        print('fold:', k, 'log loss:', log_loss(train_y[test], pred))
        del pred

    if test_x is not None:
        pred = forest.predict_proba(test_x)
        # Labelled as hold-out: this score used to be printed under the
        # number of the last CV fold, which it does not belong to.
        print('holdout log_loss:', log_loss(test_y, pred))
        del pred

    fim = datetime.now()
    print(fim - ini)
    return forest

In [17]:
# Cross-validate on the training half and score on the hold-out half.
forest = train_forest(
    train_x=X_train,
    train_y=Y_train,
    test_x=X_test,
    test_y=Y_test,
)
# forest = train_forest(X, Y)

Training forest...
fold: 1 log loss: 2.40793085591
fold: 2 log loss: 2.41128308951
fold: 3 log loss: 2.40685993209
fold: 3 log_loss: 2.41499484332
5:05:04.733811


In [18]:
import pickle

# Persist the fitted model to the file 'temp' so it can be reloaded later.
with open('temp', mode='wb') as model_file:
    pickle.dump(forest, model_file)

In [None]:
import pickle

# Reload the model written by the dump cell.
# NOTE: pickle.load executes arbitrary code from the file; only load a
# 'temp' file that this notebook produced.
with open('temp', mode='rb') as model_file:
    forest = pickle.load(model_file)

In [19]:
def get_test():
    """Load ``test.csv.gz`` and return its one-hot encoded feature frame.

    The file is read with ``Id`` as index, which is then moved back into a
    regular column; ``Dates`` is parsed to datetimes and the frame is passed
    through ``trata_vars``. The columns named in the module-level ``cols_x``
    are then one-hot encoded with ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        Encoded features, one row per test record.
        NOTE(review): nothing here aligns these columns with the ones
        produced for the training set -- verify they match before predicting.
    """
    raw = pd.read_csv('test.csv.gz', index_col='Id').reset_index()
    raw['Dates'] = pd.to_datetime(raw['Dates'])
    featured = trata_vars(raw, test=True)
    return pd.get_dummies(featured[cols_x])

In [20]:
# Build the encoded feature frame from test.csv.gz.
# NOTE: this rebinds X_test. Until this cell it held the hold-out slice of the
# training matrix (a NumPy array); from here on it is the DataFrame returned
# by get_test().
X_test = get_test()

In [None]:
# a = X_train.columns - X_test.columns
# for i in a:
#     X_test[i] = 0

In [21]:
# res = forest.predict(np.array(X_test))


# Predict class probabilities in slices of 100000 rows to bound peak memory.
# NOTE(review): assumes the columns of X_test line up with the columns the
# model was trained on; nothing in this cell checks it.
pred_chunks = []
for start in range(0, X_test.shape[0], 100000):
    print(start)
    pred_chunks.append(forest.predict_proba(np.array(X_test.iloc[start:start + 100000])))
    gc.collect()

# Stack the per-chunk (rows, classes) arrays in one vectorised call instead of
# copying every row through a Python list.
res = np.concatenate(pred_chunks) if pred_chunks else np.array([])


0
100000
200000
300000
400000
500000
600000
700000
800000


In [22]:
# Wrap the probabilities in a frame with one column per category label.
# NOTE(review): assumes predict_proba's column order matches the sorted
# `cats` array -- TODO confirm against forest.classes_.
# final_result is an alias of df_res, not a copy.
final_result = df_res = pd.DataFrame(data=res, columns=cats)

In [23]:
# Number of category columns in the result frame.
final_result.shape[1]

39

In [24]:
# for x in [x for x in df.Category.unique() if x not in final_result.columns]:
#     final_result[x] = 0

In [25]:
# cols = df['Category'].unique()
# cols.sort()

# res = pd.DataFrame(res)
# final_result.columns = cols
# Use the row position as the 'Id' index of the result frame. The change is
# made in place on final_result.
final_result['Id'] = final_result.index.values
final_result.set_index(keys='Id', inplace=True)

In [26]:
# Write the result as a gzip-compressed CSV whose name carries a
# minute-resolution timestamp (YYYYMMDDHHMM).
today = datetime.today()
t = format(today, '%Y%m%d%H%M')
final_result.to_csv('result_%s.csv.gz' % (t,), compression='gzip')