In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt
# Training data, read from the path used by the original author.
# Columns: Dates, Category, Descript, DayOfWeek, PdDistrict, Resolution, Address, X, Y
# Shape noted by the original author: (878049, 9)
TRAIN_CSV_PATH = "../input/sf-crime/train.csv.zip"
train_raw = pd.read_csv(TRAIN_CSV_PATH)

# Companion files that this notebook does not load:
#   ../input/sf-crime/test.csv.zip
#   ../input/sf-crime/sampleSubmission.csv.zip

In [None]:
# Build the lookup ("master") tables that map each categorical value to an integer id.
def _build_master(frame, column, label):
    """Build an id lookup table for the distinct values of one column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data containing ``column``.
    column : str
        Name of the column whose distinct values receive an id.
    label : str
        Text printed in front of the number of distinct values.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value (the first occurrence is kept) with an
        additional ``<column>_id`` column numbered from 0.
    """
    master = frame[[column]].drop_duplicates()
    master[column + "_id"] = list(range(len(master)))
    print(label, len(master))
    return master


category_master = _build_master(train_raw, "Category", "Category数: ")
week_master = _build_master(train_raw, "DayOfWeek", "Week数: ")
pd_district_master = _build_master(train_raw, "PdDistrict", "PdDistrict数: ")
address_master = _build_master(train_raw, "Address", "Address数: ")

In [None]:
# Work on a copy so the derived columns below do not leak into train_raw
# (a plain `train = train_raw` would only alias the same DataFrame).
train = train_raw.copy()

# Helper column of ones for count-style aggregations.
train["Count"] = 1

# Parse the timestamp and derive month and hour-of-day from it.
train["Dates"] = pd.to_datetime(train["Dates"])
train["Month"] = train["Dates"].dt.month
train["Hour"] = train["Dates"].dt.hour

# Address flags.
# Each suffix flag requires a leading space and a trailing word boundary, so
# it matches the standalone token only. A bare " ST" substring would also
# match any word that merely starts with those letters (e.g. " STOCKTON").
for suffix in ("ST", "AV", "WY", "TR", "DR"):
    suffix_pattern = r" {}\b".format(suffix)
    train["Is_" + suffix] = train["Address"].str.contains(
        suffix_pattern, case=True, regex=True
    )

# These two are literal substrings, so regex matching is switched off.
train["Is_Block"] = train["Address"].str.contains(" Block", case=True, regex=False)
train["Is_crossing"] = train["Address"].str.contains(" / ", case=True, regex=False)

# 定義は適当にPanasonicのスマート家電からhttps://panasonic.jp/pss/qa/answer167.html
def func_cate(x):
    """Map an hour of the day to a time-of-day group code.

    Parameters
    ----------
    x : number
        Hour of the day.

    Returns
    -------
    int
        0 for morning (3 <= x < 11), 1 for daytime (11 <= x < 18),
        2 for every other value (night: 18:00 through 02:59).
    """
    # Morning: 03:00 up to 10:59.
    if 3 <= x < 11:
        return 0
    # Daytime: 11:00 up to 17:59.
    if 11 <= x < 18:
        return 1
    # Night: everything else, i.e. 18:00 up to 02:59 of the next day.
    return 2
# Bucket each hour into its time-of-day group code via func_cate.
train["TimeGroup"] = train["Hour"].apply(func_cate)

# Attach the integer id of every categorical column by joining its master
# table on the original value. The order below fixes the order in which the
# *_id columns are appended.
id_lookups = (
    ("Category", category_master),
    ("DayOfWeek", week_master),
    ("PdDistrict", pd_district_master),
    ("Address", address_master),
)
for key_column, master in id_lookups:
    train = train.merge(master, on=key_column)

# Display the resulting column list.
train.columns

In [None]:
# Keep only the columns used for modelling.
# Columns available on `train` at this point (as listed by the original author):
# ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Count',
#  'Month', 'Hour', 'TimeGroup', 'Is_ST', 'Is_AV', 'Is_WY', 'Is_TR', 'Is_DR', 'Is_Block', 'Is_crossing',
#  'Category_id', 'DayOfWeek_id', 'PdDistrict_id', 'Address_id']
MODEL_COLUMNS = [
    "Dates",
    'Month',
    'TimeGroup',
    'Is_ST',
    'Is_AV',
    'Is_WY',
    'Is_TR',
    'Is_DR',
    'Is_Block',
    'Is_crossing',
    'Category_id',
    'DayOfWeek_id',
    'PdDistrict_id',
]
train_rs = train[MODEL_COLUMNS]

# Note: an earlier, disabled variant grouped identical rows and summed "Count";
# it is not applied here.
train_rs.head(3)

In [None]:
import xgboost as xgb

# Split chronologically into a training part and a validation part.
# Date range noted by the original author: 2003-01-06 00:01:00 to 2015-05-13 23:53:00.
VALID_START = dt.datetime(2012, 1, 1)
train_t = train_rs[train_rs["Dates"] < VALID_START].drop("Dates", axis=1)
train_v = train_rs[train_rs["Dates"] >= VALID_START].drop("Dates", axis=1)

# Features (x) and target (y) for training (tr_*) and validation (va_*).
tr_x, tr_y = train_t.drop("Category_id", axis=1), train_t["Category_id"]
va_x, va_y = train_v.drop("Category_id", axis=1), train_v["Category_id"]

# Convert features and labels into xgboost's DMatrix structure.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)

# Derive the class count from the label ids rather than hard-coding it.
# Every label must lie in [0, num_class), including ids that occur only in
# the validation part, so the maximum is taken over the whole of train_rs.
num_class = int(train_rs["Category_id"].max()) + 1

params = {
    "objective": 'multi:softprob',
    'num_class': num_class,
    'eval_metric': 'mlogloss',
}

num_round = 50
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# Train the model. `evals` is passed by keyword because recent xgboost
# releases declare it keyword-only.
bst = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist)

In [None]:
from sklearn.metrics import accuracy_score
# Predict on the validation DMatrix, then take the index of the largest
# value in each row as the predicted class id.
pred_proba = bst.predict(dvalid)
pred_val = pred_proba.argmax(axis=1)

# Share of validation rows whose predicted class id equals the true one.
score = accuracy_score(va_y, pred_val)
print('score:{0:.4f}'.format(score))

In [None]:
# Plot the feature importance of the trained booster `bst`.
xgb.plot_importance(bst)

In [None]:
# from matplotlib.pylab import rcParams

# ##set up the parameters
# rcParams['figure.figsize'] = 80,100

# xgb.plot_tree(bst, num_trees=10)
# plt.savefig("test.png")