In [39]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
from sklearn.grid_search import GridSearchCV
from sklearn import ensemble

import datetime
import math
import pprint
# Shared pretty-printer (4-space indent); later cells call pp.pprint to dump grid-search scores.
pp = pprint.PrettyPrinter(indent=4)

In [40]:
"""
# 方針
* 週単位／曜日別で観光客が変動すると仮定する
  * e.g. 6月第1週, 6月第2週 とかでカラムを分ける
  * 曜日でカラムを分ける

"""
None

In [41]:
# Tourist overnight-stay data: the training targets plus the location,
# prefecture and category lookup tables.  The .tsv files are read as
# tab-separated; the two *_master files are read with header=None and get
# their column names assigned here.
target_train = pd.read_csv('../data/original/target/target_train.csv')
target_location = pd.read_csv(
    '../data/original/target/target_location.tsv',
    sep='\t',
)
prefecture_master = pd.read_csv(
    '../data/original/target/prefecture_master.tsv',
    sep='\t',
    header=None,
    names=['code', 'prefecture'],
)
category = pd.read_csv('../data/original/target/category.csv')
category_master = pd.read_csv(
    '../data/original/target/category_master.tsv',
    sep='\t',
    header=None,
    names=['code', 'category'],
)

In [42]:
def iso_date(s):
    """Return the ISO calendar triple for a ``YYYY-MM-DD`` date string.

    Parameters
    ----------
    s : str
        Date text in the ``%Y-%m-%d`` format.

    Returns
    -------
    The value of ``datetime.isocalendar()`` for the parsed date, i.e. a
    3-item sequence ``(ISO year, ISO week number, ISO weekday)`` where the
    weekday runs from 1 (Monday) to 7 (Sunday).

    Raises
    ------
    ValueError
        If ``s`` does not match the expected format (raised by ``strptime``).
    """
    parsed = datetime.datetime.strptime(s, '%Y-%m-%d')
    return parsed.isocalendar()

# Derive ISO calendar features for every training row.  Each date string is
# parsed once and the resulting (year, week, weekday) triple is reused for all
# three columns, rather than re-parsing the same string three times per row.
iso_parts = target_train['date'].map(iso_date)
target_train['iso_year'] = iso_parts.map(lambda parts: parts[0])     # ISO year
target_train['iso_week'] = iso_parts.map(lambda parts: parts[1])     # ISO week number
target_train['iso_weekday'] = iso_parts.map(lambda parts: parts[2])  # ISO weekday (1 = Monday)

In [43]:
## Training data
# One-hot encode the ISO year, ISO week and ISO weekday and place the three
# dummy blocks side by side, in that order.
# NOTE(review): get_dummies is called without a prefix, so the column labels
# are the raw integer values; week numbers and weekday numbers can therefore
# share labels (e.g. 1-7) -- confirm nothing selects these columns by label.
X = pd.DataFrame()
for calendar_col in ('iso_year', 'iso_week', 'iso_weekday'):
    X = pd.concat((X, pd.get_dummies(target_train[calendar_col])), axis=1)

# Number of cross-validation folds: 4 * log2(number of feature columns), rounded.
n_cv = round(4 * math.log2(X.shape[1]))

# Regression targets: the 14 "*_total" columns followed by the 14 "*_inbound" columns.
target_columns = [
    '01202_total', '04100_total', '13102_total', '14382_total',
    '14384_total', '16201_total', '17201_total', '22205_total',
    '24203_total', '26100_total', '32203_total', '34100_total',
    '42201_total', '47207_total',
    '01202_inbound', '04100_inbound', '13102_inbound', '14382_inbound',
    '14384_inbound', '16201_inbound', '17201_inbound', '22205_inbound',
    '24203_inbound', '26100_inbound', '32203_inbound', '34100_inbound',
    '42201_inbound', '47207_inbound',
]
y = target_train[target_columns]

In [44]:
# Random forest
# Grid-search the number of trees and the per-split feature budget using
# n_cv-fold cross-validation.  The estimator is given a fixed random_state so
# that the fitted forests do not depend on the unseeded global RNG and the
# printed scores can be reproduced on a re-run.
regr = GridSearchCV(ensemble.RandomForestRegressor(random_state=0), {
        'n_estimators': [10, 100, 1000],
        'max_features': ['sqrt', 'log2', 'auto']
    }, n_jobs=-1, cv=n_cv)
regr.fit(X, y)

# Report every grid point, then the winning model and its feature importances.
print('# grid_scores:')
pp.pprint(regr.grid_scores_)
print('# best_estimator: \n', regr.best_estimator_)
print('# best_score: \n', regr.best_score_)
print('# best_params: \n', regr.best_params_)
print('# feature_importances: \n', regr.best_estimator_.feature_importances_)

# grid_scores:
[   mean: -0.16678, std: 1.15455, params: {'max_features': 'sqrt', 'n_estimators': 10},
    mean: -0.14108, std: 1.20917, params: {'max_features': 'sqrt', 'n_estimators': 100},
    mean: -0.12776, std: 1.13843, params: {'max_features': 'sqrt', 'n_estimators': 1000},
    mean: -0.27061, std: 1.27345, params: {'max_features': 'log2', 'n_estimators': 10},
    mean: -0.16390, std: 1.22009, params: {'max_features': 'log2', 'n_estimators': 100},
    mean: -0.14262, std: 1.15264, params: {'max_features': 'log2', 'n_estimators': 1000},
    mean: -0.06044, std: 0.93968, params: {'max_features': 'auto', 'n_estimators': 10},
    mean: -0.07627, std: 1.01294, params: {'max_features': 'auto', 'n_estimators': 100},
    mean: -0.06661, std: 1.01200, params: {'max_features': 'auto', 'n_estimators': 1000}]
# best_estimator: 
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_sampl

In [93]:
# Prediction
# Read the submission template with explicit column names (the date followed
# by 14 "total" and 14 "inbound" slots).
submit = pd.read_csv('../data/original/sample_submit.csv', names=[
        "date",
        "total_1", "total_2", "total_3", "total_4", "total_5", "total_6", "total_7",
        "total_8", "total_9", "total_10", "total_11", "total_12", "total_13", "total_14",
        "inbound_1", "inbound_2", "inbound_3", "inbound_4", "inbound_5", "inbound_6", "inbound_7",
        "inbound_8", "inbound_9", "inbound_10", "inbound_11", "inbound_12", "inbound_13", "inbound_14"
    ])

# Same ISO calendar features as the training data; each date is parsed once.
submit_iso_parts = submit['date'].map(iso_date)
submit['iso_year'] = submit_iso_parts.map(lambda parts: parts[0])
submit['iso_week'] = submit_iso_parts.map(lambda parts: parts[1])
submit['iso_weekday'] = submit_iso_parts.map(lambda parts: parts[2])

# Build the design matrix positionally, one block per calendar feature and in
# the same order X was assembled (year, week, weekday).  Selecting X's columns
# by label is unsafe here: the week and weekday dummies share integer labels,
# and a year/week that never occurs in the training data has no column at all.
# Comparing against the sorted training levels reproduces the get_dummies
# column order; a value unseen in training simply yields an all-zero block row.
encoded_blocks = []
for key in ['iso_year', 'iso_week', 'iso_weekday']:
    train_levels = np.sort(target_train[key].unique())
    one_hot = submit[key].values[:, None] == train_levels[None, :]
    encoded_blocks.append(one_hot.astype(float))
encoded = np.hstack(encoded_blocks)
if encoded.shape[1] != len(X.columns):
    raise ValueError(
        'prediction matrix has %d columns but the training matrix X has %d'
        % (encoded.shape[1], len(X.columns))
    )
X_target = pd.DataFrame(data=encoded, columns=X.columns.values)

# Predict with the best model found by the most recent grid search and write
# "date, prediction..." rows without header or index to a timestamped file.
p = regr.best_estimator_.predict(X_target)
output = pd.concat((submit['date'], pd.DataFrame(data=p)), axis=1)
fname = datetime.datetime.now().strftime('%y%m%d_%H%M-submit') + '.csv'
output.to_csv('../data/submit/' + fname, header=False, index=False)

In [None]:
# AdaBoost
# Grid-search the number of boosting rounds and the loss function.  The
# estimator gets a fixed random_state so a re-run reproduces the same scores.
# NOTE(review): y selects 28 target columns, while AdaBoostRegressor appears
# to expect a single (1-D) target -- confirm this cell runs before relying on it.
# NOTE(review): the fold count here is derived from the constant 365, whereas
# the random-forest search uses n_cv -- confirm which one is intended.
regr = GridSearchCV(ensemble.AdaBoostRegressor(random_state=0), {
        'n_estimators': [10, 100, 1000, 10000],
        'loss': ['linear', 'square', 'exponential']
    }, n_jobs=-1, cv=round(math.log2(365) * 4))
regr.fit(X, y)

# Report every grid point, then the winning model and its feature importances.
print('# grid_scores:')
pp.pprint(regr.grid_scores_)
print('# best_estimator: \n', regr.best_estimator_)
print('# best_score: \n', regr.best_score_)
print('# best_params: \n', regr.best_params_)
print('# feature_importances: \n', regr.best_estimator_.feature_importances_)

In [None]:
# Bagging
# Grid-search the ensemble size (max_samples / max_features are pinned to
# 1.0).  The estimator gets a fixed random_state so a re-run reproduces the
# same scores.
# NOTE(review): the fold count here is derived from the constant 365, whereas
# the random-forest search uses n_cv -- confirm which one is intended.
regr = GridSearchCV(ensemble.BaggingRegressor(random_state=0), {
        'n_estimators': [10, 100, 1000, 10000],
        'max_samples': [1.0],
        'max_features': [1.0]
    }, n_jobs=-1, cv=round(math.log2(365) * 4))
regr.fit(X, y)

print('# grid_scores:')
pp.pprint(regr.grid_scores_)
print('# best_estimator: \n', regr.best_estimator_)
print('# best_score: \n', regr.best_score_)
print('# best_params: \n', regr.best_params_)

# BaggingRegressor does not expose feature_importances_ itself, so average the
# importances of its fitted base estimators.  Each base estimator was trained
# on the feature subset listed in estimators_features_, so its importances are
# scattered back to the original column positions before averaging.
best_bagging = regr.best_estimator_
bagging_importances = np.zeros(X.shape[1])
for member, feature_idx in zip(best_bagging.estimators_,
                               best_bagging.estimators_features_):
    np.add.at(bagging_importances, feature_idx, member.feature_importances_)
bagging_importances /= len(best_bagging.estimators_)
print('# feature_importances: \n', bagging_importances)

In [None]:
# ExtraTrees
# Grid-search the number of trees and the per-split feature budget.  The
# estimator gets a fixed random_state so a re-run reproduces the same scores.
# NOTE(review): the fold count here is derived from the constant 365, whereas
# the random-forest search uses n_cv -- confirm which one is intended.
regr = GridSearchCV(ensemble.ExtraTreesRegressor(random_state=0), {
        'n_estimators': [10, 100, 1000, 10000],
        'max_features': ['log2', 'sqrt', 'auto']
    }, n_jobs=-1, cv=round(math.log2(365) * 4))
regr.fit(X, y)

# Report every grid point, then the winning model and its feature importances.
print('# grid_scores:')
pp.pprint(regr.grid_scores_)
print('# best_estimator: \n', regr.best_estimator_)
print('# best_score: \n', regr.best_score_)
print('# best_params: \n', regr.best_params_)
print('# feature_importances: \n', regr.best_estimator_.feature_importances_)

In [None]:
# GradientBoosting
# Grid-search the number of boosting stages and the per-split feature budget.
# The estimator gets a fixed random_state so a re-run reproduces the same scores.
# NOTE(review): y selects 28 target columns, while GradientBoostingRegressor
# appears to expect a single (1-D) target -- confirm this cell runs before
# relying on it.
# NOTE(review): the fold count here is derived from the constant 365, whereas
# the random-forest search uses n_cv -- confirm which one is intended.
regr = GridSearchCV(ensemble.GradientBoostingRegressor(random_state=0), {
        'n_estimators': [10, 100, 1000, 10000], # A large number usually results in better performance.
        'max_features': ['log2', 'sqrt', 'auto'] # Choosing max_features < n_features leads to a reduction of variance and an increase in bias.
    }, n_jobs=-1, cv=round(math.log2(365) * 4))
regr.fit(X, y)

# Report every grid point, then the winning model and its feature importances.
print('# grid_scores:')
pp.pprint(regr.grid_scores_)
print('# best_estimator: \n', regr.best_estimator_)
print('# best_score: \n', regr.best_score_)
print('# best_params: \n', regr.best_params_)
print('# feature_importances: \n', regr.best_estimator_.feature_importances_)