In [1]:
import ruamel.yaml as yaml
import os
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

# Remember the starting directory so it can be restored if the search fails.
original_wd = os.getcwd()

# Locate the project root by looking for "config.yaml" in the current directory
# and then in successive parent directories. At most `num_retries` directories
# are tried (the starting directory plus its ancestors).
num_retries = 10
for x in range(0, num_retries):
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
        # Config found: stop searching so the file is not re-read on every
        # remaining iteration and the working directory stays at the root.
        break
    except FileNotFoundError:
        # Not here; move up one directory level and try again.
        os.chdir('../')
else:
    # Every level was tried without success: go back to where we started and
    # report the problem. Note that `cfg` is left undefined in this case.
    os.chdir(original_wd)
    print(NO_CONFIG_ERR_MSG)

# Make the resulting working directory importable.
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)


## Load and Process Data

In [2]:
# Load the modelling table. The path is relative, so it resolves against the
# current working directory (which the config-discovery loop in the first cell
# moves to the directory containing config.yaml).
df = pd.read_csv('data/processed/model_data.csv')

One-hot encode the season and build the train/test split

In [7]:
# Predictor columns taken as-is from df; the season is added separately below
# as one dummy column per distinct value.
features = [
    'WINS_score',
    'market_size',
    'superteam_flg',
]
# Column to predict (kept as a one-element list, as in the original cell).
target = [
    'team_value'
]

# Fixed seed so the train/test split -- and therefore every score reported
# further down -- is the same on each Restart & Run All.
SPLIT_SEED = 42

# One indicator column per season, named "Season_<value>".
season_on_hot = pd.get_dummies(df['Season']).add_prefix('Season_')

# Align the dummies with the remaining features on the shared row index.
X = pd.merge(season_on_hot, df[features], left_index=True, right_index=True)
# Use the declared target instead of repeating the column name; indexing with
# target[0] keeps Y a Series rather than a one-column DataFrame.
Y = df[target[0]]

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=SPLIT_SEED)

In [5]:
# Base estimator constructed with no arguments (library defaults); it is passed
# to GridSearchCV in a later cell together with the hyperparameter grid.
xgb_model = xgb.XGBRegressor()

In [12]:
# Exhaustive search over a small XGBoost hyperparameter grid.
# Rules of thumb carried over from the original notes:
#   - max_depth is usually 6, 7 or 8
#   - learning_rate is around 0.05, but small changes may make a big difference
#   - min_child_weight, subsample and colsample_bytree are the main knobs for
#     fighting overfitting
#   - n_estimators is the number of boosting rounds
#   - ensembling xgboost over multiple seeds may reduce variance
# Grid size: 3 learning rates * 4 depths * 3 subsample * 3 colsample * 3 tree
# counts = 324 candidates; every other entry has a single value.
parameters = dict(
    nthread=[4],  # when using hyperthreading, xgboost may become slower
    learning_rate=[.001, 0.05, .01],  # the so-called `eta` value
    max_depth=[2, 5, 10, 20],
    min_child_weight=[11],
    silent=[1],
    subsample=[.2, .5, 0.8],
    colsample_bytree=[.2, .5, 0.8],
    n_estimators=[5, 50, 500],  # number of trees, change it to 1000 for better results
    missing=[-999],
    seed=[42],
)

# 5-fold cross-validation per candidate, run across 6 parallel jobs.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    n_jobs=6,
    cv=5,
    verbose=2,
    refit=True,
)

In [13]:
%%time
# Run the cross-validated grid search on the training split only; the test
# split (X_test, y_test) is not touched here.
clf.fit(X_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 1158 tasks      | elapsed:   16.2s
[Parallel(n_jobs=6)]: Done 1620 out of 1620 | elapsed:   21.2s finished


Wall time: 21.6 s


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=6,
       param_grid={'nthread': [4], 'learning_rate': [0.001, 0.05, 0.01], 'max_depth': [2, 5, 10, 20], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.2, 0.5, 0.8], 'colsample_bytree': [0.2, 0.5, 0.8], 'n_estimators': [5, 50, 500], 'missing': [-999], 'seed': [42]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=2)

In [15]:
#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# test_probs = clf.predict_proba(test[features])[:,1]

# sample = pd.read_csv('../input/sample_submission.csv')
# sample.QuoteConversion_Flag = test_probs
# sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

Score: 0.2647353333157255
colsample_bytree: 0.8
learning_rate: 0.05
max_depth: 5
min_child_weight: 11
missing: -999
n_estimators: 500
nthread: 4
seed: 42
silent: 1
subsample: 0.8


In [16]:
# Display the selected hyperparameter dict as the cell output.
best_parameters

{'colsample_bytree': 0.8,
 'learning_rate': 0.05,
 'max_depth': 5,
 'min_child_weight': 11,
 'missing': -999,
 'n_estimators': 500,
 'nthread': 4,
 'seed': 42,
 'silent': 1,
 'subsample': 0.8}