In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

In [2]:
train_set = pd.read_csv('Train_Processed.csv')
train_set = train_set.drop(['Place_ID X Date','Date', 'Place_ID','target_min','target_max',
                            'target_variance','target_count'], axis=1)

In [3]:
test_set = pd.read_csv('Test_Processed.csv')
test_set = test_set.drop(['Place_ID X Date','Date', 'Place_ID'], axis=1)

In [4]:
X = train_set.drop(['target'], axis=1)
y = train_set.target

In [5]:
model = xgb.XGBRegressor()

In [6]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ],
 "scale_pos_weight" : [ 1, 10, 25, 50, 75, 99, 100, 1000 ],
 "max_delta_step"   : [ 0, 1, 2, 3 ]
}

In [7]:
def rmse(predict, actual):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score

rmse_score = make_scorer(rmse, greater_is_better=False)

In [8]:
random_search=RandomizedSearchCV(model,
                                 param_distributions=params,
                                 n_iter=5,
                                 scoring=rmse_score,
                                 n_jobs=-1,
                                 cv=5,
                                 verbose=3)

In [9]:
random_search.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   36.1s remaining:   11.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   41.7s finished




RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                

In [10]:
random_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0.3,
             importance_type='gain', learning_rate=0.25, max_delta_step=0,
             max_depth=4, min_child_weight=7, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=50, seed=None,
             silent=None, subsample=1, verbosity=1)

In [11]:
model = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.1,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=10, min_child_weight=3, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=75, seed=None,
             silent=None, subsample=1, verbosity=1)

In [12]:
model.fit(X,y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.1,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=10, min_child_weight=3, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=75, seed=None,
             silent=None, subsample=1, verbosity=1)

In [15]:
predictions = model.predict(data=test_set)

In [16]:
submit = pd.read_csv('SampleSubmission.csv')

In [19]:
submit.target = predictions

In [20]:
submit

Unnamed: 0,Place_ID X Date,target
0,0OS9LVX X 2020-01-02,44.082516
1,0OS9LVX X 2020-01-03,32.118126
2,0OS9LVX X 2020-01-04,55.954746
3,0OS9LVX X 2020-01-05,45.228409
4,0OS9LVX X 2020-01-06,29.169155
...,...,...
16131,ZZDJZMS X 2020-03-31,77.041954
16132,ZZDJZMS X 2020-04-01,65.043694
16133,ZZDJZMS X 2020-04-02,66.862823
16134,ZZDJZMS X 2020-04-03,48.854744


In [21]:
submit.to_csv('submission_v3.csv',index=False)