In [1]:
!pip install xgboost
!pip install lightgbm
!pip install holidays



In [3]:
import pandas as pd
import numpy as np
import holidays
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [5]:
data = pd.read_csv('train.csv', index_col='id')
data.head(5)

Unnamed: 0_level_0,date,speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1/1/2017 0:00,43.00293
1,1/1/2017 1:00,46.118696
2,1/1/2017 2:00,44.294158
3,1/1/2017 3:00,41.067468
4,1/1/2017 4:00,46.448653


In [6]:
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y %H:%M')

In [7]:
def transform(data):
    X = data.copy()
    X['year'] = data['date'].dt.year
    X['month'] = data['date'].dt.month
    X['day'] = data['date'].dt.day
    X['hour'] = data['date'].dt.hour
    X['weekday'] = data['date'].dt.weekday
    X['quarter'] = data['date'].dt.quarter
    X['dayofyear'] = data['date'].dt.dayofyear
    X['weekofyear'] = data['date'].dt.isocalendar().week.astype(int)
    X['year_2017'] = X['year'] == 2017 
    X['holiday'] = data['date'].map(lambda date : date in holidays.HK())
    X['busy'] = pd.Series(data['date'].dt.hour%12).isin([8,9,18,19,20,21])
    X['am'] = data['date'].dt.hour < 12
    return X

In [8]:
fdata = transform(data)

In [9]:
fdata.iloc[0]

date          2017-01-01 00:00:00
speed                     43.0029
year                         2017
month                           1
day                             1
hour                            0
weekday                         6
quarter                         1
dayofyear                       1
weekofyear                     52
year_2017                    True
holiday                     False
busy                        False
am                           True
Name: 0, dtype: object

In [10]:
X = pd.get_dummies(fdata[['year_2017','month','day','hour','weekday','quarter','dayofyear','weekofyear','holiday','busy']])#10.48

In [11]:
X.shape

(14006, 10)

In [12]:
y = data[['speed']]

In [13]:
#function to doing Cross Validation and GridSearch
from sklearn.model_selection import GridSearchCV
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error'):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=21)
param_grid = {
   'max_depth': [6,7,8],
    'n_estimators': [200,240,250],
    'learning_rate':[0.2,0.15]
}
model = xgb.XGBRegressor(
                 seed=23)
model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, param_grid, cv=5)

print(model.best_score_)
print(model.best_params_)

mse = (mean_squared_error(y_test, pred))
print("MSE: %f" % (mse))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   20.3s finished


-11.165598299521989
{'learning_rate': 0.15, 'max_depth': 6, 'n_estimators': 240}
MSE: 10.577599


In [33]:
#Train the model with optimal hypermeter and full training data
clf_model = xgb.XGBRegressor(
                 learning_rate=0.15,
                 max_depth=7,
                 n_estimators=240,                                                               
                 seed=23)
clf_model.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=240, n_jobs=0, num_parallel_tree=1, random_state=23,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=23,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [34]:
preds = clf_model.predict(X_test)
rmse =mean_squared_error(y_test, preds)
print("MSE: %f" % (rmse))

MSE: 4.433367


In [35]:
test = pd.read_csv('test-data.csv', index_col=['id'])
test['date'] = pd.to_datetime(test['date'], format='%d/%m/%Y %H:%M')

In [36]:
test_data = transform(test)

In [37]:
test_data_X = pd.get_dummies(test_data[['year_2017','month','day','hour','weekday','quarter','dayofyear','weekofyear','holiday','busy']])

In [38]:
preds_test = clf_model.predict(test_data_X)

In [39]:
result = pd.DataFrame(preds_test,columns =['speed'])

In [40]:
result.to_csv('test.csv' ,index_label='id')