In [0]:
#!unzip "/content/drive/My Drive/Data/fc33077e-6-dataset.zip"

In [0]:
#!pip install catboost
#!pip install -U yellowbrick
#!pip install lightgbm

In [3]:
import catboost as ctb
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from hyperopt import hp
from hyperopt import fmin, tpe, space_eval, STATUS_OK, STATUS_FAIL, Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from yellowbrick.model_selection import FeatureImportances
# Lift pandas' display limits so wide/long frames are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None



In [4]:
%%time
# Load the training set from the dataset directory.
train = pd.read_csv('/content/dataset/train.csv')

CPU times: user 40.8 ms, sys: 14.4 ms, total: 55.2 ms
Wall time: 55.5 ms


In [0]:
# Separate the regression target from the predictor columns.
y = train['air_pollution_index']
X = train.drop(columns='air_pollution_index')

In [6]:
# (rows, columns) of the raw predictor frame before feature engineering.
X.shape

(33750, 13)

In [0]:
def prepare_df(train, weather_types=None):
    """Add rolling-window, one-hot and deviation features to a raw data frame.

    The frame is modified in place while the features are built (callers in
    this notebook pass a copy); the returned frame has 'date_time' restored as
    an ordinary column.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'date_time', 'is_holiday', 'weather_type' and the numeric
        columns listed in ``avg_attributes`` below. The time-offset rolling
        windows are computed on 'date_time' used as the index.
    weather_types : iterable of str, optional
        Weather categories to one-hot encode. When omitted, the categories
        present in ``train`` are used (the original behaviour). Pass the
        categories seen in the training data when preparing another frame so
        that both frames end up with the same columns.

    Returns
    -------
    pandas.DataFrame
        The input columns (minus 'weather_type') plus:
        ``num_<attr>_<window>`` rolling sums, one 0/1 column per weather type,
        ``avg_<attr>_<window>`` rolling means and ``avg_<attr>_<window>_diff``
        (current value minus its rolling mean).
    """
    # A row is a holiday unless the field is the literal 'None' *or* missing:
    # pandas >= 2.0 parses the literal 'None' as NaN in read_csv, and a plain
    # "== 'None'" test would then flag every row as a holiday.
    holiday = train['is_holiday']
    train['is_holiday'] = holiday.notna() & ~holiday.isin(['None'])
    train['date_time'] = pd.to_datetime(train['date_time'])

    # Offset-based windows ('01d', ...) are evaluated against the index.
    train.set_index('date_time', inplace = True)

    days = ['01d', '03d', '07d', '14d', '30d']

    sum_attributes = ['is_holiday', 'rain_p_h', 'snow_p_h']

    for val in sum_attributes:
        for day in days:
            train['num_'+val+'_'+day] = train.rolling(day)[val].sum()

    if weather_types is None:
        weather_types = train['weather_type'].unique().tolist()
    else:
        weather_types = list(weather_types)
    for weather in weather_types:
        train[weather] = np.where(train['weather_type'] == weather, 1, 0)

    train.drop(columns = 'weather_type', inplace = True)

    avg_attributes = ['humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
                     'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume'] + weather_types

    for val in avg_attributes:
        for day in days:
            train['avg_'+val+'_'+day] = train.rolling(day)[val].mean()

    # Second pass so that every *_diff column follows all avg_* columns; the
    # source column is named explicitly instead of being recovered by slicing
    # the average's column name.
    for val in avg_attributes:
        for day in days:
            avg_column = 'avg_'+val+'_'+day
            train[avg_column+'_diff'] = train[val] - train[avg_column]

    return train.reset_index()

In [8]:
%%time
# Replace the raw predictors with the engineered feature frame; a copy is
# passed because prepare_df modifies its argument in place.
X = prepare_df(X.copy())

CPU times: user 585 ms, sys: 60.7 ms, total: 645 ms
Wall time: 645 ms


In [9]:
# (rows, columns) after feature engineering.
X.shape

(33750, 248)

In [0]:
# Keep only the hour of day from the timestamp so the column is numeric.
X = X.assign(date_time=X['date_time'].dt.hour)

In [11]:
%%time
# First filter: flag columns whose variance does not exceed 0.1.
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)

CPU times: user 393 ms, sys: 84.7 ms, total: 478 ms
Wall time: 479 ms


In [0]:
# Names of the columns that passed the variance filter (boolean-mask lookup).
sel = X.columns[selector.get_support()]

In [0]:
# Narrow the feature frame to the columns kept by the variance filter.
X = X.loc[:, sel]

In [14]:
%%time
# Second filter: rank features with a GPU CatBoost model, keeping at most 100.
importance_model = CatBoostRegressor(task_type="GPU", verbose=0)
fsel = SelectFromModel(importance_model, max_features=100)
fsel.fit(X, y)

CPU times: user 19.1 s, sys: 7.68 s, total: 26.8 s
Wall time: 22.6 s


In [0]:
# Names of the columns retained by the model-based selector.
sel = X.columns[fsel.get_support()]

In [16]:
# How many features survived the model-based selection.
len(sel)

61

In [0]:
# Narrow the feature frame to the columns kept by the model-based selector.
X = X.loc[:, sel]

In [0]:
# Third filter: recursive feature elimination driven by a random forest,
# discarding 5 features per round. The forest is seeded so that the selected
# feature set is the same on every run of the notebook.
rfe = RFE(RandomForestRegressor(n_jobs = -1, random_state = 42), step = 5, verbose = 2)

In [0]:
%%time
# Run the elimination; one progress line is printed per round (verbose = 2).
rfe.fit(X, y)

Fitting estimator with 61 features.
Fitting estimator with 56 features.
Fitting estimator with 51 features.
Fitting estimator with 46 features.


In [0]:
# Names of the columns that survived recursive feature elimination.
sel = X.columns[rfe.get_support()]

In [0]:
# Final feature set used for modelling.
X = X.loc[:, sel]

In [0]:
# Positional hold-out split: the first 27000 rows train the models, the
# remaining rows are used for evaluation.
split_at = 27000
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

In [0]:
# XGB parameters
# Constructor arguments: hp.* entries are searched by hyperopt, plain values are fixed.
xgb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    colsample_bytree=hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    subsample=hp.uniform('subsample', 0.8, 1),
    n_estimators=100,
    objective='reg:squarederror',
    tree_method="gpu_hist",
)
# Extra keyword arguments passed to fit() for every trial.
xgb_fit_params = dict(eval_metric='rmse', early_stopping_rounds=10, verbose=False)
# Bundle consumed by HPOpt: model arguments, fit arguments and the loss
# (root mean squared error) that hyperopt minimises.
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted)),
}


# LightGBM parameters
# Constructor arguments: hp.* entries are searched by hyperopt, plain values are fixed.
lgb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    colsample_bytree=hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    subsample=hp.uniform('subsample', 0.8, 1),
    n_estimators=100,
)
# Extra keyword arguments passed to fit() for every trial.
lgb_fit_params = dict(eval_metric='l2', early_stopping_rounds=10, verbose=False)
# Bundle consumed by HPOpt: model arguments, fit arguments and the loss
# (root mean squared error) that hyperopt minimises.
lgb_para = {
    'reg_params': lgb_reg_params,
    'fit_params': lgb_fit_params,
    'loss_func': lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted)),
}


# CatBoost parameters
# Constructor arguments: hp.* entries are searched by hyperopt, plain values are fixed.
ctb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(5, 16, 1, dtype=int)),
    l2_leaf_reg=hp.choice('l2_leaf_reg', np.arange(3, 300, 25)),
    n_estimators=100,
    task_type="GPU",
)
# Extra keyword arguments passed to fit() for every trial.
ctb_fit_params = dict(early_stopping_rounds=10, verbose=False)
# Bundle consumed by HPOpt: model arguments, fit arguments and the loss
# (root mean squared error) that hyperopt minimises.
ctb_para = {
    'reg_params': ctb_reg_params,
    'fit_params': ctb_fit_params,
    'loss_func': lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted)),
}

In [0]:
class HPOpt(object):
    """Run hyperopt searches for XGBoost, LightGBM and CatBoost regressors.

    The instance stores one train / evaluation split; each ``*_reg`` method is
    an objective function that builds a model from ``para['reg_params']``,
    fits it and reports the loss on the evaluation split.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Remember the data used by every trial."""
        self.x_train, self.x_test = x_train, x_test
        self.y_train, self.y_test = y_train, y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Minimise the objective method called ``fn_name`` over ``space``.

        Returns ``(best, trials)`` when the search completes. If ``fmin``
        raises, the exception is not propagated: a dict holding STATUS_FAIL and
        the exception text is returned instead, so callers must check the type
        of the result before indexing it.
        """
        # Looked up outside the try block: an unknown name raises AttributeError.
        objective = getattr(self, fn_name)
        try:
            best = fmin(fn=objective, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as err:
            return {'status': STATUS_FAIL,
                    'exception': str(err)}
        else:
            return best, trials

    def xgb_reg(self, para):
        """Objective for one XGBoost trial."""
        return self.train_reg(xgb.XGBRegressor(**para['reg_params']), para)

    def lgb_reg(self, para):
        """Objective for one LightGBM trial."""
        return self.train_reg(lgb.LGBMRegressor(**para['reg_params']), para)

    def ctb_reg(self, para):
        """Objective for one CatBoost trial."""
        return self.train_reg(ctb.CatBoostRegressor(**para['reg_params']), para)

    def train_reg(self, reg, para):
        """Fit ``reg`` and return hyperopt's result dict for the trial.

        The evaluation split is passed to ``fit`` as ``eval_set`` and is also
        the data on which ``para['loss_func']`` is computed.
        """
        holdout = (self.x_test, self.y_test)
        reg.fit(self.x_train, self.y_train, eval_set=[holdout], **para['fit_params'])
        predictions = reg.predict(self.x_test)
        trial_loss = para['loss_func'](self.y_test, predictions)
        return {'loss': trial_loss, 'status': STATUS_OK}

In [0]:
# One optimiser instance shares the train / hold-out split across all searches.
obj = HPOpt(X_train, X_test, y_train, y_test)

# Same search settings for every library: TPE suggestions, 1000 evaluations.
search_settings = dict(algo=tpe.suggest, max_evals=1000)

xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), **search_settings)
lgb_opt = obj.process(fn_name='lgb_reg', space=lgb_para, trials=Trials(), **search_settings)
ctb_opt = obj.process(fn_name='ctb_reg', space=ctb_para, trials=Trials(), **search_settings)

In [0]:
def _resolve_best_params(space, search_result):
    """Turn the outcome of HPOpt.process() into concrete model parameters.

    ``fmin`` reports every ``hp.choice`` parameter as the *index* of the chosen
    option, not the option itself, so copying its result straight into the
    parameter dict would train with e.g. ``learning_rate = 3``. ``space_eval``
    maps the indices back to the actual values and keeps the fixed entries of
    ``space`` unchanged.

    Raises RuntimeError when the search failed (HPOpt.process returns a status
    dict instead of a ``(best, trials)`` tuple in that case).
    """
    if isinstance(search_result, dict):
        raise RuntimeError('hyper-parameter search failed: ' + str(search_result.get('exception')))
    best = search_result[0]
    return space_eval(space, best)


xgb_reg_params = _resolve_best_params(xgb_reg_params, xgb_opt)
lgb_reg_params = _resolve_best_params(lgb_reg_params, lgb_opt)
ctb_reg_params = _resolve_best_params(ctb_reg_params, ctb_opt)

In [0]:
%%time
# Refit XGBoost with the tuned parameters (xgb_opt is rebound from the search
# result to the fitted model) and keep the magnitude of its hold-out score.
xgb_opt = xgb.XGBRegressor(**xgb_reg_params)
xgb_opt.fit(X_train, y_train)
xgb_holdout = xgb_opt.score(X_test, y_test)
xgb_score = np.fabs(xgb_holdout)

In [0]:
%%time
# Refit LightGBM with the tuned parameters (lgb_opt is rebound from the search
# result to the fitted model) and keep the magnitude of its hold-out score.
lgb_opt = lgb.LGBMRegressor(**lgb_reg_params)
lgb_opt.fit(X_train, y_train)
lgb_holdout = lgb_opt.score(X_test, y_test)
lgb_score = np.fabs(lgb_holdout)

In [0]:
%%time
# Refit CatBoost with the tuned parameters (ctb_opt is rebound from the search
# result to the fitted model) and keep the magnitude of its hold-out score.
ctb_opt = ctb.CatBoostRegressor(**ctb_reg_params)
ctb_opt.fit(X_train, y_train)
ctb_holdout = ctb_opt.score(X_test, y_test)
ctb_score = np.fabs(ctb_holdout)

In [0]:
# Normalising constant for the ensemble weights: total of the three scores.
score = sum((xgb_score, lgb_score, ctb_score))

In [0]:
# Load the submission (test) set from the dataset directory.
test = pd.read_csv('/content/dataset/test.csv')

In [0]:
%%time
# Build the engineered features for the submission set; a copy is passed
# because prepare_df modifies its argument in place.
# NOTE(review): by default prepare_df derives the one-hot weather columns from
# the frame it is given, so the columns chosen on the training data (sel) exist
# here only if the test set contains the same weather types — confirm.
X_sub = prepare_df(test.copy())

In [0]:
# Same transformation as for the training features: timestamp -> hour of day.
X_sub = X_sub.assign(date_time=X_sub['date_time'].dt.hour)

In [0]:
# Keep exactly the features the models were trained on, in the same order.
X_sub = X_sub.loc[:, sel]

In [0]:
# Store each model's prediction in its own column of the submission frame.
models_by_column = {
    'air_pollution_index_xgb': xgb_opt,
    'air_pollution_index_lgb': lgb_opt,
    'air_pollution_index_ctb': ctb_opt,
}
for prediction_column, fitted_model in models_by_column.items():
    test[prediction_column] = fitted_model.predict(X_sub)

In [0]:
# reset_index() moves the current index into a new 'index' column and
# renumbers the rows; that extra column is not exported because the final
# to_csv call selects its columns explicitly.
test = test.reset_index()

In [0]:
# Blend the three models' predictions, weighting each one by its hold-out
# score divided by the sum of the three scores. The whole expression sits
# inside one pair of parentheses so the continuation lines belong to the same
# statement, and each term reads that model's own prediction column.
test['air_pollution_index'] = (
    test['air_pollution_index_xgb'] * xgb_score / score
    + test['air_pollution_index_lgb'] * lgb_score / score
    + test['air_pollution_index_ctb'] * ctb_score / score
)

In [0]:
# Write the submission file: only the timestamp and the blended prediction
# are exported, without the row index.
test.to_csv('/content/drive/My Drive/sub2.csv', columns = ['date_time', 'air_pollution_index'], index = False)