In [1]:
!unzip "/content/drive/My Drive/Data/fc33077e-6-dataset.zip"

Archive:  /content/drive/My Drive/Data/fc33077e-6-dataset.zip
   creating: dataset/
  inflating: dataset/test.csv        
  inflating: dataset/train.csv       


In [2]:
!pip install catboost
!pip install -U yellowbrick
!pip install lightgbm

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K     |████████████████████████████████| 64.4MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22
Collecting yellowbrick
[?25l  Downloading https://files.pythonhosted.org/packages/13/95/a14e4fdfb8b1c8753bbe74a626e910a98219ef9c87c6763585bbd30d84cf/yellowbrick-1.1-py3-none-any.whl (263kB)
[K     |████████████████████████████████| 266kB 2.6MB/s 
Installing collected packages: yellowbrick
  Found existing installation: yellowbrick 0.9.1
    Uninstalling yellowbrick-0.9.1:
      Successfully uninstalled yellowbrick-0.9.1
Successfully installed yellowbrick-1.1


In [0]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from yellowbrick.model_selection import FeatureImportances
from sklearn.metrics import mean_squared_error
from hyperopt import hp
import gc
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
%%time
train = pd.read_csv('/content/dataset/train.csv')

CPU times: user 40.4 ms, sys: 17 ms, total: 57.4 ms
Wall time: 68.7 ms


In [0]:
X = train.drop(columns = ['air_pollution_index'])
y = train['air_pollution_index']

In [6]:
X.shape

(33750, 13)

In [0]:
def prepare_df(train):
    org_keys = train.columns.tolist()
    
    train['is_holiday'] = np.where(train['is_holiday'] == 'None', False, True)
    train['date_time'] = pd.to_datetime(train['date_time'])
    
    train.set_index('date_time', inplace = True)
    
    days = ['01d', '03d', '07d', '14d', '30d']
    
    sum_attributes = ['is_holiday', 'rain_p_h', 'snow_p_h']

    for val in sum_attributes:
        for day in days:
            train['num_'+val+'_'+day] = train.rolling(day)[val].sum()
            
    weather_types = train['weather_type'].unique().tolist()
    for weather in weather_types:
        train[weather] = np.where(train['weather_type'] == weather, 1, 0)
    
    train.drop(columns = 'weather_type', inplace = True)
    
    avg_attributes = ['humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature', 
                     'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume'] + weather_types

    for val in avg_attributes:
        for day in days:
            train['avg_'+val+'_'+day] = train.rolling(day)[val].mean()
            
    for column in train.columns:
        if 'avg' in column:
            train[column+'_diff'] = train[column[4:-4]] - train[column]
            
    return train.reset_index()

In [8]:
%%time
X = prepare_df(X.copy())

CPU times: user 605 ms, sys: 50.2 ms, total: 655 ms
Wall time: 675 ms


In [9]:
X.shape

(33750, 248)

In [0]:
X['date_time'] = X['date_time'].dt.hour

In [11]:
%%time
selector = VarianceThreshold(0.1)
selector.fit(X)

CPU times: user 407 ms, sys: 78.8 ms, total: 486 ms
Wall time: 490 ms


In [0]:
sel = X.columns[selector.get_support(indices=True)]

In [0]:
X = X[sel]

In [14]:
%%time
fsel = SelectFromModel(CatBoostRegressor(task_type="GPU", verbose=0), max_features=100)
fsel.fit(X, y)

CPU times: user 1min 12s, sys: 8.67 s, total: 1min 20s
Wall time: 1min 15s


In [0]:
sel = X.columns[fsel.get_support(indices=True)]

In [16]:
len(sel)

61

In [0]:
X = X[sel]

In [0]:
rfe = RFE(lgb.LGBMRegressor(n_jobs = -1), step = 2, verbose = 2)

In [19]:
%%time
rfe.fit(X, y)

Fitting estimator with 61 features.
Fitting estimator with 59 features.
Fitting estimator with 57 features.
Fitting estimator with 55 features.
Fitting estimator with 53 features.
Fitting estimator with 51 features.
Fitting estimator with 49 features.
Fitting estimator with 47 features.
Fitting estimator with 45 features.
Fitting estimator with 43 features.
Fitting estimator with 41 features.
Fitting estimator with 39 features.
Fitting estimator with 37 features.
Fitting estimator with 35 features.
Fitting estimator with 33 features.
Fitting estimator with 31 features.
CPU times: user 38.6 s, sys: 397 ms, total: 39 s
Wall time: 20.2 s


RFE(estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                            colsample_bytree=1.0, importance_type='split',
                            learning_rate=0.1, max_depth=-1,
                            min_child_samples=20, min_child_weight=0.001,
                            min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                            num_leaves=31, objective=None, random_state=None,
                            reg_alpha=0.0, reg_lambda=0.0, silent=True,
                            subsample=1.0, subsample_for_bin=200000,
                            subsample_freq=0),
    n_features_to_select=None, step=2, verbose=2)

In [0]:
sel = X.columns[rfe.get_support(indices=True)]

In [0]:
X = X[sel]

In [0]:
X_train, X_test = X[:27000], X[27000:]
y_train, y_test = y[:27000], y[27000:]

In [0]:
def score(params):
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(params, 
                              dtrain, 
                              num_round,
                              evals=watchlist,
                              verbose_eval=False)
    predictions = gbm_model.predict(dvalid, ntree_limit=gbm_model.best_iteration)
    loss = mean_absolute_error(y_test, np.array(predictions))
    return {'loss': loss, 'status': STATUS_OK}

In [0]:
def optimize(evals, trials, optimizer=tpe.suggest, random_state=0):
    space = {
        'n_estimators': hp.quniform('n_estimators', 200, 600, 1),
        'eta': hp.quniform('eta', 0.025, 0.25, 0.025), # A problem with max_depth casted to float instead of int with the hp.quniform method.
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
        'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
        'alpha' :  hp.quniform('alpha', 0, 10, 1),
        'lambda': hp.quniform('lambda', 1, 2, 0.1),
        #'nthread': cores,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'seed': random_state
    }
    best = fmin(score, space, algo=tpe.suggest, max_evals=evals, trials = trials)
    return best

In [43]:
%%time
trials = Trials()
n= 1000
best_param = optimize(evals = n,
                      optimizer=tpe.suggest,
                      trials = trials)

100%|██████████| 1000/1000 [53:53<00:00,  3.23s/it, best loss: 72.47344302707248]
CPU times: user 46min 10s, sys: 7min 43s, total: 53min 54s
Wall time: 53min 53s


In [0]:
best_param['objective'] = 'reg:squarederror'
best_param['tree_method'] = 'gpu_hist'
best_param['n_estimators'] = int(best_param['n_estimators'])

In [52]:
best_param

{'alpha': 2.0,
 'colsample_bytree': 0.8,
 'eta': 0.025,
 'gamma': 0.75,
 'lambda': 1.4000000000000001,
 'max_depth': 8,
 'min_child_weight': 9.0,
 'n_estimators': 240,
 'objective': 'reg:squarederror',
 'subsample': 0.8500000000000001,
 'tree_method': 'gpu_hist'}

In [0]:
estimator = xgb.XGBRegressor(**best_param)

In [54]:
%%time
estimator.fit(X_train, y_train)

CPU times: user 1.74 s, sys: 314 ms, total: 2.06 s
Wall time: 2.06 s


XGBRegressor(alpha=2.0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.025, gamma=0.75,
             importance_type='gain', lambda=1.4000000000000001,
             learning_rate=0.1, max_delta_step=0, max_depth=8,
             min_child_weight=9.0, missing=None, n_estimators=240, n_jobs=1,
             nthread=None, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.8500000000000001, tree_method='gpu_hist',
             verbosity=1)

In [55]:
estimator.score(X_test, y_test)

-0.04937170493651389

In [0]:
test = pd.read_csv('/content/dataset/test.csv')

In [57]:
%%time
X_sub = prepare_df(test.copy())

CPU times: user 350 ms, sys: 22.3 ms, total: 372 ms
Wall time: 372 ms


In [0]:
X_sub['date_time'] = X_sub['date_time'].dt.hour

In [0]:
X_sub = X_sub[sel]

In [0]:
test['air_pollution_index'] = estimator.predict(X_sub)

In [0]:
test = test.reset_index()

In [0]:
test.to_csv('sub2.csv', columns = ['date_time', 'air_pollution_index'], index = False)