In [1]:
# Mount Google Drive inside the Colab VM so the dataset archive is reachable
# under /content/drive (the call prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!unzip "/content/drive/My Drive/Data/fc33077e-6-dataset.zip"

Archive:  /content/drive/My Drive/Data/fc33077e-6-dataset.zip
   creating: dataset/
  inflating: dataset/test.csv        
  inflating: dataset/train.csv       


In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from hyperopt import hp
import gc
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
# Disable pandas' display truncation: show every column and every row.
# NOTE(review): with max_rows=None, displaying a large frame prints all of it.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
%%time
# Load the training split extracted from the archive.
train = pd.read_csv('/content/dataset/train.csv')

CPU times: user 45.1 ms, sys: 15.9 ms, total: 61 ms
Wall time: 71.4 ms


In [0]:
# Separate the regression target from the predictor columns.
y = train['air_pollution_index']
X = train.drop(columns='air_pollution_index')

In [6]:
X.shape

(33750, 13)

In [0]:
def prepare_df(train, weather_types=None):
    """Build rolling-window features from a raw observations frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw observations. Must contain ``date_time``, ``is_holiday``,
        ``weather_type`` and the numeric columns named in ``avg_attributes``
        below. Rows are expected to be ordered by ``date_time`` already, since
        pandas' time-based rolling windows need a monotonic index. The frame
        passed in is left untouched.
    weather_types : iterable of str, optional
        Categories to one-hot encode. When omitted, the distinct non-null
        values of ``train['weather_type']`` are used. Pass the categories of
        the training data when preparing another frame so that both frames
        end up with the same columns.

    Returns
    -------
    pandas.DataFrame
        ``date_time`` plus the original columns (``is_holiday`` as bool,
        ``weather_type`` replaced by 0/1 indicator columns) and, for the
        windows 1/3/7/14/30 days:

        * ``num_<col>_<win>``       rolling sums of holiday/rain/snow,
        * ``avg_<col>_<win>``       rolling means,
        * ``avg_<col>_<win>_diff``  current value minus that rolling mean.
    """
    # Work on a copy so the caller's frame is never modified.
    frame = train.copy()

    # Any named value is a holiday. Missing values count as "no holiday":
    # newer pandas versions may parse the literal string 'None' as NaN when
    # reading the CSV, and a plain `== 'None'` test would then mark every
    # such row as a holiday.
    holiday = frame['is_holiday']
    frame['is_holiday'] = (holiday.notna() & (holiday != 'None')).astype(bool)

    frame['date_time'] = pd.to_datetime(frame['date_time'])
    frame = frame.set_index('date_time')

    days = ['01d', '03d', '07d', '14d', '30d']

    # Rolling totals over each time window.
    sum_attributes = ['is_holiday', 'rain_p_h', 'snow_p_h']
    for val in sum_attributes:
        for day in days:
            frame['num_' + val + '_' + day] = frame[val].rolling(day).sum()

    # One indicator column per weather category.
    if weather_types is None:
        weather_types = frame['weather_type'].dropna().unique().tolist()
    else:
        weather_types = list(weather_types)
    for weather in weather_types:
        frame[weather] = np.where(frame['weather_type'] == weather, 1, 0)
    frame = frame.drop(columns='weather_type')

    avg_attributes = ['humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
                      'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume'] + weather_types

    # Rolling means; remember which source column each one came from so the
    # difference columns do not depend on slicing the generated name.
    avg_source = {}
    for val in avg_attributes:
        for day in days:
            avg_name = 'avg_' + val + '_' + day
            frame[avg_name] = frame[val].rolling(day).mean()
            avg_source[avg_name] = val

    # Deviation of the current value from each rolling mean, appended after
    # all the averages (same column order as before).
    for avg_name, val in avg_source.items():
        frame[avg_name + '_diff'] = frame[val] - frame[avg_name]

    return frame.reset_index()

In [8]:
%%time
# Build the engineered feature matrix from a copy of the raw predictors.
X = prepare_df(X.copy())

CPU times: user 615 ms, sys: 61.4 ms, total: 676 ms
Wall time: 693 ms


In [9]:
X.shape

(33750, 248)

In [0]:
# Replace the timestamp with its hour of day. The dtype guard keeps the cell
# safe to re-run: once converted, the column holds integers and accessing
# `.dt` on it would raise.
if pd.api.types.is_datetime64_any_dtype(X['date_time']):
    X['date_time'] = X['date_time'].dt.hour

In [11]:
%%time
# Stage 1 of feature selection: fit a variance filter (threshold 0.1).
# NOTE(review): fitted on every row of X, including the rows a later cell
# holds out for validation.
selector = VarianceThreshold(0.1)
selector.fit(X)

CPU times: user 424 ms, sys: 69.9 ms, total: 494 ms
Wall time: 498 ms


In [0]:
# Names of the columns that passed the variance filter.
sel = X.columns[selector.get_support(indices=True)]

In [0]:
# Keep only the columns retained by the variance filter.
X = X[sel]

In [14]:
%%time
# Stage 2: model-based selection using an XGBoost random-forest regressor,
# capped at 100 features (the recorded run kept 59).
# NOTE(review): uses the target for all rows, including the later hold-out rows.
fsel = SelectFromModel(xgb.XGBRFRegressor(n_jobs=-1), max_features=100)
fsel.fit(X, y)

CPU times: user 20.4 s, sys: 126 ms, total: 20.5 s
Wall time: 11.3 s


In [0]:
# Names of the columns chosen by the model-based selector.
sel = X.columns[fsel.get_support(indices=True)]

In [16]:
len(sel)

59

In [0]:
# Keep only the columns chosen by the model-based selector.
X = X[sel]

In [0]:
# Stage 3: recursive feature elimination, dropping one feature per round
# (step=1) and logging each round (verbose=2). n_features_to_select is left
# at its default; per scikit-learn's documentation that keeps half of the
# features. Requires a GPU because of tree_method='gpu_hist'.
rfe = RFE(xgb.XGBRFRegressor(tree_method = 'gpu_hist'), step = 1, verbose = 2)

In [19]:
%%time
# Run the elimination rounds on the current feature matrix.
# NOTE(review): fitted on all rows, including the later hold-out rows.
rfe.fit(X, y)

Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 fe

RFE(estimator=XGBRFRegressor(base_score=0.5, colsample_bylevel=1,
                             colsample_bynode=0.8, colsample_bytree=1, gamma=0,
                             learning_rate=1, max_delta_step=0, max_depth=3,
                             min_child_weight=1, missing=None, n_estimators=100,
                             n_jobs=1, nthread=None, objective='reg:linear',
                             random_state=0, reg_alpha=0, reg_lambda=1,
                             scale_pos_weight=1, seed=None, silent=None,
                             subsample=0.8, tree_method='gpu_hist',
                             verbosity=1),
    n_features_to_select=None, step=1, verbose=2)

In [0]:
# Names of the columns that survived recursive feature elimination; `sel` is
# reused later to select the same columns from the test features.
sel = X.columns[rfe.get_support(indices=True)]

In [0]:
# Final feature matrix: only the columns kept by recursive feature elimination.
X = X[sel]

In [0]:
# Positional hold-out split: the first 27000 rows are used for fitting and
# the remaining rows for validation.
X_train, y_train = X.iloc[:27000], y.iloc[:27000]
X_test, y_test = X.iloc[27000:], y.iloc[27000:]

In [0]:
def score(params):
    """Hyperopt objective: hold-out mean absolute error for one parameter set.

    Fits an ``xgb.XGBRFRegressor`` built from ``params`` on the notebook-level
    ``X_train``/``y_train`` and evaluates it on ``X_test``/``y_test``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRFRegressor``.

    Returns
    -------
    dict
        ``{'loss': <mean absolute error>, 'status': STATUS_OK}``.
    """
    model = xgb.XGBRFRegressor(**params)
    model.fit(X_train, y_train)
    holdout_mae = mean_absolute_error(y_test, np.array(model.predict(X_test)))
    # Release the fitted model before the next trial starts.
    del model
    gc.collect()
    return {'loss': holdout_mae, 'status': STATUS_OK}

In [0]:
def optimize(evals, trials, optimizer=tpe.suggest, random_state=0):
    """Search the XGBoost hyper-parameter space and return the best setting.

    Parameters
    ----------
    evals : int
        Maximum number of evaluations (``max_evals`` of ``fmin``).
    trials : hyperopt.Trials
        Object that records every evaluated trial.
    optimizer : callable, optional
        Hyperopt search algorithm passed to ``fmin`` as ``algo``.
    random_state : int, optional
        Stored as the ``seed`` entry of every candidate parameter set.

    Returns
    -------
    dict
        The best parameter set with its actual values, including the fixed
        entries (``objective``, ``tree_method``, ``seed``). ``fmin`` itself
        reports ``hp.choice`` dimensions as indices into the option list, so
        its result is resolved against the space before being returned.
    """
    # Local import so this cell works without editing the notebook's import cell.
    from hyperopt import space_eval

    space = {
        'n_estimators': hp.choice('n_estimators', np.arange(200, 600, dtype=int)),
        # NOTE(review): the objective fits XGBRFRegressor; XGBoost's
        # random-forest documentation asks for a learning rate of 1 in that
        # mode -- confirm that tuning eta below 1 is intended.
        'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
        # hp.choice rather than hp.quniform: quniform would yield floats and
        # max_depth must be an integer.
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
        'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
        'alpha' :  hp.quniform('alpha', 0, 10, 1),
        'lambda': hp.quniform('lambda', 1, 2, 0.1),
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'seed': random_state
    }
    # Honour the caller-supplied search algorithm.
    best = fmin(score, space, algo=optimizer, max_evals=evals, trials=trials)
    # Translate hp.choice indices back into the real parameter values.
    return space_eval(space, best)

In [0]:
%%time
# Run the hyper-parameter search. With 1000 evaluations the recorded progress
# bar estimates roughly an hour and a half for the full run.
n = 1000
trials = Trials()
best_param = optimize(evals=n, trials=trials, optimizer=tpe.suggest)

  1%|          | 6/1000 [00:31<1:26:19,  5.21s/it, best loss: 72.55631930202908]

In [0]:
# Make sure the final model uses the same objective and tree method as the
# candidates evaluated during the search.
best_param['objective'] = 'reg:squarederror'
best_param['tree_method'] = 'gpu_hist'

In [0]:
best_param

In [0]:
# Final model configured with the best parameters found by the search.
estimator = xgb.XGBRFRegressor(**best_param)

In [0]:
%%time
# Fit the final model on the first 27000 rows only (the hold-out rows are excluded).
estimator.fit(X_train, y_train)

In [0]:
# Hold-out score of the final model. For scikit-learn style regressors
# `.score` reports R^2, whereas the search above minimised mean absolute error.
estimator.score(X_test, y_test)

In [0]:
# Load the test split that predictions will be generated for.
test = pd.read_csv('/content/dataset/test.csv')

In [0]:
%%time
# Apply the same feature engineering to a copy of the test data.
# NOTE(review): rolling windows and weather categories are computed from the
# test rows alone, so the generated columns may differ from the training ones.
X_sub = prepare_df(test.copy())

In [0]:
# Replace the timestamp with its hour of day, as was done for the training
# features. The dtype guard keeps the cell safe to re-run: once converted,
# the column holds integers and accessing `.dt` on it would raise.
if pd.api.types.is_datetime64_any_dtype(X_sub['date_time']):
    X_sub['date_time'] = X_sub['date_time'].dt.hour

In [0]:
# Align the test features with the columns selected on the training data, in
# the same order. A selected column that is absent here (for example an
# indicator for a weather type that never occurs in the test rows) is created
# and filled with 0 instead of raising KeyError.
X_sub = X_sub.reindex(columns=sel, fill_value=0)

In [0]:
# Attach the model's predictions to the raw test frame (rows are in the same
# order as X_sub, which was derived from a copy of `test`).
test['air_pollution_index'] = estimator.predict(X_sub)

In [0]:
# Move the current index into a regular column and renumber the rows.
# NOTE(review): the export cell writes only 'date_time' and
# 'air_pollution_index', so this step does not affect the saved file.
test = test.reset_index()

In [0]:
# Write the submission file: timestamp and predicted index only, without the row index.
test.to_csv('sub3.csv', columns = ['date_time', 'air_pollution_index'], index = False)