In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [0]:
#!unzip "/content/drive/My Drive/Data/fc33077e-6-dataset.zip"

In [0]:
#!pip install catboost
#!pip install -U yellowbrick

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from yellowbrick.model_selection import FeatureImportances
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

  import pandas.util.testing as tm


In [5]:
%%time
train = pd.read_csv('/content/dataset/train.csv')

CPU times: user 44.6 ms, sys: 12.1 ms, total: 56.7 ms
Wall time: 58.1 ms


In [0]:
X = train.drop(columns = ['air_pollution_index'])
y = train['air_pollution_index']

In [7]:
X.shape

(33750, 13)

In [0]:
def prepare_df(train):
    org_keys = train.columns.tolist()
    
    train['is_holiday'] = np.where(train['is_holiday'] == 'None', False, True)
    train['date_time'] = pd.to_datetime(train['date_time'])
    
    train.set_index('date_time', inplace = True)
    
    days = ['01d', '03d', '07d', '14d', '30d']
    
    sum_attributes = ['is_holiday', 'rain_p_h', 'snow_p_h']

    for val in sum_attributes:
        for day in days:
            train['num_'+val+'_'+day] = train.rolling(day)[val].sum()
            
    weather_types = train['weather_type'].unique().tolist()
    for weather in weather_types:
        train[weather] = np.where(train['weather_type'] == weather, 1, 0)
    
    train.drop(columns = 'weather_type', inplace = True)
    
    avg_attributes = ['humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature', 
                     'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume'] + weather_types

    for val in avg_attributes:
        for day in days:
            train['avg_'+val+'_'+day] = train.rolling(day)[val].mean()
            
    for column in train.columns:
        if 'avg' in column:
            train[column+'_diff'] = train[column[4:-4]] - train[column]
            
    return train.reset_index()

In [9]:
%%time
X = prepare_df(X.copy())

CPU times: user 593 ms, sys: 69 ms, total: 662 ms
Wall time: 667 ms


In [10]:
X.shape

(33750, 248)

In [0]:
X['date_time'] = X['date_time'].dt.hour

In [12]:
%%time
selector = VarianceThreshold(0.1)
selector.fit(X)

CPU times: user 416 ms, sys: 77 ms, total: 493 ms
Wall time: 496 ms


In [0]:
sel = X.columns[selector.get_support(indices=True)]

In [0]:
X = X[sel]

In [15]:
%%time
fsel = SelectFromModel(CatBoostRegressor(task_type="GPU"), max_features=100)
fsel.fit(X, y)

Learning rate set to 0.107721
0:	learn: 83.7228472	total: 49.2ms	remaining: 49.1s
1:	learn: 83.7140926	total: 86.9ms	remaining: 43.4s
2:	learn: 83.7002341	total: 128ms	remaining: 42.5s
3:	learn: 83.6893954	total: 167ms	remaining: 41.5s
4:	learn: 83.6766177	total: 207ms	remaining: 41.2s
5:	learn: 83.6674447	total: 246ms	remaining: 40.7s
6:	learn: 83.6409228	total: 297ms	remaining: 42.2s
7:	learn: 83.6325366	total: 324ms	remaining: 40.1s
8:	learn: 83.6265930	total: 348ms	remaining: 38.4s
9:	learn: 83.6135250	total: 377ms	remaining: 37.3s
10:	learn: 83.6058222	total: 403ms	remaining: 36.2s
11:	learn: 83.5917444	total: 432ms	remaining: 35.6s
12:	learn: 83.5823834	total: 458ms	remaining: 34.7s
13:	learn: 83.5674904	total: 484ms	remaining: 34.1s
14:	learn: 83.5441515	total: 511ms	remaining: 33.6s
15:	learn: 83.5369615	total: 532ms	remaining: 32.7s
16:	learn: 83.5309288	total: 550ms	remaining: 31.8s
17:	learn: 83.5118174	total: 575ms	remaining: 31.4s
18:	learn: 83.4991999	total: 600ms	remaini

In [0]:
sel = X.columns[fsel.get_support(indices=True)]

In [17]:
len(sel)

60

In [0]:
X = X[sel]

In [0]:
rfe = RFE(RandomForestRegressor(n_jobs = -1), step = 5, verbose = 2)

In [20]:
%%time
rfe.fit(X, y)

Fitting estimator with 60 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.
Fitting estimator with 35 features.
CPU times: user 37min, sys: 1.66 s, total: 37min 2s
Wall time: 18min 57s


RFE(estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                    criterion='mse', max_depth=None,
                                    max_features='auto', max_leaf_nodes=None,
                                    max_samples=None, min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=100, n_jobs=-1,
                                    oob_score=False, random_state=None,
                                    verbose=0, warm_start=False),
    n_features_to_select=None, step=5, verbose=2)

In [0]:
sel = X.columns[rfe.get_support(indices=True)]

In [0]:
X = X[sel]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [0]:
estimator = CatBoostRegressor(task_type="GPU",
                              devices='0:1')

In [25]:
%%time
estimator.fit(X_train, y_train)

Learning rate set to 0.105541
0:	learn: 83.6090010	total: 24.9ms	remaining: 24.9s
1:	learn: 83.5970866	total: 48.2ms	remaining: 24.1s
2:	learn: 83.5804975	total: 72.3ms	remaining: 24s
3:	learn: 83.5498406	total: 97.2ms	remaining: 24.2s
4:	learn: 83.5291400	total: 122ms	remaining: 24.2s
5:	learn: 83.5186018	total: 145ms	remaining: 24s
6:	learn: 83.5025080	total: 168ms	remaining: 23.9s
7:	learn: 83.4884490	total: 191ms	remaining: 23.7s
8:	learn: 83.4786944	total: 211ms	remaining: 23.2s
9:	learn: 83.4502504	total: 231ms	remaining: 22.9s
10:	learn: 83.4389194	total: 248ms	remaining: 22.3s
11:	learn: 83.4277825	total: 271ms	remaining: 22.3s
12:	learn: 83.4141183	total: 289ms	remaining: 21.9s
13:	learn: 83.4026813	total: 306ms	remaining: 21.5s
14:	learn: 83.3855386	total: 321ms	remaining: 21.1s
15:	learn: 83.3728941	total: 337ms	remaining: 20.7s
16:	learn: 83.3598433	total: 353ms	remaining: 20.4s
17:	learn: 83.3419745	total: 369ms	remaining: 20.1s
18:	learn: 83.3346963	total: 387ms	remaining

<catboost.core.CatBoostRegressor at 0x7fbbc6f3ed30>

In [26]:
estimator.score(X_test, y_test)

-0.0205088890134435

In [0]:
test = pd.read_csv('/content/dataset/test.csv')

In [38]:
%%time
X_sub = prepare_df(test.copy())

CPU times: user 392 ms, sys: 11 ms, total: 403 ms
Wall time: 405 ms


In [0]:
X_sub['date_time'] = X_sub['date_time'].dt.hour

In [0]:
X_sub = X_sub[sel]

In [0]:
test['air_pollution_index'] = estimator.predict(X_sub)

In [0]:
test = test.reset_index()

In [0]:
test.to_csv('sub1.csv', columns = ['date_time', 'air_pollution_index'], index = False)