In [0]:
#!unzip "/content/drive/My Drive/Data/fc33077e-6-dataset.zip"

In [0]:
#!pip install catboost
#!pip install -U yellowbrick

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from yellowbrick.model_selection import FeatureImportances
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

  import pandas.util.testing as tm


In [4]:
%%time
train = pd.read_csv('/content/dataset/train.csv')

CPU times: user 38.5 ms, sys: 16.6 ms, total: 55.1 ms
Wall time: 57.6 ms


In [0]:
X = train.drop(columns = ['air_pollution_index'])
y = train['air_pollution_index']

In [6]:
X.shape

(33750, 13)

In [0]:
def prepare_df(train):
    org_keys = train.columns.tolist()
    
    train['is_holiday'] = np.where(train['is_holiday'] == 'None', False, True)
    train['date_time'] = pd.to_datetime(train['date_time'])
    
    train.set_index('date_time', inplace = True)
    
    days = ['01d', '03d', '07d', '14d', '30d']
    
    sum_attributes = ['is_holiday', 'rain_p_h', 'snow_p_h']

    for val in sum_attributes:
        for day in days:
            train['num_'+val+'_'+day] = train.rolling(day)[val].sum()
            
    weather_types = train['weather_type'].unique().tolist()
    for weather in weather_types:
        train[weather] = np.where(train['weather_type'] == weather, 1, 0)
    
    train.drop(columns = 'weather_type', inplace = True)
    
    avg_attributes = ['humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature', 
                     'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume'] + weather_types

    for val in avg_attributes:
        for day in days:
            train['avg_'+val+'_'+day] = train.rolling(day)[val].mean()
            
    for column in train.columns:
        if 'avg' in column:
            train[column+'_diff'] = train[column[4:-4]] - train[column]
            
    return train.reset_index()

In [8]:
%%time
X = prepare_df(X.copy())

CPU times: user 598 ms, sys: 63.9 ms, total: 662 ms
Wall time: 664 ms


In [9]:
X.shape

(33750, 248)

In [0]:
X['date_time'] = X['date_time'].dt.hour

In [11]:
%%time
selector = VarianceThreshold(0.1)
selector.fit(X)

CPU times: user 429 ms, sys: 87.2 ms, total: 516 ms
Wall time: 519 ms


In [0]:
sel = X.columns[selector.get_support(indices=True)]

In [0]:
X = X[sel]

In [14]:
%%time
fsel = SelectFromModel(CatBoostRegressor(task_type="GPU", verbose=0), max_features=100)
fsel.fit(X, y)

CPU times: user 20.3 s, sys: 7.19 s, total: 27.5 s
Wall time: 23 s


In [0]:
sel = X.columns[fsel.get_support(indices=True)]

In [16]:
len(sel)

60

In [0]:
X = X[sel]

In [0]:
rfe = RFE(RandomForestRegressor(n_jobs = -1), step = 5, verbose = 2)

In [19]:
%%time
rfe.fit(X, y)

Fitting estimator with 60 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.
Fitting estimator with 35 features.
CPU times: user 36min 58s, sys: 1.46 s, total: 36min 59s
Wall time: 19min


RFE(estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                    criterion='mse', max_depth=None,
                                    max_features='auto', max_leaf_nodes=None,
                                    max_samples=None, min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=100, n_jobs=-1,
                                    oob_score=False, random_state=None,
                                    verbose=0, warm_start=False),
    n_features_to_select=None, step=5, verbose=2)

In [0]:
sel = X.columns[rfe.get_support(indices=True)]

In [0]:
X = X[sel]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [0]:
estimator = CatBoostRegressor(task_type="GPU",
                              devices='0:1',
                              verbose=0)

In [24]:
%%time
estimator.fit(X_train, y_train)

CPU times: user 13.4 s, sys: 3.6 s, total: 17 s
Wall time: 13.6 s


<catboost.core.CatBoostRegressor at 0x7f83d7657da0>

In [25]:
estimator.score(X_test, y_test)

-0.018958207555134665

In [0]:
test = pd.read_csv('/content/dataset/test.csv')

In [27]:
%%time
X_sub = prepare_df(test.copy())

CPU times: user 1.03 s, sys: 16 ms, total: 1.05 s
Wall time: 540 ms


In [0]:
X_sub['date_time'] = X_sub['date_time'].dt.hour

In [0]:
X_sub = X_sub[sel]

In [0]:
test['air_pollution_index'] = estimator.predict(X_sub)

In [0]:
test = test.reset_index()

In [0]:
test.to_csv('sub1.csv', columns = ['date_time', 'air_pollution_index'], index = False)