Modified from: https://www.kaggle.com/remekkinas/tps-7-ensemble-stacking-meta-regressor and https://www.kaggle.com/andy6804tw/catboost-18feature-cross-validation?scriptVersionId=68459420

In [None]:
# load packages
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_squared_error, mean_squared_log_error

from catboost import CatBoostRegressor

Using leave-one-group-out cross-validation (one group per calendar month) gives better results than ordinary k-fold cross-validation.

In [None]:
# define function to build model, perform cv evaluation and make prediction
def CatBoost_model_logo(train_data, train_label, third_party_group, test_data, 
                        model_seed, learning_rate, depth, l2_leaf_reg, random_strength, 
                        bagging_temperature, grow_policy, leaf_estimation_method, eval_metric):
    """Fit CatBoost with leave-one-group-out CV and predict the test set.

    One model is fitted per held-out group. The held-out group serves both as
    the early-stopping ``eval_set`` and as the scoring fold. Predictions and
    labels are passed through ``np.exp`` before scoring, so ``train_label`` is
    expected to be on a natural-log scale.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training features.
    train_label : pandas.Series
        Log-scale training target, row-aligned with ``train_data``.
    third_party_group : array-like
        Group label per training row; each distinct value becomes one fold.
    test_data : pandas.DataFrame
        Test features with the same columns as ``train_data``.
    model_seed, learning_rate, depth, l2_leaf_reg, random_strength,
    bagging_temperature, grow_policy, leaf_estimation_method, eval_metric
        Forwarded unchanged to ``CatBoostRegressor``.

    Returns
    -------
    tuple
        ``(pred_y_test_np, pred_y_train_np, feature_importance_df)``: the
        fold-averaged test predictions, the out-of-fold training predictions
        (both after ``np.exp``), and a frame with the fold-averaged feature
        importances.
    """
    model = CatBoostRegressor(loss_function='RMSE', iterations=80000, od_type='Iter', thread_count=-1, 
                              random_seed=model_seed, learning_rate=learning_rate,  depth=depth, 
                              eval_metric=eval_metric, grow_policy=grow_policy, use_best_model=True,
                              leaf_estimation_method=leaf_estimation_method, l2_leaf_reg=l2_leaf_reg, 
                              random_strength=random_strength, bagging_temperature=bagging_temperature)
    # accumulators: out-of-fold train predictions, averaged test predictions, averaged importances
    pred_y_train_np = np.zeros(shape=(train_data.shape[0]))
    pred_y_test_np = np.zeros(shape=(test_data.shape[0]))
    feature_importance_df = pd.DataFrame({'feature_name': list(train_data.columns),
                                          'feature_importance': 0.0})
    # logo cv evaluation
    logo = LeaveOneGroupOut()
    # derive the fold count from the groups so the averages below stay correct
    # for any number of distinct groups (previously hard-coded to 10)
    folds = logo.get_n_splits(groups=third_party_group)
    splits = logo.split(train_data, train_label, third_party_group)
    for counter, (train_index, test_index) in enumerate(splits):
        # the splitter yields positional indices, so select rows with .iloc
        X_train, X_test = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_test = train_label.iloc[train_index], train_label.iloc[test_index]
        # train
        # NOTE(review): the held-out fold is also the early-stopping eval_set,
        # so the scores printed below are measured on data used to pick the
        # best iteration.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False, early_stopping_rounds=500)
        fold_prediction = np.exp(model.predict(X_test))
        pred_y_train_np[test_index] = fold_prediction
        pred_y_test_np = pred_y_test_np + np.exp(model.predict(test_data)) / folds
        feature_importance_df['feature_importance'] = (
            feature_importance_df['feature_importance'] + model.get_feature_importance() / folds
        )
        y_test_original = np.exp(y_test)
        print('Fold', str(counter), 'RMSLE:', np.sqrt(mean_squared_log_error(y_test_original, fold_prediction)), 
              'RMSE:', np.sqrt(mean_squared_error(y_test_original, fold_prediction)))
    train_label_original = np.exp(train_label)
    print('Total ', 'RMSLE:', np.sqrt(mean_squared_log_error(train_label_original, pred_y_train_np)),
          'RMSE:', np.sqrt(mean_squared_error(train_label_original, pred_y_train_np)))
    return pred_y_test_np, pred_y_train_np, feature_importance_df

In [None]:
# load raw data
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv') 
# remove last row
# NOTE(review): presumably the final timestamp falls in a different month and
# would otherwise form a tiny CV group of its own -- confirm against the data.
train = train[:-1]
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
target_columns = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']
sensor_columns = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
weather_columns = ['deg_C', 'relative_humidity', 'absolute_humidity']
# log transformation of the targets (models are trained on the log scale)
train[target_columns] = np.log(train[target_columns])
# combine training set and testing set so features are built once for both
full_data = pd.concat([train, test])
# pd.to_datetime instead of astype('datetime64'): the unit-less dtype string
# is rejected by pandas >= 2.0
full_data['date_time'] = pd.to_datetime(full_data['date_time'])
train_months = full_data["date_time"].dt.month[:len(train)]    # third-party group to perform leave one group out cv
# new weather-related features
full_data['max_absolute_humidity'] = (full_data['absolute_humidity']*100)/full_data['relative_humidity']
part1 = np.log(full_data['relative_humidity']/100)
part2 = 17.625*full_data['deg_C']/(243.04+full_data['deg_C'])
full_data['dew_point'] = 243.04*(part1+part2)/(17.625-part1-part2)
# new time-related features
full_data['dayofweek'] = full_data['date_time'].dt.weekday  
full_data['weekend'] = np.where(full_data['date_time'].dt.dayofweek>=5, 1, 0)
full_data['hourofday'] = full_data['date_time'].dt.hour
# peak hours: 08-19 on weekdays, 12-23 on weekends
weekday_peak = full_data['hourofday'].isin(np.arange(8, 20, 1)) & (full_data['weekend']==0)
weekend_peak = full_data['hourofday'].isin(np.arange(12, 24, 1)) & (full_data['weekend']==1)
full_data['peak_hour'] = np.where(weekday_peak | weekend_peak, 1, 0)
full_data['peak_month'] = np.where(full_data['date_time'].dt.month.isin([9,10,11,12,1,2,3,4,5]), 1, 0)
# other feature
full_data['outlier'] = np.where((full_data['deg_C']>20)&(full_data['max_absolute_humidity']<1), 1, 0)  # outlier
# use sin and cos to capture cyclic pattern
diff = full_data['date_time'] - full_data['date_time'].min()
# yearly pattern: periods of 1 to 4 years, measured in whole elapsed days
days = diff.dt.days
for n_years in (1, 2, 3, 4):
    year_angle = 2 * math.pi * days / (365 * n_years)
    full_data[f'year_{n_years}_sin'] = np.sin(year_angle)
    full_data[f'year_{n_years}_cos'] = np.cos(year_angle)
# daily pattern
# NOTE(review): diff.dt.seconds is only the within-day seconds component
# (0..86399), not the total elapsed seconds, so the 2-day and 7-day encodings
# below never complete a cycle. Left unchanged because the per-target feature
# lists were chosen with these values -- confirm whether total_seconds() was
# intended.
seconds = diff.dt.seconds
for n_days in (1, 2, 7):
    day_angle = 2 * math.pi * seconds / (3600 * 24 * n_days)
    full_data[f'day_{n_days}_sin'] = np.sin(day_angle)
    full_data[f'day_{n_days}_cos'] = np.cos(day_angle)
# lag data: difference between the value `lag` rows away and the current value
# (rows are shifted over the concatenated train+test frame; positions with no
# neighbour are filled with 0, so those differences equal -current value)
lags = [1, 4, 24, 7*24]
for feature in weather_columns+sensor_columns:
    temp = full_data[feature]
    # forwards
    for lag in lags:
        lag_name = feature + '_future_' + str(lag)
        temp_lag = full_data[feature].shift(periods=-lag, fill_value=0)
        full_data[lag_name] = (temp_lag - temp)
    # backwards
    for lag in lags:
        lag_name = feature + '_past_' + str(lag)
        temp_lag = full_data[feature].shift(periods=lag, fill_value=0)
        full_data[lag_name] = (temp_lag - temp)
# split full dataset
full_data = full_data.drop(columns=['date_time'])
train = full_data[:len(train)].copy()
test = full_data[len(train):].drop(columns=target_columns).copy()
print(train.shape, test.shape)

A random feature was added to identify which features negatively affect the model's performance. Removing those features improves performance further.

In [None]:
# predict target_carbon_monoxide
# features excluded for this target only
drop_feature_carbon_monoxide = [
    'absolute_humidity_past_1',
    'sensor_4_past_1',
    'relative_humidity_past_1',
    'relative_humidity_future_1',
    'deg_C_past_1',
    'deg_C_future_1',
]
y_carbon_monoxide = train['target_carbon_monoxide']
X_carbon_monoxide = train.drop(columns=[*target_columns, *drop_feature_carbon_monoxide])
test_carbon_monoxide = test.drop(columns=drop_feature_carbon_monoxide)
print(X_carbon_monoxide.shape, test_carbon_monoxide.shape)
# cross-validate with one fold per training month, then predict the test set
(
    test_y_carbon_monoxide_logo,
    train_y_carbon_monoxide_logo,
    feature_importance_carbon_monoxide_logo,
) = CatBoost_model_logo(
    train_data=X_carbon_monoxide,
    train_label=y_carbon_monoxide,
    third_party_group=train_months,
    test_data=test_carbon_monoxide,
    model_seed=42,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=8.9,
    random_strength=1.95,
    bagging_temperature=8.38,
    grow_policy='Lossguide',
    leaf_estimation_method='Newton',
    eval_metric='RMSE',
)

In [None]:
# top 20 important features for target_carbon_monoxide (fold-averaged importances)
plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance_carbon_monoxide_logo.sort_values(by='feature_importance', ascending=False).head(20),
    x='feature_importance',
    y='feature_name',
)
# title uses the full target column name, consistent with the other importance plots
plt.title('target_carbon_monoxide')
plt.show()

In [None]:
# predict target_benzene
# features excluded for this target only
drop_feature_benzene = [
    'sensor_4_past_168',
    'sensor_1_future_24',
    'sensor_1_future_168',
    'deg_C_future_1',
    'dayofweek',
    'year_3_cos',
    'weekend',
    'year_3_sin',
    'day_2_cos',
    'year_4_sin',
    'year_4_cos',
    'day_7_cos',
    'day_1_sin',
    'peak_month',
]
y_benzene = train['target_benzene']
X_benzene = train.drop(columns=[*target_columns, *drop_feature_benzene])
test_benzene = test.drop(columns=drop_feature_benzene)
print(X_benzene.shape, test_benzene.shape)
# cross-validate with one fold per training month, then predict the test set
(
    test_y_benzene_logo,
    train_y_benzene_logo,
    feature_importance_benzene_logo,
) = CatBoost_model_logo(
    train_data=X_benzene,
    train_label=y_benzene,
    third_party_group=train_months,
    test_data=test_benzene,
    model_seed=42,
    learning_rate=0.166,
    depth=2,
    l2_leaf_reg=8.7,
    random_strength=1.54,
    bagging_temperature=3.34,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    eval_metric='RMSE',
)

In [None]:
# top 20 important features for target_benzene (fold-averaged importances)
plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance_benzene_logo.sort_values(by='feature_importance', ascending=False).head(20),
    x='feature_importance',
    y='feature_name',
)
plt.title('target_benzene')
plt.show()

In [None]:
# predict target_nitrogen_oxides
# no target-specific feature removal here: every engineered feature is used
y_nitrogen_oxides = train['target_nitrogen_oxides']
X_nitrogen_oxides = train.drop(columns=target_columns)
# cross-validate with one fold per training month, then predict the test set
(
    test_y_nitrogen_oxides_logo,
    train_y_nitrogen_oxides_logo,
    feature_importance_nitrogen_oxides_logo,
) = CatBoost_model_logo(
    train_data=X_nitrogen_oxides,
    train_label=y_nitrogen_oxides,
    third_party_group=train_months,
    test_data=test,
    model_seed=42,
    learning_rate=0.028,
    depth=6,
    l2_leaf_reg=3.12,
    random_strength=1.8,
    bagging_temperature=4.42,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    eval_metric='RMSE',
)

In [None]:
# top 20 important features for target_nitrogen_oxides (fold-averaged importances)
plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance_nitrogen_oxides_logo.sort_values(by='feature_importance', ascending=False).head(20),
    x='feature_importance',
    y='feature_name',
)
plt.title('target_nitrogen_oxides')
plt.show()

In [None]:
# predict: copy each target's test predictions into the submission template
predictions_by_target = {
    'target_carbon_monoxide': test_y_carbon_monoxide_logo,
    'target_benzene': test_y_benzene_logo,
    'target_nitrogen_oxides': test_y_nitrogen_oxides_logo,
}
for target_name, target_prediction in predictions_by_target.items():
    submission[target_name] = target_prediction
# write without the index so the file keeps only the template's columns
submission.to_csv('submission_Catboost_logo.csv', index=False)

In [None]:
# preview the first 10 rows of the filled-in submission frame
submission.head(10)