### CatBoost_18Feature_Cross_Validation

In [None]:
# Notebook-wide imports and display configuration.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    # load_boston is not referenced anywhere in this notebook and was removed
    # from scikit-learn in release 1.2. Guarding the import keeps the name
    # available on old installs without aborting the first cell on new ones.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# Suppress every library warning for the rest of the session.
warnings.filterwarnings('ignore')
# Use the ggplot theme for all matplotlib figures below.
plt.style.use('ggplot')

In [None]:
# Read the competition's train and test CSVs (paths are relative to the
# notebook's working directory) and report each frame's (rows, columns).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
    )
)
print('train shape:', train.shape)
print('test shape:', test.shape)

In [None]:
# Per the author's note, the timestamp 2011-01-01 00:00:00 also appears in the
# test data, so every train row carrying it is removed. The index is then
# renumbered from 0 so later positional and label lookups agree.
is_overlap = train['date_time'] == '2011-01-01 00:00:00'
train = train.loc[~is_overlap].reset_index(drop=True)

In [None]:
# Show the train shape again so the effect of the row removal is visible.
print('train shape:',train.shape)

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept (no ignore_index), so labels repeat between
# the two parts; the first len(train) rows are the training rows.
all_data = pd.concat([train, test]).assign(
    # Parse the timestamp column into datetime64 values.
    date_time=lambda frame: pd.to_datetime(frame['date_time'])
)
all_data.head()

## EDA
- The distribution of deg_C shows peaks between 20 to 30 deg.
- There is a dip in relative humidity at 40% and there are two peaks at 30% and 45% approx.
- The absolute humidity distribution peaks at about 0.25 g/m3 (the unit is my assumption; the data description does not state one explicitly).
- The distributions of sensor_1, 2, 3 and 5 appear to be left skewed, whereas sensor_4 looks normal with outliers at 500.

In [None]:
# Histograms for the eight train columns at positions 1-8, laid out on a
# 4x2 grid that is filled column by column (top to bottom, then left to right).
fig, ax = plt.subplots(4, 2, figsize=(20, 15))
for i, col in enumerate(train.columns[1:9]):
    grid_col, grid_row = divmod(i, 4)
    panel = ax[grid_row][grid_col]
    panel.hist(train[col], bins=40, color='darkblue', label=f'{col}')
    panel.set_title(f'Distribution of {col}', fontsize=15)
    panel.set_xlabel(f'{col}')
    panel.set_ylabel('Dist')
# Extra vertical spacing so titles do not collide with the x-labels above them.
plt.subplots_adjust(hspace=0.45)

In [None]:
# Histograms for the three train columns at positions 9-11, stacked vertically
# (one axes per column).
fig, ax = plt.subplots(3, 1, figsize=(8, 10))
for panel, col in zip(ax, train.columns[9:12]):
    panel.hist(train[col], bins=40, color='darkblue', label=f'{col}')
    panel.set_title(f'Distribution of {col}', fontsize=15)
    panel.set_xlabel(f'{col}')
    panel.set_ylabel('Dist')
# Extra vertical spacing so titles do not collide with the x-labels above them.
plt.subplots_adjust(hspace=0.45)

### 分析每天的資訊
Looking at the day-wise trend, we see sudden peaks and dips on certain days over the months. While temperatures stay above 20 deg after late May, they drop below 15 deg after November, but there is a sudden increase in mid-December.

In [None]:
# Convert the timestamp strings of both frames to datetime values (in place).
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y-%m-%d %H:%M:%S')

# Following code is inspired from - https://www.kaggle.com/nroman/eda-for-ashrae
# Daily mean of deg_C for the training period, drawn as a single line.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
train_daily_mean = (
    train[['date_time', 'deg_C']]
    .set_index('date_time')
    .resample('D')
    .mean()['deg_C']
)
line_ax = train_daily_mean.plot(ax=ax, label='by hour(train)', alpha=1, color='blue')
line_ax.set_ylabel('deg C', fontsize=10)
ax.set_title('Trend of Mean deg_C by Day', fontsize=12)
ax.set_xlabel('')

For the test set, compared with March in the train set, the temperatures start at approximately 3 deg and rise above 15 deg.

In [None]:
# Daily mean of deg_C for the test period.
# Relies on test['date_time'] already being datetime (resample needs a
# datetime index). The series label and title now name the test set and the
# daily resampling; they were previously copied unchanged from the train plot.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
test_daily_mean = (
    test[['date_time', 'deg_C']]
    .set_index('date_time')
    .resample('D')
    .mean()['deg_C']
)
test_daily_mean.plot(ax=ax, label='by day (test)', alpha=1, color='blue').set_ylabel('deg C', fontsize=10)
ax.set_title('Trend of Mean deg_C by Day (test)', fontsize=12)
ax.set_xlabel('')

## Data preprocessing
進一步處理之前先確認是否有缺失值：

In [None]:
# Count missing values per column of the combined train+test frame.
all_data.isnull().sum()

In [None]:
# all_data['hr'] = all_data.date_time.dt.hour*60+all_data.date_time.dt.minute
# all_data['day'] =all_data.date_time.dt.weekday//5
# all_data['satday'] = all_data.date_time.dt.weekday==5
# all_data['hr1'] = all_data.date_time.dt.hour*60+all_data.date_time.dt.minute

In [None]:
# all_data['year'] = all_data['date_time'].dt.year
# all_data['month'] = all_data['date_time'].dt.month
# all_data['week'] = all_data['date_time'].dt.week
# all_data['day'] = all_data['date_time'].dt.day
# all_data['dayofweek'] = all_data['date_time'].dt.dayofweek
# all_data['hour'] = all_data['date_time'].dt.hour
# # convert datetime to timestamp(s)
# all_data['time'] = all_data['date_time'].astype(np.int64)//10**9
# all_data.drop(columns = 'date_time', inplace = True)
# print('all_data shape:', all_data.shape)
# all_data.head()

In [None]:
# Calendar month of each training row; used later as the group label for the
# cross-validation split. Must be taken before date_time is dropped below.
months = all_data["date_time"].dt.month[:len(train)]

timestamps = all_data["date_time"]

# Calendar features.
# working_hours: 1 when the hour of day is within 8..20 (inclusive), else 0.
all_data["working_hours"] = timestamps.dt.hour.between(8, 20).astype("int")
# is_weekend: 1 for Saturday/Sunday (dayofweek 5 or 6), else 0.
all_data["is_weekend"] = (timestamps.dt.dayofweek >= 5).astype("int")
# hr: minutes elapsed since midnight.
all_data["hr"] = timestamps.dt.hour * 60 + timestamps.dt.minute
# satday: 1 on Saturdays only.
all_data["satday"] = (timestamps.dt.weekday == 5).astype("int")

# Sensor deltas: each reading minus the reading six rows earlier.
# NOTE(review): fill_value=0 means the first six rows hold the raw reading
# rather than a real difference, and the shift runs across the train/test
# boundary of the concatenated frame - confirm both are intended.
for sensor_no in range(1, 6):
    readings = all_data[f"sensor_{sensor_no}"]
    all_data[f"s{sensor_no}-6"] = readings - readings.shift(periods=6, fill_value=0)

# Seconds since the Unix epoch. Computed with timestamp arithmetic so the
# result does not depend on the datetime column's storage resolution
# (casting to int64 and dividing by 10**9 is only right for nanoseconds).
all_data["time"] = (timestamps - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

# The raw timestamp is no longer needed once the features above exist.
all_data = all_data.drop(columns="date_time")

In [None]:
# Split the combined frame back into train/test parts and separate the three
# target columns from the features. Features are handed on as plain NumPy
# arrays; targets stay a DataFrame so they can be selected by name.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
n_train = len(train)
train_part = all_data[:n_train]
test_part = all_data[n_train:]

X = train_part.drop(columns=target_cols).values
y = train_part[target_cols]
# Models are fitted on log1p(target); predictions are mapped back with expm1.
y_log = np.log1p(y)
X_test = test_part.drop(columns=target_cols).values

print('X_train shape:', X.shape)
print('y_train shape:', y.shape)
print('X_test shape:', X_test.shape)

## Train Model

In [None]:
# Load the sample submission; its target columns are overwritten with the
# model predictions in the training cell and the frame is saved at the end.
preds = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

In [None]:
# Sets of hyperparameters optimized by Optuna for each target
# (one dict per target, in the same order as the target columns).
#
# Earlier parameter set, kept for reference only:
# cb_params = [
#                 {'learning_rate': 0.010169009412219588,
#                  'l2_leaf_reg': 8.908337085912136,
#                  'bagging_temperature': 8.384477224270551,
#                  'random_strength': 1.950237493637981,
#                  'depth': 6,
#                  'grow_policy': 'Lossguide',
#                  'leaf_estimation_method': 'Newton'},
#                 {'learning_rate': 0.166394867169309,
#                  'l2_leaf_reg': 8.704675157564441,
#                  'bagging_temperature': 3.340826164726799,
#                  'random_strength': 1.538518016574368,
#                  'depth': 3,
#                  'grow_policy': 'Depthwise',
#                  'leaf_estimation_method': 'Newton'},
#                 {'learning_rate': 0.028141156076957437,
#                  'l2_leaf_reg': 3.116523267336638,
#                  'bagging_temperature': 4.420661209459851,
#                  'random_strength': 1.8011752694610028,
#                  'depth': 6,
#                  'grow_policy': 'Depthwise',
#                  'leaf_estimation_method': 'Newton'},
#             ]
cb_params = [
    {
        'learning_rate': 0.11152721528043753,
        'l2_leaf_reg': 9.495998186799408,
        'bagging_temperature': 1.6183369518324908,
        'random_strength': 1.7628253181122102,
        'depth': 7,
        'grow_policy': 'Lossguide',
        'leaf_estimation_method': 'Gradient',
    },
    {
        'learning_rate': 0.005613720196384217,
        'l2_leaf_reg': 0.8262159912383316,
        'bagging_temperature': 8.414314200388226,
        'random_strength': 1.0777361548370274,
        'depth': 4,
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
    },
    {
        'learning_rate': 0.3402632799551716,
        'l2_leaf_reg': 9.163175050035028,
        'bagging_temperature': 5.471927179930505,
        'random_strength': 1.008934010879257,
        'depth': 4,
        'grow_policy': 'SymmetricTree',
        'leaf_estimation_method': 'Gradient',
    },
]

## StratifiedKFold
Public Score: 
- 0.21217 with 11 features 
 - ['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'working_hours', 'is_weekend', 'hr', 'satday', 'time']
- 0.19965 with 13 features 
 - ['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'working_hours', 'is_weekend', 'time']

inside test error:
- 0.126380933 with 11 features 
- 0.119173847 with 13 features (0.09045966192305815+0.08042533289018933+0.18663654561947446)

In [None]:
# %%time
# from sklearn.metrics import mean_squared_log_error
# from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
# from catboost import CatBoostRegressor
# from sklearn.multioutput import MultiOutputRegressor

# all_fi = []
# splits = 10
# target_names = y_log.columns

# for i, target in enumerate(target_names):
#     print(f"\nTraining for {target}...")
#     skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
#     oof_preds = np.zeros((X_scaled.shape[0],))
#     model_preds = 0
#     model_fi = 0
#     for num, (train_idx, valid_idx) in enumerate(skf.split(X_scaled, months)):
#         X_train, X_valid = X_scaled[[train_idx]], X_scaled[[valid_idx]]
#         y_train, y_valid = y_log.loc[train_idx, target], y_log.loc[valid_idx, target]
#         model = CatBoostRegressor(random_state=42,
#                                  thread_count=4,
#                                  verbose=False,
#                                  loss_function='RMSE',
#                                  eval_metric='RMSE',
#                                  od_type="Iter",
#                                  early_stopping_rounds=500,
#                                  use_best_model=True,
#                                  iterations=10000,
#                                  **cb_params[i])
#         model.fit(X_train, y_train,
#                   eval_set=(X_valid, y_valid),
#                   verbose=False)
#         model_preds += np.expm1(model.predict(X_test_scaled)) / splits
#         model_fi += model.feature_importances_
#         oof_preds[valid_idx] = np.expm1(model.predict(X_valid))
#         print(f"Fold {num} RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_valid), oof_preds[valid_idx]))}")
#     print(f"\nOverall RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_log[target]), oof_preds))}")    
#     preds[target] = model_preds
#     all_fi.append(dict(zip(X.columns, model_fi)))

## LeaveOneGroupOut
cat_features=["working_hours","is_weekend","satday"] 8、9、11

Public Score: 
- 0.19336 with 13 features 
 - ['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'working_hours', 'is_weekend', 'time']

inside test error:
- 0.18441005 with 13 features (0.14049546445212416+0.09160805296249219+0.3211266334764782)

In [None]:
%%time
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import LeaveOneGroupOut
from catboost import CatBoostRegressor

all_fi = []
splits = 10
target_names=y_log.columns

for i, target in enumerate(target_names):
    print(f"\nTraining for {target}...")
    logo = LeaveOneGroupOut()
    oof_preds = np.zeros((X.shape[0],))
    model_preds = 0
    model_fi = 0
    for num, (train_idx, valid_idx) in enumerate(logo.split(X, y_log, months)):
        X_train, X_valid = X[[train_idx]], X[[valid_idx]]
        y_train, y_valid = y_log.loc[train_idx, target], y_log.loc[valid_idx, target]
        model = CatBoostRegressor(random_state=42,
                                 thread_count=4,
                                 verbose=False,
                                 loss_function='RMSE',
                                 eval_metric='RMSE',
                                 od_type="Iter",
                                 early_stopping_rounds=500,
                                 use_best_model=True,
                                 iterations=10000,
                                 task_type="CPU",
                                 **cb_params[i])
        
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  verbose=False)
        model_preds += np.expm1(model.predict(X_test)) / splits
        model_fi += model.feature_importances_
        oof_preds[valid_idx] = np.expm1(model.predict(X_valid))
        print(f"Fold {num} RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_valid), oof_preds[valid_idx]))}")
    print(f"\nOverall RMSLE: {np.sqrt(mean_squared_log_error(np.expm1(y_log[target]), oof_preds))}")    
    preds[target] = model_preds
    all_fi.append(dict(zip(all_data.columns, model_fi)))

## Feature importances
Thanks to: @Maxim Kazantsev https://www.kaggle.com/maximkazantsev/tps-07-21-eda-catboost

In [None]:
# Gather every feature name that appears in any target's importance dict.
feature_list = set()
for target_importances in all_fi:
    feature_list = feature_list.union(target_importances.keys())
print(f"There are {len(feature_list)} unique features used for training: {feature_list}")

In [None]:
# Build one table of feature importances: a row per feature and one
# "Importance_<k>" column per entry of all_fi (k is 1-based).
df = pd.DataFrame({"Feature": list(feature_list)})
for position, importances in enumerate(all_fi, start=1):
    # Stored importances are scaled down by 1000.
    # NOTE(review): presumably 10 folds x 100 (importances summed per fold) -
    # confirm against the training cell.
    # Features absent from this dict map to NaN and are set to 0 below.
    df["Importance_" + str(position)] = df["Feature"].map(importances) / 1000
df = df.fillna(0)
# Order rows by the first model's importance, largest first.
df = df.sort_values("Importance_1", axis=0, ascending=False)

In [None]:
# Grouped horizontal bar chart: three bars per feature, one per target model,
# offset vertically by one bar height so they sit side by side.
x = np.arange(0, len(df["Feature"]))
height = 0.3

fig, ax = plt.subplots(figsize=(12, 9))

# (bar positions, importance column, fill colour, legend label) per target.
bar_specs = (
    (x - height, "Importance_1", "cornflowerblue", target_names[0]),
    (x, "Importance_2", "palevioletred", target_names[1]),
    (x + height, "Importance_3", "mediumseagreen", target_names[2]),
)
bars1, bars2, bars3 = (
    ax.barh(positions, df[column], height=height,
            color=fill,
            edgecolor="black",
            label=legend_label)
    for positions, column, fill, legend_label in bar_specs
)

# Titles and axis labels.
ax.set_title("Feature importances", fontsize=20, pad=5)
ax.set_ylabel("Feature names", fontsize=15, labelpad=5)
ax.set_xlabel("Feature importance", fontsize=15, labelpad=5)

# One tick per feature, labelled with the feature name.
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=12)
ax.tick_params(axis="x", labelsize=12)
ax.grid(axis="x")
ax.legend(fontsize=13, loc="lower right")
plt.margins(0.04, 0.01)
# Flip the y-axis so the first row of df is drawn at the top.
plt.gca().invert_yaxis()

## Save Predict File

In [None]:
# preds.head()

In [None]:
# all_pred=np.vstack(([preds['target_carbon_monoxide'].values, preds['target_benzene'].values, preds['target_nitrogen_oxides'].values])).T
# scaler.inverse_transform(all_pred)

In [None]:
# preds['target_carbon_monoxide']=scaler.inverse_transform(all_pred)[:,0]
# preds['target_benzene']=scaler.inverse_transform(all_pred)[:,1]
# preds['target_nitrogen_oxides']=scaler.inverse_transform(all_pred)[:,2]

In [None]:
# Preview the first rows of the submission frame before writing it out.
preds.head()

In [None]:
# Write the submission file to the working directory, without the index column.
preds.to_csv('submission.csv', index=False)