In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Folder holding the competition files; names are appended directly, so the trailing slash matters.
INPUT_DIR = "../input/tabular-playground-series-jul-2021/"
train_csv, test_csv = (
    pd.read_csv(f"{INPUT_DIR}{split}.csv") for split in ("train", "test")
)

from datetime import datetime
def log_scaling(col):
    """Return log(1 + col), computed element-wise with np.log1p."""
    return np.log1p(col)
def treat_data(csv):
    """
    Engineer model features from a frame shaped like train.csv / test.csv.

    Takes: pd df containing a 'date_time' column formatted as
    '%Y-%m-%d %H:%M:%S' plus 'absolute_humidity', 'relative_humidity',
    'sensor_1', 'sensor_2', 'sensor_3' and 'sensor_5'.
    Returns: a new pd df (the input frame is left untouched) in which
      * 'date_time' is converted from string format to datetime format,
      * 'hour', 'day' (day of month) and 'week' (ISO week number) are added,
      * the day of week (Monday = 0) is one-hot encoded as 'weekday_0' ..
        'weekday_6'; all seven columns are always produced so that different
        frames end up with the same columns,
      * 'SMC' = absolute_humidity * 100 / relative_humidity is added,
      * 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_5' and 'SMC' are replaced
        by their log1p transform.
    Any other object/category column is one-hot encoded by pd.get_dummies too.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    csv = csv.copy()

    stamps = pd.to_datetime(csv['date_time'], format='%Y-%m-%d %H:%M:%S')
    csv['date_time'] = stamps
    csv['hour'] = stamps.dt.hour
    csv['day'] = stamps.dt.day
    # isocalendar() yields a nullable UInt32 column; cast to a plain integer dtype.
    csv['week'] = stamps.dt.isocalendar().week.astype('int64')
    # Not generated: month, year, days since the first timestamp (disabled in the original code).

    # A categorical with a fixed set of levels makes get_dummies emit all seven
    # weekday columns even when some weekdays do not occur in this frame.
    csv['weekday'] = pd.Categorical(stamps.dt.dayofweek, categories=list(range(7)))

    csv['SMC'] = (csv['absolute_humidity'] * 100) / csv['relative_humidity']
    csv = pd.get_dummies(csv)

    for col in ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_5', 'SMC']:
        csv[col] = np.log1p(csv[col])
    return csv
# Apply the same feature-engineering step to both splits.
train_csv = treat_data(train_csv)
test_csv = treat_data(test_csv)

In [None]:
# One row per column: current values on the left, log1p of them on the right.
cols = pd.Series(train_csv.columns).drop(0)  # every column except the first one
fig, ax = plt.subplots(len(cols), 2, figsize=(12,25))
for (left_ax, right_ax), name in zip(ax, cols):
    sns.histplot(train_csv[name], ax=left_ax)
    sns.histplot(log_scaling(train_csv[name]), ax=right_ax)
fig.tight_layout()
plt.show()

In [None]:
# Print the column dtypes and non-null counts of the engineered training frame.
train_csv.info()

In [None]:
# Print summary statistics for the test frame first, then for the training frame.
for frame in (test_csv, train_csv):
    print(frame.describe())

In [None]:
from sklearn.model_selection import train_test_split, KFold
def pre_inference(csv):
    """
    Drop the 'date_time' column in preparation for training / prediction.

    Takes: pd df (train or test frame).
    Returns: a new pd df without the 'date_time' column; the input frame is
    not modified. If the column is already absent the frame is returned
    unchanged (as a copy), so re-running the calling cell does not raise.
    """
    # errors='ignore' keeps the call idempotent when 'date_time' is already gone.
    return csv.drop(columns=['date_time'], errors='ignore')
# Remove the 'date_time' column from the training frame before the feature/target split.
train_csv = pre_inference(train_csv)

def log_scaling(col):
    """Return log(1 + col) element-wise (same behavior as the earlier definition of this helper)."""
    return np.log1p(col)

def x_y_split(csv):
    """
    Separate a training frame into features and targets.

    Takes: pd df containing the three 'target_*' columns.
    Returns: (x_csv, y_csv) where y_csv holds only the three target columns
    and x_csv holds every other column.
    """
    target_names = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']
    y_csv = csv[target_names]
    x_csv = csv.drop(columns=target_names)
    return x_csv, y_csv
# Notebook-level feature frame (X) and target frame (y); cross_validate reads these globals.
X, y = x_y_split(train_csv)

In [None]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor 
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from collections import OrderedDict
from hyperopt import hp, fmin, tpe
import warnings
# Suppress every UserWarning from here on (process-wide filter, not limited to one library).
warnings.filterwarnings(action='ignore', category=UserWarning)

def cross_validate(learning_rate=0.1, max_depth=10, colsample_bytree=0.8, subsample=1, n_estimators=100, booster="gbtree"):
    """
    Fit one multi-output XGBoost model per fold of a 5-fold split and report RMSLE.

    Reads the notebook-level feature frame `X` and target frame `y`. Folds are
    produced by KFold(n_splits=5) without shuffling, i.e. contiguous row ranges.

    Parameters (all forwarded to xgb.XGBRegressor):
        learning_rate, max_depth, colsample_bytree, subsample, n_estimators, booster

    Returns:
        list of the fitted MultiOutputRegressor models, one per fold, in fold order.

    Side effects:
        Prints the validation row range and RMSLE of every fold, then the mean RMSLE.
    """
    kf = KFold(n_splits=5)
    losses = []
    models = []
    for index, (train_index, val_index) in enumerate(kf.split(X)):
        print("Split:",index+1,"VAL:", min(val_index),'-',max(val_index))
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        base_model = xgb.XGBRegressor(
            # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
            objective='reg:squarederror',
            colsample_bytree=colsample_bytree,
            # Previously accepted by this function but never passed on to the model.
            subsample=subsample,
            learning_rate=learning_rate,
            max_depth=max_depth,
            alpha=10,
            n_estimators=n_estimators,
            verbosity=0,
            random_state=100,
            booster=booster,
        )
        # One independent regressor is fitted per target column.
        xg_reg = MultiOutputRegressor(base_model)
        xg_reg.fit(X_train,y_train)
        preds = xg_reg.predict(X_val)
        # Clip to a small positive value so the log-based metric is always defined.
        RMSLE = np.sqrt(mean_squared_log_error(y_val, np.clip(preds, 0.001, None)))
        print("loss:",RMSLE)
        losses.append(RMSLE)
        models.append(xg_reg)
    print("RMSLE : %f" % (np.mean(losses)))
    return models

"""
SPACE = OrderedDict([('learning_rate', hp.loguniform('learning_rate', 
                                                     np.log(0.1), np.log(1))),
                    ('max_depth', hp.choice('max_depth', range(1, 20, 1))),
                    ('colsample_bytree', hp.loguniform('colsample_bytree', np.log(0.02),np.log(0.5))),
                    ('subsample', hp.loguniform('subsample', np.log(0.1), np.log(1.0))),
                    ('n_estimators', hp.choice('n_estimators', range(1,200,1))),
                    ('booster', hp.choice('booster',['gbtree','gblinear','dart']))
                    #('gamma', hp.lognormal('gamma', 0.02,0.5))
                    ])

def train_evaluate(learning_rate, max_depth, colsample_bytree, subsample, n_estimators, booster):
    kf = KFold(n_splits=5)
    losses = []
    for index, (train_index, val_index) in enumerate(kf.split(X)):
        #print("Split:",index+1,"VAL:", min(val_index),'-',max(val_index))
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        #data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)
        xg_reg = MultiOutputRegressor(xgb.XGBRegressor(objective ='reg:linear', 
                    colsample_bytree = colsample_bytree, learning_rate = learning_rate,
                    max_depth = max_depth, alpha = 10, n_estimators = n_estimators,verbosity = 0))
        xg_reg.fit(X_train,y_train)
        preds = xg_reg.predict(X_val)
        RMSLE = np.sqrt(mean_squared_log_error(y_val, np.clip(preds, 0.001, None)))
        losses.append(RMSLE)    
    print("RMSLE : %f" % (np.mean(losses)))
    return np.mean(losses)
def objective(params):
    print(params)
    return train_evaluate(**params)
best = fmin(objective, SPACE, algo=tpe.suggest, max_evals=100)
print(best)"""
# The triple-quoted string above is a disabled hyperopt (TPE) search over the XGBoost
# settings; being a string literal, none of it runs. This `pass` keeps the string from
# being the cell's last expression, so the notebook does not echo it as output.
# NOTE(review): inside the disabled code, train_evaluate accepts `subsample` and
# `booster` but does not pass them to XGBRegressor — fix that before re-enabling it.
pass

In [None]:
# Run 5-fold cross-validation with the default hyperparameters; keeps one fitted model per fold.
models = cross_validate()

In [None]:
# Training Log
#---------------
# Original: RMSLE : 0.327137
# Big (200 n_estimators): 0.327596 -> not actually a whole lot better
# Subsample 1: 0.327137 (gah)
# Learning rate = 0.1 : 0.320755
# Learning rate = 0.05: 0.322487 -> Bah, these gains are all marginal at best

# Gbtree/Gblinear/Dart: 0.320755 / 0.566162 / 0.322403

# W/Feature engineering v1: 0.262034
# W/Feature engineering w/out log target modifiers: 0.260650

In [None]:
# Train final model
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor 
from sklearn.metrics import mean_squared_error, mean_squared_log_error
"""
xg_reg = MultiOutputRegressor(xgb.XGBRegressor(objective ='reg:linear', 
            colsample_bytree = 0.8, learning_rate = 0.1, subsample = 1,
            max_depth = 10, alpha = 10, n_estimators = 100, booster = 'gbtree'))
xg_reg.fit(X,y)
preds = xg_reg.predict(X)
RMSLE = np.sqrt(mean_squared_log_error(y, np.clip(preds, 0.001, None)))
print("Train loss: ",RMSLE)

final_preds = xg_reg.predict(pre_inference(test_csv))
"""
# K-fold ensemble: average the predictions of all fold models.
train_fold_preds = [model.predict(X) for model in models]
preds = np.mean(train_fold_preds, axis=0)
# In-sample figure: each fold model was fitted on most of the rows in X,
# so this is not comparable to the validation RMSLE printed by cross_validate.
RMSLE = np.sqrt(mean_squared_log_error(y, np.clip(preds, 0.001, None)))
print("Train loss: ",RMSLE)

# Build the test feature frame once instead of once per model.
X_test = pre_inference(test_csv)
final_preds = np.mean([model.predict(X_test) for model in models], axis=0)

In [None]:
#pd.Series(X.columns).drop([20,17,11,18,20,14,19,16,15])

In [None]:
"""
from xgboost import plot_importance
for model in xg_reg.estimators_:
    # plot
    plot_importance(model)
"""

In [None]:
# Sanity check: mean of each target in the training data vs. mean of its test predictions.
targets = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']
for position, target_name in enumerate(targets):
    print(target_name,"Train:",y[target_name].mean(),"Test:",final_preds[:,position].mean())

In [None]:
# Every RMSLE reported in this notebook is computed on predictions clipped at 0.001;
# apply the same clip here so the submitted values are the ones that were scored and
# never fall into the range where a log-based error is undefined.
submission_targets = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']
submission_df = pd.DataFrame(np.clip(final_preds, 0.001, None), columns=submission_targets)
# Timestamps go first, matching the column order date_time, CO, benzene, NOx.
submission_df.insert(0, 'date_time', test_csv['date_time'])
submission_df.to_csv('submission.csv',index=False)

In [None]:
# Load the provided sample submission; as the cell's last expression it is displayed for a format check.
pd.read_csv(INPUT_DIR + 'sample_submission.csv')