# Contents

[Dealing with bad indices](#bad_indices)

[Validation](#validation)

[Final models](#final_models)


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept after the current session.

train = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/train.csv",
    keep_default_na=True,
)
test = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/test.csv",
    keep_default_na=True,
)

# Column/dtype summary of both frames.
for frame in (train, test):
    frame.info()

# Number of training rows; used later to split the concatenated frame again.
train_len = len(train)

# The submission starts as a one-column frame holding the test timestamps.
submit = test["date_time"].to_frame()


In [None]:
def plot_days(days, feature):
    """Plot one 24-row window of ``train`` per requested day.

    Args:
        days: iterable of zero-based day numbers; day ``d`` covers rows
            ``24 * d`` up to (but excluding) ``24 * (d + 1)`` of ``train``.
        feature: name of the column to draw against ``date_time``.
    """
    for day in days:
        first_row = 24 * day
        one_day = train.iloc[first_row:first_row + 24]
        one_day.plot(x='date_time', y=feature, figsize=(12, 4))


# Observation: the daily curves have broadly similar shapes, but the values can
# differ significantly, even between the same days of the week.
plot_days([0, 1, 77, 78, 231, 232], 'target_carbon_monoxide')

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept (ignore_index=False), so the test rows keep
# their own labels rather than continuing after the train rows.
all_data = pd.concat([train, test], axis=0, ignore_index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#sensor_4 is poorly correlated with target_nitrogen_oxides

def corrplot():
    """Show a heatmap of Spearman correlations between the numeric training columns.

    Reads the module-level ``all_data`` and ``train_len``; only the first
    ``train_len`` rows (the training part) are used. The upper triangle,
    including the diagonal, is masked out so each pair appears once.
    """
    sns.set(font_scale=1.1)

    train_rows = all_data.iloc[:train_len]
    # Select numeric columns explicitly: recent pandas versions no longer drop
    # non-numeric columns (such as ``date_time``) inside ``corr`` and raise instead.
    numeric_rows = train_rows.select_dtypes(include=['number', 'bool'])
    correlation_train = numeric_rows.corr('spearman')

    # Non-zero entries of the upper triangle mark the cells that are hidden.
    mask = np.triu(correlation_train)

    plt.figure(figsize=(20, 20))
    sns.heatmap(correlation_train,
                annot=True,
                fmt='.1f',
                cmap='coolwarm',
                square=True,
                mask=mask,
                linewidths=1,
                cbar=False)

    plt.show()


corrplot()

In [None]:
# Training rows are the first train_len rows of the combined frame.
train = all_data.iloc[:train_len]

# sensor_1 against sensor_4. Note the group of outliers that shows up in this
# and other scatterplots.
plt.scatter(x=train['sensor_1'], y=train['sensor_4'])

In [None]:
# A row is flagged as "bad" when sensor_1 lies in [1000, 1210] and sensor_4 in [500, 700].
sensor_1_in_band = all_data['sensor_1'].between(1000, 1210)
sensor_4_in_band = all_data['sensor_4'].between(500, 700)
bad_indices = sensor_1_in_band & sensor_4_in_band

print(bad_indices.sum())

# Split the flags back into their train and test parts (by position).
train_bad_indices = bad_indices.iloc[:train_len]
test_bad_indices = bad_indices.iloc[train_len:]

n_train_bad = train_bad_indices.sum()
print(n_train_bad)  #228

# Row labels of every flagged row, in frame order (train rows first, then test rows).
bad_indices_list = [label for label, is_bad in bad_indices.items() if is_bad]

train_bad_indices_list = bad_indices_list[:n_train_bad]
test_bad_indices_list = bad_indices_list[n_train_bad:]

print(train_bad_indices_list)
print(test_bad_indices_list)

In [None]:
# A reasonable interpretation is that the bad indices mark periods when the
# sensors were not functioning properly, e.g. due to bad weather.
# Different models are trained for the 'bad' data and for the good data.

train_bad = train.loc[train_bad_indices]

train_bad.info()

sensor_columns = ['sensor_' + str(number) for number in range(1, 6)]
plotted_targets = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]

# One row of plots per sensor, one column per target; bad rows are overdrawn in red.
fig, axes = plt.subplots(5, 3, figsize=(15, 15))

for row, sensor in enumerate(sensor_columns):
    for column, target in enumerate(plotted_targets):
        panel = axes[row, column]
        panel.scatter(train[sensor], train[target])
        panel.scatter(train_bad[sensor], train_bad[target], color = 'red')

In [None]:
# Hour of day, taken from characters 11-12 of the timestamp string
# (the string is parsed below with format '%Y-%m-%d %H:%M:%S').
timestamps = all_data['date_time']
all_data['hour'] = timestamps.map(lambda stamp: int(stamp[11:13]))

all_data['date_time'] = pd.to_datetime(timestamps, format='%Y-%m-%d %H:%M:%S')

# 1 for Saturday/Sunday (dayofweek 5 or 6), 0 otherwise.
all_data['is_weekend'] = all_data["date_time"].dt.dayofweek.ge(5).astype("int")

# Including data from a week ago greatly improves model rmse.
# Probably two reasons: the strong correlation of the features with the hour
# and day of week, as well as the long sequences of bad indices. Data from
# 168 hours ago is usually good data.
sensor_offsets = [1, 168]

for sensor_number in range(1, 6):
    feature = "sensor_" + str(sensor_number)
    readings = all_data[feature]
    # Rows without enough history are filled with the first training reading.
    initial_value = train.at[0, feature]

    for off in sensor_offsets:
        lagged_name = feature + "_p" + str(off)
        all_data[lagged_name] = readings.shift(periods = off, fill_value = initial_value)


all_data.info()
all_data.head()

In [None]:
# Per-hour medians of every column, computed on the training rows only.
training_rows = all_data.iloc[:train_len]
train_by_hour = training_rows.groupby("hour")

hourly_medians = train_by_hour.median()
print(hourly_medians)

#all_data.iloc[:train_len].groupby("hour").plot(x = "hour", y = "target_carbon_monoxide", kind = "box")

In [None]:
# Target values are generally lowest at 4 am, with a bimodal distribution that
# peaks at 8 am and 7 pm, which are commute hours. Representing 4 am as 0 in the
# hour column may make the dataset more amenable to tree-based models.
#
# Working hours start from ~7 am. Representing 7 am as 0 may also be effective;
# this seems to outperform introducing an is_working_hour feature.
# The rotation below is the 7 am variant: (7 + 17) % 24 == 0.

print(all_data.head())

all_data["hour"] = (all_data["hour"] + 17) % 24

all_data.head()

In [None]:
# Split the engineered frame back into its train and test parts.
# Explicit copies are taken because both frames are modified afterwards with
# drop(..., inplace=True); doing that on a slice of all_data triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is being changed.
train = all_data.iloc[:train_len].copy()
test = all_data.iloc[train_len:].copy()

<a id="bad_indices"></a>
# Dealing with bad indices

In [None]:
targets = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]

# The models are fitted on log1p-transformed targets.
y_all = np.log1p(train.loc[:, targets])

# A naive approach to predicting benzene values at bad indices (median of the
# same hour and weekday in train) makes awful predictions; kept for reference:
#bad_benzene_pred = []

#for i in test_bad_indices_list:
#    hour = test.at[i, "hour"]
#    day = test.at[i, "date_time"].dayofweek
#    pred = train[(train["hour"] == hour) & (train["date_time"].dt.dayofweek == day)]["target_benzene"].median()
#    bad_benzene_pred.append(pred)

#print(bad_benzene_pred)

# Remove the targets and the raw timestamp from the feature frames.
train.drop(columns = targets + ["date_time"], inplace = True)
test.drop(columns = ["date_time"], inplace = True)


print(train_len)
y_all.info()

corrplot()

In [None]:
'''
def last_good_reading_at_this_hour(index):
    for i in range(index, 0, -24):
        if i not in test_bad_indices_list:
            return i
    
print(test.iloc[1666:1669])    
    
imputed = []    
lgr = []

for i in test_bad_indices_list:
    if (i + 1) not in test_bad_indices_list and (i-1) not in test_bad_indices_list:
        imputed.append("Neighbour mean")
        test.iloc[i] = (test.iloc[i + 1] + test.iloc[i - 1])/2
            
    else:
        imputed.append("Lgr")
        test.iloc[i] = test.iloc[last_good_reading_at_this_hour(i)]
        lgr.append(last_good_reading_at_this_hour(i))
            
            
print(lgr)
print(len(lgr))
'''
# Remove the target columns from the test features as well.
test.drop(columns = targets, inplace = True)

<a id="validation"></a>
# Validation

In [None]:
offset = 0
valid_frac = 5  #every 5th entry (skipping bad indices) will be reserved for validation.
                #in part, 5 is chosen to be coprime to 7 (number of days of the week)
                #and 24 (number of hours in a day)

# Sets make each membership test O(1); testing against the lists directly made
# building the two index lists quadratic in the number of training rows.
bad_train_rows = set(train_bad_indices_list)

# Every valid_frac-th row starting at offset, excluding rows flagged as bad.
valid_indices = [i for i in range(offset, train_len, valid_frac)
                 if i not in bad_train_rows]

# Every remaining row that is neither held out for validation nor flagged as bad.
excluded_rows = bad_train_rows.union(valid_indices)
train_indices = [i for i in range(train_len)
                 if i not in excluded_rows]

X_valid = train.iloc[valid_indices]
X_train = train.iloc[train_indices]

y_valid = y_all.iloc[valid_indices]
y_train = y_all.iloc[train_indices]


In [None]:
import xgboost as xgb


def cv_xgb(dmatrix):
    """Cross-validate a few XGBoost settings and print seed-averaged results.

    One regressor is configured per ``subsample`` value in ``[0.7, 0.8, 0.9]``.
    For each of them ``xgb.cv`` is run with 5 folds, RMSE metric and early
    stopping after 50 rounds, once per seed; the last row of every returned
    history is summed and the average over the seeds is printed.

    Args:
        dmatrix: the ``xgb.DMatrix`` holding features and labels to cross-validate on.
    """
    candidates = [
        xgb.XGBRegressor(colsample_bytree = 1, subsample = fraction,
                         max_depth=6, min_child_weight=0.8,
                         eta = 0.2, n_estimators=5000,
                         reg_alpha = 0.6, reg_lambda = 1.2,
                         gamma=0)
        for fraction in [0.7, 0.8, 0.9]
    ]
    seeds = [11, 22, 33]

    for model in candidates:
        final_rows = []
        for seed in seeds:
            history = xgb.cv(model.get_xgb_params(), dmatrix,
                             num_boost_round=model.get_params()['n_estimators'],
                             nfold=5, metrics='rmse', early_stopping_rounds = 50, seed = seed)
            final_rows.append(history.iloc[-1])

        # Accumulate onto the first row, then divide by the number of seeds.
        total = final_rows[0]
        for row in final_rows[1:]:
            total += row

        reported = model.get_xgb_params()
        print("Averages for subsample =", reported["subsample"], "reg_lambda =", reported["reg_lambda"], ":")
        print(total / len(final_rows))

#carbon_monoxide_dmatrix = xgb.DMatrix(data=train,label=y_carbon_monoxide)

#cv_xgb(carbon_monoxide_dmatrix)

#with few features, colsample_bytree = 1 makes sense

#consider eta 0.01 for final model 
#carbon_monoxide_model = xgb.XGBRegressor(colsample_bytree= 1, subsample = 0.8, 
#                                 max_depth=4, min_child_weight=1.7, 
#                                 eta = 0.02, n_estimators=2000,
#                                 reg_alpha=0.3, reg_lambda = 1.2, 
#                                 gamma=0)

#benzene_dmatrix = xgb.DMatrix(data=train,label=y_benzene)

#cv_xgb(benzene_dmatrix)

#benzene_model = xgb.XGBRegressor(colsample_bytree=1, subsample = 0.8, 
#                                 max_depth=3, min_child_weight=0.8, 
#                                 eta = 0.02, n_estimators=5000,
#                                 reg_alpha=0.6, reg_lambda = 1.2, 
#                                 gamma=0)

#nitrogen_oxides_dmatrix = xgb.DMatrix(data=train,label=y_nitrogen_oxides)

#cv_xgb(nitrogen_oxides_dmatrix)

#nitrogen_oxides_model = xgb.XGBRegressor(colsample_bytree=1, subsample = 0.9, 
#                                 max_depth=6, min_child_weight=0.8, 
#                                 eta = 0.02, n_estimators=5000,
#                                 reg_alpha=0.6, reg_lambda = 1.2, 
#                                 gamma=0)

#carbon_monoxide_model.fit(train, y_carbon_monoxide)
#submit['target_carbon_monoxide'] = np.expm1(carbon_monoxide_model.predict(test))

#benzene_model.fit(train, y_benzene)
#submit['target_benzene'] = np.expm1(benzene_model.predict(test))

#nitrogen_oxides_model.fit(train, y_nitrogen_oxides)
#submit['target_nitrogen_oxides'] = np.expm1(nitrogen_oxides_model.predict(test))

In [None]:
import catboost as cat

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


def cat_val(col):
    """Validate CatBoost on one target over a small grid of ``l2_leaf_reg`` values.

    For every candidate value a regressor is fitted on ``X_train`` /
    ``y_train[col]`` with ``(X_valid, y_valid[col])`` as the evaluation set and
    ``use_best_model=True``. The mean squared error on the validation set is
    printed for each candidate (note: MSE, while the model's own eval metric
    is RMSE).

    Args:
        col: name of the target column in ``y_train`` / ``y_valid``.

    Returns:
        dict mapping each tried ``l2_leaf_reg`` value to the fitted model's
        ``get_all_params()`` result. Previously the parameters were stored
        under ``col`` (so every iteration overwrote the last) and discarded.
    """
    all_params = {}

    for l in [1, 1.5, 2, 2.5, 3]:

        #sensor_1 - 5 have highest correlation with targets, but giving them
        #higher weights doesn't improve model rmse
        #feature_weights = { 'sensor_1': w,
        #                    'sensor_2': w,
        #                    'sensor_3': w,
        #                    'sensor_4': w,
        #                    'sensor_5': w,
        #            }

        params = { 'depth': 9,
                   'learning_rate': 0.05,
                   'random_strength': 1.4,
                   'l2_leaf_reg': l,
                   'grow_policy': 'SymmetricTree',
                   'eval_metric': 'RMSE',
                   #'monotone_constraints': {'sensor_2': 1}  #this makes the model perform worse for benzene
                                                             #despite perfect monotone corr with the good data
            }

        model = cat.CatBoostRegressor(**params)
        model.fit(X_train, y_train[col], verbose = False, use_best_model = True,
                  eval_set = (X_valid, y_valid[col]))

        # Key by the grid value so each candidate's parameters are kept.
        all_params[l] = model.get_all_params()
        print(col, mean_squared_error(y_valid[col], model.predict(X_valid)))

    return all_params
            

#cat_val('target_carbon_monoxide')
#cat_val('target_benzene')
#cat_val('target_nitrogen_oxides')
#
#print(all_params)

In [None]:
<a id="final_models"></a>
# Final models

In [None]:
# The models for regular rows are fitted only on training rows not flagged as bad.
good_train_rows = ~train_bad_indices
X_good = train.loc[good_train_rows]
y_good = y_all.loc[good_train_rows]

carbon_monoxide_model = cat.CatBoostRegressor(
    depth = 9, learning_rate = 0.05, random_strength = 1.6, l2_leaf_reg = 2, verbose = False)
benzene_model = cat.CatBoostRegressor(depth = 6, verbose = False)
nitrogen_oxides_model = cat.CatBoostRegressor(
    depth = 9, learning_rate = 0.05, random_strength = 1.4, l2_leaf_reg = 2, verbose = False)

carbon_monoxide_model.fit(X_good, y_good['target_carbon_monoxide'])
benzene_model.fit(X_good, y_good['target_benzene'])
nitrogen_oxides_model.fit(X_good, y_good['target_nitrogen_oxides'])

In [None]:
# Fitting the models for the bad data on the entire training set has produced
# much more accurate models than fitting them on just the bad data.

bad_carbon_monoxide_model = cat.CatBoostRegressor(depth = 8, verbose = False)

# Unsure how to deal with the bad benzene 0 values; this is simply left to
# CatBoost here. Something like imputing from non-target values with a linear
# regressor might be effective.
bad_benzene_model = cat.CatBoostRegressor(depth = 6, verbose = False)

bad_nitrogen_oxides_model = cat.CatBoostRegressor(depth = 9, verbose = False)

bad_carbon_monoxide_model.fit(train, y_all['target_carbon_monoxide'])
bad_benzene_model.fit(train, y_all['target_benzene'])
bad_nitrogen_oxides_model.fit(train, y_all['target_nitrogen_oxides'])

In [None]:
def merge(good, bad, bad_indices):
    """Interleave values for good and bad rows back into their original row order.

    Args:
        good: 1-D sequence of values for the rows where ``bad_indices`` is
            False, in row order.
        bad: 1-D sequence of values for the rows where ``bad_indices`` is
            True, in row order.
        bad_indices: boolean flags, one per output row. Any sequence that
            NumPy can convert works (list, tuple, ndarray, pandas Series).

    Returns:
        pd.Series of length ``len(bad_indices)`` with a default integer index,
        holding the next ``bad`` value at every True position and the next
        ``good`` value at every False position.

    Raises:
        ValueError: if the lengths of ``good`` / ``bad`` do not match the
            number of False / True flags.
    """
    # Convert everything to plain arrays so values are taken by position,
    # regardless of any index labels the inputs may carry.
    mask = np.asarray(bad_indices, dtype=bool)
    good_values = np.asarray(good)
    bad_values = np.asarray(bad)

    if len(good_values) != np.count_nonzero(~mask):
        raise ValueError("Number of False values in bad_indices doesn't match length of good data series")
    if len(good_values) + len(bad_values) != len(mask):
        raise ValueError("Number of True values in bad_indices doesn't match length of bad data series")

    # Let only non-empty inputs decide the result dtype, so an empty side
    # does not force an unrelated dtype onto the other side's values.
    non_empty = [values for values in (good_values, bad_values) if values.size]
    dtype = np.result_type(*non_empty) if non_empty else float

    merged = np.empty(len(mask), dtype=dtype)
    merged[mask] = bad_values
    merged[~mask] = good_values

    return pd.Series(merged)


In [None]:
# Regular test rows go to the models fitted on good data, flagged rows to the
# models fitted on the whole training set. Predictions are mapped back from
# the log1p scale with expm1.
test_good_rows = test[~test_bad_indices]
test_bad_rows = test[test_bad_indices]

carbon_monoxide_pred = np.expm1(carbon_monoxide_model.predict(test_good_rows))
benzene_pred = np.expm1(benzene_model.predict(test_good_rows))
nitrogen_oxides_pred = np.expm1(nitrogen_oxides_model.predict(test_good_rows))

bad_carbon_monoxide_pred = np.expm1(bad_carbon_monoxide_model.predict(test_bad_rows))
bad_benzene_pred = np.expm1(bad_benzene_model.predict(test_bad_rows))
bad_nitrogen_oxides_pred = np.expm1(bad_nitrogen_oxides_model.predict(test_bad_rows))

In [None]:
# Recombine the good-row and bad-row predictions of every target into test row
# order and store them as submission columns.
test_bad_flags = test_bad_indices.tolist()

prediction_pairs = {
    "target_carbon_monoxide": (carbon_monoxide_pred, bad_carbon_monoxide_pred),
    "target_benzene": (benzene_pred, bad_benzene_pred),
    "target_nitrogen_oxides": (nitrogen_oxides_pred, bad_nitrogen_oxides_pred),
}

for target_name, (good_pred, bad_pred) in prediction_pairs.items():
    submit[target_name] = merge(good_pred, bad_pred, test_bad_flags)

print(submit)

submit.to_csv('submission.csv', index=False)