# Tabular Playground Series - Jul 2021

In [None]:
import numpy as np
import pandas as pd 
import os, gc
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error

# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use the new name when this
# install provides it and fall back to the legacy name otherwise.
_grid_style = "seaborn-v0_8-whitegrid"
if _grid_style not in plt.style.available:
    _grid_style = "seaborn-whitegrid"
plt.style.use(_grid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc("axes",
       labelweight="bold",
       labelsize="large",
       titleweight="bold",
       titlesize=14,
       titlepad=10,)
# Shared keyword arguments for line plots (grey line, dark point markers,
# no legend).
plot_params = dict(color="0.75",
                   style=".-",
                   markeredgecolor="0.25",
                   markerfacecolor="0.25",
                   legend=False,)

In [None]:
# Directory holding the competition CSV files.
BASE_PATH = Path('/kaggle/input/tabular-playground-series-jul-2021')

# Read the train and test tables.
train_df, test_df = (
    pd.read_csv(BASE_PATH / csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Column groups used throughout the notebook.
targets = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]
sensors = [
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]
other_cols = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
]

In [None]:
# Convert the timestamp column of both frames to datetime64.
for frame in (train_df, test_df):
    frame['date_time'] = pd.to_datetime(frame['date_time'])

# Sanity check: print each frame's row count next to the number of hourly
# steps in the period it is expected to span.
# NOTE(review): recent pandas releases deprecate the 'H' alias in favour of
# 'h' - confirm against the installed version before changing it.
expected_train_hours = pd.date_range(start='2010-03-10 18', end='2011-01-01 00', freq='H')
expected_test_hours = pd.date_range(start='2011-01-01 00', end='2011-04-04 14', freq='H')
print(len(train_df), len(expected_train_hours))
print(len(test_df), len(expected_test_hours))
del expected_train_hours, expected_test_hours

# Calendar features derived from the timestamp: month first, then hour.
for part in ('month', 'hour'):
    train_df[part] = getattr(train_df['date_time'].dt, part)
    test_df[part] = getattr(test_df['date_time'].dt, part)

In [None]:
# Add a running total of every target to train_df, then draw one line plot
# of that running total against time per target.
cumsum_names = []
for target in targets:
    running_name = f'{target}_cumsum'
    train_df[running_name] = train_df[target].cumsum()
    cumsum_names.append(running_name)

for running_name in cumsum_names:
    sns.lineplot(x='date_time', y=running_name, data=train_df)
    plt.show()

In [None]:
# NOTE: this cell repeats the preceding one - it recomputes the same
# '<target>_cumsum' columns and redraws the same plots.
for target_name in targets:
    train_df[f'{target_name}_cumsum'] = train_df[target_name].cumsum()

for target_name in targets:
    y_column = f'{target_name}_cumsum'
    sns.lineplot(data=train_df, x='date_time', y=y_column)
    plt.show()

In [None]:
# Names of the running-total target columns.
targets_cumsum = [f'{name}_cumsum' for name in targets]

# Stack the training rows (without raw targets) on top of the test rows,
# skipping the first test row, so the window and running-total features
# below are computed across the train/test boundary.
# The stacked frame keeps the original row labels of both inputs, so labels
# repeat; only positional slicing is used on it afterwards.
n_train_rows = len(train_df)
train_features_only = train_df.drop(columns=targets)
all_df = pd.concat([train_features_only, test_df.iloc[1:]])

for col in sensors + other_cols:
    signal = all_df[col]
    all_df[f'{col}_1D_mean'] = signal.rolling(24).mean()       # 24-row window
    all_df[f'{col}_7D_mean'] = signal.rolling(24 * 7).mean()   # 168-row window
    all_df[f'{col}_cumsum'] = signal.cumsum()

# The first n_train_rows rows are the training part; put the raw targets back.
train_part = all_df.iloc[:n_train_rows]
train_df = pd.concat([train_part, train_df[targets]], axis=1)

# The test part starts one row earlier, i.e. at the last training row.
test_df = all_df.iloc[n_train_rows - 1:].reset_index(drop=True)
train_df.head(3)

In [None]:
from sklearn.linear_model import LinearRegression

# A date-based hold-out (rows before 2010-10-01 for fitting, rows from that
# date on for validation) was sketched here but is disabled; the model is fit
# on the whole training frame instead.

# Model inputs: the running totals of every sensor column plus the running
# totals of the three weather columns.
sensor_cumsum_cols = [
    name for name in train_df.columns.tolist()
    if 'sensor' in name and 'cumsum' in name
]
weather_cumsum_cols = [
    'deg_C_cumsum',
    'relative_humidity_cumsum',
    'absolute_humidity_cumsum',
]
training_features = sensor_cumsum_cols + weather_cumsum_cols

# Design matrices and the multi-output target (running totals of the targets).
X_train = train_df[training_features]
X_test = test_df[training_features]
y_train = train_df[targets_cumsum].values

In [None]:
# Fit one linear regression on all running-total targets at once and predict
# the running totals for the test rows. fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
sub_preds = model.predict(X_test)

In [None]:
# Store each column of the prediction matrix under its running-total name.
for column_index, cumsum_name in enumerate(targets_cumsum):
    test_df[cumsum_name] = sub_preds[:, column_index]

# Per target: training running total and predicted running total on one axis.
for cumsum_name in targets_cumsum:
    for frame in (train_df, test_df):
        sns.lineplot(x='date_time', y=cumsum_name, data=frame)
    plt.legend(['train', 'test'])
    plt.show()

In [None]:
# https://codereview.stackexchange.com/questions/117183/extracting-original-values-from-cumulative-sum-values
def inverse_cumsum(x_cumsum):
    """Recover the original values from their cumulative sum.

    Element 0 of the result equals ``x_cumsum[0]``; every later element is
    the difference between consecutive entries, so
    ``inverse_cumsum(np.cumsum(x))`` gives back ``x`` (up to floating-point
    rounding).

    Parameters
    ----------
    x_cumsum : array-like
        One-dimensional sequence of cumulative sums. May be empty.

    Returns
    -------
    numpy.ndarray
        Array of the same length and dtype as the input; an empty input
        yields an empty array instead of raising.
    """
    values = np.asarray(x_cumsum)
    # Prepend a zero of the input's own dtype so the first difference is
    # values[0] - 0 and integer/float32 inputs are not upcast.
    return np.diff(values, prepend=values.dtype.type(0))
    
# Print, for every running-total target column of train_df, the values
# obtained by undoing the running total.
for cumsum_name in targets_cumsum:
    running_total = train_df[cumsum_name].values
    print(cumsum_name, inverse_cumsum(running_total))

In [None]:
# Join the training running totals (all rows except the last) with the
# predicted running totals of the test frame, so that differencing the
# predictions starts from the last retained training value.
n_train = len(train_df)
known_part = train_df.loc[:n_train - 2, targets_cumsum]
predicted_part = test_df[targets_cumsum]
x_cumsums = pd.concat([known_part, predicted_part]).values

# Undo the running total column by column and keep only the test-frame rows.
for position, target in enumerate(targets):
    per_row = inverse_cumsum(x_cumsums[:, position])
    test_df[target] = per_row[n_train - 1:]

for target in targets:
    # Row 0 of the test frame takes the last training value of the target.
    test_df.loc[0, target] = train_df[target].iloc[-1]
    # Negative predictions are clamped to zero.
    test_df[target] = test_df[target].clip(lower=0)

In [None]:
# First three rows of the training targets.
train_df.loc[:, targets].head(n=3)

In [None]:
# First three rows of the reconstructed test targets.
test_df.loc[:, targets].head(n=3)

In [None]:
# Start from the sample submission and copy in the target columns of
# test_df as 32-bit floats (assignment aligns on the row index).
sample_path = BASE_PATH / 'sample_submission.csv'
submission_df = pd.read_csv(sample_path)
for target in targets:
    submission_df[target] = test_df[target].astype(np.float32)

In [None]:
# Write the submission without the index column, then read it back and show
# the first rows as a check of what was written.
submission_path = 'submission.csv'
submission_df.to_csv(submission_path, index=False)
pd.read_csv(submission_path).head(3)

In [None]:
# Training targets (all rows except the last) followed by the test-frame
# targets, plotted against time - one figure per target.
plot_columns = ['date_time'] + targets
train_slice = train_df.loc[:len(train_df) - 2, plot_columns]
test_slice = test_df[plot_columns]
all_df = pd.concat([train_slice, test_slice], axis=0)
for target in targets:
    sns.lineplot(x='date_time', y=target, data=all_df)
    plt.show()