In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

np.random.seed(2021)

In [None]:
# !pip install xlrd
!pip install openpyxl  # for loading pseudolabels in .xlsx format

Thanks to Alexander Ryzhkov for [this notebook](https://www.kaggle.com/alexryzhkov/tps-lightautoml-baseline-with-pseudolabels/notebook).

## Load data

In [None]:
# Read the competition training split and show its first rows as a sanity check.
train_data = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
train_data.head()

In [None]:
train_data.shape

In [None]:
# Read the competition test split and show its first rows as a sanity check.
test_data = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
test_data.head()

In [None]:
test_data.shape

In [None]:
# Read the sample submission file and show its first rows to see the expected layout.
sample_sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
sample_sub.head()

In [None]:
sample_sub.shape

In [None]:
# Pseudolabels taken from the true (UCI Air Quality) dataset.
# Only the rows from position 7110 onward are kept and re-indexed from 0, and the
# three measured columns are renamed to the competition target names.
# NOTE(review): presumably 7110 is where the competition test period starts in the
# UCI file - confirm the remaining rows line up one-to-one with test.csv.
pseudolabels_true = (
    pd.read_excel('/kaggle/input/air-quality-time-series-data-uci/AirQualityUCI.xlsx')
    .iloc[7110:]
    .reset_index(drop=True)
    .rename(columns={
        'CO(GT)': 'target_carbon_monoxide',
        'C6H6(GT)': 'target_benzene',
        'NOx(GT)': 'target_nitrogen_oxides',
    })
)
pseudolabels_true.head(5)

### Load pseudolabels

In [None]:
# Model-predicted pseudolabels; they are used further down as the fallback wherever the
# true measurement is not a non-negative number.
pseudolabels_preds = pd.read_csv('../input/tps-lightautoml-baseline-with-pseudolabels/lightautoml_with_pseudolabelling_kernel_version_15.csv')

In [None]:
pseudolabels_preds.shape

In [None]:
# For every target, take the true measurement when it is non-negative and fall back
# to the predicted pseudolabel otherwise (negative or NaN reading). Rows are matched
# purely by position, so all three frames must have the same length and order.
for target in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'):
    measured = pseudolabels_true[target].values
    predicted = pseudolabels_preds[target].values
    test_data[target] = np.where(measured >= 0, measured, predicted)

test_data.head(5)

In [None]:
# Stack the training rows and the pseudolabelled test rows into a single frame
# (train first, then test) so the shift-based features computed on ALL_DF can reach
# across the train/test boundary; the later positional split relies on this order.
ALL_DF = pd.concat([train_data, test_data]).reset_index(drop = True)
print(ALL_DF.shape)

### Some feature engineering

Thanks to Remek Kinas for [his notebook](https://www.kaggle.com/remekkinas/mljar-code-minimal).

In [None]:
# Feature engineering func from Remek Kinas kernel with MLJAR (https://www.kaggle.com/remekkinas/mljar-code-minimal) - do not forget to upvote his kernel
    
import math

def pb_add(X):
    """Add calendar, seasonal and neighbouring-row sensor features to ``X`` in place.

    ``X`` must have a datetime-typed ``date_time`` column plus the columns
    ``deg_C``, ``relative_humidity``, ``absolute_humidity`` and
    ``sensor_1`` .. ``sensor_5``.

    Columns added (in this order):
      * ``day``: weekday number taken from ``date_time.dt.weekday``.
      * ``is_odd``: True where ``sensor_4 < 646`` and ``absolute_humidity < 0.238``.
      * ``f{k}s`` / ``f{k}c`` for k = 1..4: sine / cosine of the whole days elapsed
        since the earliest timestamp, with a period of ``365 * k`` days.
      * ``fh{k}s`` / ``fh{k}c`` for k = 1..3: sine / cosine of the seconds component
        of that elapsed time, divided by a period of ``k`` days.
      * ``{sensor}_{n}b`` and ``{sensor}_{n}f`` for n in 1, 4, 24, 168: difference
        between a neighbouring row and the current row, with missing differences
        filled with 0.

    Returns:
        The same DataFrame object that was passed in (mutated).
    """
    X['day'] = X.date_time.dt.weekday
    X['is_odd'] = (X['sensor_4'] < 646) & (X['absolute_humidity'] < 0.238)

    # Series.min() is vectorised and skips NaT. The builtin min() would return NaT
    # whenever the first timestamp is NaT, turning every seasonal feature into NaN.
    diff = X['date_time'] - X['date_time'].min()

    # Long-period harmonics driven by the number of whole days elapsed.
    trend = diff.dt.days
    for years in (1, 2, 3, 4):
        angle = 2 * math.pi * trend / (365 * years)
        X[f'f{years}s'] = np.sin(angle)
        X[f'f{years}c'] = np.cos(angle)

    # NOTE(review): `.dt.seconds` is the within-day seconds component (0..86399), not
    # the total elapsed seconds, so the 2- and 3-day variants never run through a full
    # cycle. Confirm whether `.dt.total_seconds()` was intended before changing it.
    seconds = diff.dt.seconds
    for days in (1, 2, 3):
        angle = seconds * 2 * math.pi / (3600 * 24 * days)
        X[f'fh{days}s'] = np.sin(angle)
        X[f'fh{days}c'] = np.cos(angle)

    sensor_features = [
        'deg_C',
        'relative_humidity', 'absolute_humidity',
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

    # Row offsets: 1, 4, 24 and 7 * 24 rows.
    lags = [-1, -4, -24, -7 * 24]
    for sensor_feature in sensor_features:
        this = X[sensor_feature]

        # shift() with a negative period pulls LATER rows up, so the `b` columns hold
        # "value n rows later minus current value".
        # NOTE(review): the b/f suffixes look swapped relative to the shift direction;
        # the names are kept unchanged so existing consumers of the columns still work.
        for lag in lags:
            X[f'{sensor_feature}_{abs(lag)}b'] = (this.shift(lag) - this).fillna(0)

        # shift() with a positive period pushes EARLIER rows down, so the `f` columns
        # hold "value n rows earlier minus current value".
        for lag in lags:
            X[f'{sensor_feature}_{abs(lag)}f'] = (this.shift(-lag) - this).fillna(0)

    return X

# Parse the timestamps once; every calendar flag below is derived from them.
ALL_DF['date_time'] = pd.to_datetime(ALL_DF['date_time'])
stamps = ALL_DF['date_time'].dt

# 1 for hours 8..20 inclusive, 0 otherwise.
ALL_DF['working_hours'] = stamps.hour.between(8, 20).astype('int')
# 1 on Saturday/Sunday (dayofweek 5 or 6).
ALL_DF['is_weekend'] = stamps.dayofweek.ge(5).astype('int')
# Minutes elapsed since midnight.
ALL_DF['hr'] = stamps.hour * 60 + stamps.minute
# 1 on Saturdays only.
ALL_DF['satday'] = stamps.weekday.eq(5).astype('int')
# Ratio of absolute humidity (x100) to relative humidity.
ALL_DF['SMC'] = ALL_DF['absolute_humidity'].mul(100).div(ALL_DF['relative_humidity'])

# Adds the seasonal and neighbouring-row sensor features in place.
pb_add(ALL_DF)

# Cast the timestamps back to strings once the datetime-based features are built.
ALL_DF['date_time'] = ALL_DF['date_time'].astype(str)

In [None]:
ALL_DF.head()

In [None]:
def create_target_feats(df, lags=(1, 4, 24, 7 * 24),
                        targets=('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')):
    """Add shifted-target features to ``df`` in place.

    For every ``lag`` in ``lags`` and every target column ``t`` in ``targets`` four
    columns are added:
      * ``{t}_lag_{lag}``: ``t`` shifted down by ``lag`` rows (value ``lag`` rows earlier).
      * ``{t}_lag_m{lag}``: ``t`` shifted up by ``lag`` rows (value ``lag`` rows later).
      * ``diff_{t}_{lag}``: later value minus earlier value.
      * ``div_{t}_{lag}``: later value divided by earlier value. This is infinite where
        the earlier value is 0 and the later one is not; it is left as-is.

    Rows without a neighbour at the requested distance get NaN. The defaults reproduce
    the original hard-coded lags and target names.

    Args:
        df: DataFrame containing every column named in ``targets``; mutated in place.
        lags: Iterable of positive row offsets.
        targets: Iterable of target column names.

    Returns:
        None. The features are written into ``df``.
    """
    for lag in lags:
        for t in targets:
            earlier = df[t].shift(lag)
            later = df[t].shift(-lag)
            df[f'{t}_lag_{lag}'] = earlier
            df[f'{t}_lag_m{lag}'] = later
            df[f'diff_{t}_{lag}'] = later - earlier
            df[f'div_{t}_{lag}'] = later / earlier
create_target_feats(ALL_DF)

## Split data into training and testing sets

In [None]:
# The test rows were appended after the training rows, so the split is positional:
# everything before the last len(test_data) rows is training data.
n_train_rows = len(ALL_DF) - len(test_data)
train_data = ALL_DF.iloc[:n_train_rows, :]
test_data = ALL_DF.iloc[n_train_rows:, :]
print(train_data.shape, test_data.shape)
train_data.tail(5)

In [None]:
test_data.tail()

## Prophet setup

In [None]:
from fbprophet import Prophet


# Add exogenous regressors
def add_regressors(m, train):
    """Register every column of ``train`` except 'ds' and 'y' as a regressor on ``m``.

    Args:
        m: Object exposing ``add_regressor(name)`` (the Prophet model being set up).
        train: DataFrame whose column names are registered.
    """
    for column in train.columns:
        if column in ("ds", "y"):
            continue
        m.add_regressor(column)

def do_prophet(train, future):
    """Fit a Prophet model on ``train`` and return its forecast for ``future``.

    The model uses daily seasonality only (yearly and weekly are switched off). Every
    column of ``train`` other than 'ds' and 'y' is registered as an extra regressor,
    so ``future`` must provide those columns as well.

    Args:
        train: DataFrame with 'ds', 'y' and the regressor columns.
        future: DataFrame with 'ds' and the same regressor columns to predict on.

    Returns:
        The DataFrame produced by ``Prophet.predict``.
    """
    model = Prophet(
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=True,
        changepoint_prior_scale=0.1,
        seasonality_prior_scale=10,
    )
    add_regressors(model, train)
    model.fit(train)
    forecast = model.predict(future)
    # Both plots are drawn only for their side effect; the figures are not returned.
    model.plot(forecast)
    model.plot_components(forecast)
    return forecast

### One variable behaves differently from a certain date onward. Which one? Train only from that date on.

In [None]:
# Nitrogen oxides: rename the timestamp to 'ds' and the target to 'y' for Prophet,
# fill the remaining NaNs with 0 and forecast the test rows.
train_NO, test_NO = (
    frame.rename(columns={'date_time': 'ds', 'target_nitrogen_oxides': 'y'})
    for frame in (train_data, test_data)
)
pred_NO = do_prophet(train_NO.fillna(0), test_NO.fillna(0))

## Obtain predictions for each target

In [None]:
# Carbon monoxide: rename the timestamp to 'ds' and the target to 'y' for Prophet,
# fill the remaining NaNs with 0 and forecast the test rows.
train_CO, test_CO = (
    frame.rename(columns={'date_time': 'ds', 'target_carbon_monoxide': 'y'})
    for frame in (train_data, test_data)
)

pred_CO = do_prophet(train_CO.fillna(0), test_CO.fillna(0))

In [None]:
# Benzene: rename the timestamp to 'ds' and the target to 'y' for Prophet,
# fill the remaining NaNs with 0 and forecast the test rows.
train_B, test_B = (
    frame.rename(columns={'date_time': 'ds', 'target_benzene': 'y'})
    for frame in (train_data, test_data)
)

pred_B = do_prophet(train_B.fillna(0), test_B.fillna(0))

## Gather predictions for submission

In [None]:
pred_CO.loc[:,["ds", "yhat"]].head()

In [None]:
# Keep only the timestamp and the point forecast of each Prophet output, renamed to
# the submission column names.
CO_subm = pred_CO[["ds", "yhat"]].rename(
    columns={"ds": "date_time", "yhat": "target_carbon_monoxide"})

B_subm = pred_B[["ds", "yhat"]].rename(
    columns={"ds": "date_time", "yhat": "target_benzene"})

NO_subm = pred_NO[["ds", "yhat"]].rename(
    columns={"ds": "date_time", "yhat": "target_nitrogen_oxides"})

In [None]:
CO_subm.head(2)

In [None]:
B_subm.head(2)

In [None]:
NO_subm.head(2)

In [None]:
# Join the three per-target forecasts on their shared timestamp (inner join).
subm = (
    CO_subm
    .merge(B_subm, on='date_time')
    .merge(NO_subm, on='date_time')
)

In [None]:
subm.head(3)

In [None]:
print(subm.isnull().sum().sum())

The target values cannot be negative, so count the negative predictions and then clip them to zero.

In [None]:
# Print, one line per target, how many predictions are below zero.
for target in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'):
    print((subm[target] < 0).sum())

In [None]:
# Replace every prediction that is not >= 0 (negative or NaN) with 0, for each of the
# three targets - benzene and nitrogen oxides need the clip just as carbon monoxide does.
for target_col in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'):
    predicted = subm[target_col].values
    subm[target_col] = np.where(predicted >= 0, predicted, 0)

In [None]:
# Write the merged forecasts to the submission CSV, without the DataFrame index column.
subm.to_csv('prophet_pseudolabels_featEng_submission.csv', index=False)