# Importing Dependencies

In [None]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import sklearn.metrics as metrics
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from sklearn.linear_model import LinearRegression

In [None]:
# Output folders: one for model checkpoints, one for exported prediction CSVs.
model_save_folder = "models"
csv_folder = "csv"

In [None]:
# Create both folders up front; exist_ok=True makes re-running the cell safe.
os.makedirs(model_save_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

# Loading Data

- feature-engineered train CSV
- feature-engineered test CSV
- leaked test-target submission CSV from [this notebook](https://www.kaggle.com/junhyeok99/tps-pycaret-data-leaked)

In [None]:
# Load the train/test feature tables and the leaked-target submission file.
train_data = pd.read_csv("../input/tabular-playground-series-jul-2021-more-features/csv/train_data.csv")
test_data = pd.read_csv("../input/tabular-playground-series-jul-2021-more-features/csv/test_data.csv")
test_leaked_sub = pd.read_csv("../input/tps-pycaret-data-leaked/sub.csv")

In [None]:
# Names of the three target columns; this order is reused for target scaling
# and for the prediction columns written out later.
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

Features chosen by Lasso-regression feature selection:

In [None]:
# Input columns used for feature scaling and as model inputs.
features = ['deg_C', 'sensor_1', 'sensor_3', 'sensor_5', 'year', 'month', 'day', 'hour', 'dayofweek', 'week_of_year', 'quarter', 'weekend', 'phase', 'working_hours', 'season', 'r_rh_temp', 'deg_C_diff_0', 'deg_C_diff_2', 'deg_C_diff_6', 'relative_humidity_diff_0', 'relative_humidity_diff_6', 'absolute_humidity_diff_0', 'absolute_humidity_diff_2', 'absolute_humidity_diff_6', 'sensor_3_diff_2', 'sensor_3_diff_6', 'sensor_4_diff_7', 'sensor_4_diff_6', 'sensor_5_diff_2', 'r_rh_temp_diff_0', 'r_rh_temp_diff_2', 'r_rh_temp_diff_6', 'sensor_1_mvag_6', 'sensor_2_mvag_6', 'sensor_5_mvag_6', 'r_rh_temp_mvag_6', 'deg_C_pct_change', 'deg_C_pct_change_sign', 'relative_humidity_pct_change', 'relative_humidity_pct_change_sign', 'absolute_humidity_pct_change', 'absolute_humidity_pct_change_sign', 'sensor_1_pct_change', 'sensor_1_pct_change_sign', 'sensor_2_pct_change_sign', 'sensor_3_pct_change_sign', 'sensor_4_pct_change', 'sensor_4_pct_change_sign', 'sensor_5_pct_change', 'sensor_5_pct_change_sign', 'r_rh_temp_pct_change_sign', 'relative_humidity_quant_6', 'absolute_humidity_quant_6', 'sensor_3_quant_6', 'sensor_4_quant_6']

In [None]:
# Preview the leaked submission.
test_leaked_sub.head()

In [None]:
# Copy the leaked target values into the test frame so they can be used as
# model inputs at inference time.
# NOTE(review): assumes both frames share the same row order/index — confirm.
test_data[targets] = test_leaked_sub[targets]

In [None]:
test_data.head()

# Feature and target scaling

In [None]:
# Separate min-max scalers for the input features and for the three targets.
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

In [None]:
# Fit both scalers on the training frame only.
scaler_features.fit(train_data[features])
scaler_target.fit(train_data[targets])

In [None]:
# Overwrite the training columns in place with their scaled values.
train_data[features] = scaler_features.transform(train_data[features])
train_data[targets] = scaler_target.transform(train_data[targets])

In [None]:
train_data.head()

In [None]:
# The test columns are still unscaled at this point; they are transformed
# after the infinity check further down.
test_data.head()

### Replacing any *np.inf* values in the dataset

In [None]:
# Positions (row indices, column indices) of infinite entries in the already
# scaled training features.
np.where(np.isinf(train_data[features].values))

In [None]:
# Same check on the still unscaled test features.
np.where(np.isinf(test_data[features].values))

In [None]:
# Replace infinities anywhere in the test frame with a large finite constant.
# NOTE(review): -inf is mapped to +1e8 as well — confirm this is intended.
test_data = test_data.replace([np.inf, -np.inf], 1e8)

In [None]:
# Re-run the check after the replacement.
np.where(np.isinf(test_data[features].values))

In [None]:
# Scale the test features and the leaked targets with the scalers that were
# fitted on the training data.
test_data[features] = scaler_features.transform(test_data[features])
test_data[targets] = scaler_target.transform(test_data[targets])

In [None]:
test_data.head()

# Cross-validation setup and utilities

In [None]:
def msle(y_true, y_pred):
    """Return the square root of sklearn's mean squared log error.

    Both inputs are passed through ``np.abs`` first, so negative entries are
    scored by their magnitude. Despite the function name, the value returned
    is the root of the MSLE.
    """
    squared_log_error = metrics.mean_squared_log_error(
        np.abs(y_true), np.abs(y_pred)
    )
    return np.sqrt(squared_log_error)

In [None]:
class CrossValidation:
    """Hold-out and k-fold splitting helpers over a single DataFrame.

    Args:
        df: frame to split.
        shuffle: when exactly ``True`` the frame is shuffled once up front
            (index reset); the flag is also forwarded to the k-fold
            splitters. When falsy, ``random_state`` is discarded.
        random_state: seed used for the initial shuffle and by the splitters.
    """

    def __init__(self, df, shuffle,random_state=None):
        self.df = df
        self.random_state = random_state
        self.shuffle = shuffle
        if shuffle is True:
            self.df = df.sample(frac=1,
                random_state=self.random_state).reset_index(drop=True)
        if not shuffle:
            # A seed without shuffling is rejected by the sklearn splitters.
            self.random_state = None

    def hold_out_split(self,percent,stratify=None):
        """Split off ``percent`` % of the rows as a validation set.

        Args:
            percent: size of the validation part, in percent (0-100).
            stratify: optional column name; when given the split is done by
                ``train_test_split`` stratified on that column, otherwise the
                last ``percent`` % of the rows are taken positionally.

        Returns:
            ``(train, val)`` DataFrames.
        """
        if stratify is not None:
            y = self.df[stratify]
            train,val = ms.train_test_split(self.df, test_size=percent/100,
                stratify=y, random_state=self.random_state)
            return train,val
        size = len(self.df) - int(len(self.df)*(percent/100))
        train = self.df.iloc[:size,:]
        val = self.df.iloc[size:,:]
        return train,val

    def kfold_split(self, splits, stratify=None):
        """Yield ``(train, val)`` DataFrame pairs for each of ``splits`` folds.

        Args:
            splits: number of folds.
            stratify: optional column name to stratify the folds on.
        """
        shuffle = bool(self.shuffle)
        if stratify is not None:
            # The shuffle flag must be forwarded: sklearn raises a ValueError
            # when a random_state is supplied while shuffle is left False.
            splitter = ms.StratifiedKFold(n_splits=splits, shuffle=shuffle,
                random_state=self.random_state)
            labels = self.df[stratify]
        else:
            splitter = ms.KFold(n_splits=splits, shuffle=shuffle,
                random_state=self.random_state)
            labels = None
        for train_idx, val_idx in splitter.split(X=self.df, y=labels):
            yield self.df.iloc[train_idx, :], self.df.iloc[val_idx, :]

In [None]:
class TFSimpleDataset:
    """Factory for batched ``tf.data`` pipelines over in-memory arrays.

    Args:
        batch_size: number of samples per batch.
        repeat: when truthy the batched dataset repeats indefinitely.
        shuffle: falsy for no shuffling, an integer for an explicit shuffle
            buffer size, or ``True`` to shuffle over all samples.
        drop_remainder_in_batch: forwarded to ``Dataset.batch``.
        num_parallel_calls: stored on the instance; not used by
            ``create_dataset``.
        buffer_size: prefetch buffer size.
    """

    def __init__(self,batch_size, repeat, shuffle=False,
        drop_remainder_in_batch=False, 
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        buffer_size=tf.data.experimental.AUTOTUNE):
        self.batch_size = batch_size
        self.drop_remainder = drop_remainder_in_batch
        self.num_parallel_calls = num_parallel_calls
        self.buffer_size = buffer_size
        self.repeat = repeat
        self.shuffle = shuffle

    def create_dataset(self, X, Y=None):
        """Build the pipeline: slice -> (zip) -> shuffle -> batch -> repeat -> prefetch.

        Args:
            X: array-like of inputs, sliced along its first axis.
            Y: optional array-like of targets; when given the dataset yields
                ``(x, y)`` pairs, otherwise only ``x``.

        Returns:
            The resulting ``tf.data.Dataset``.
        """
        datasetX = tf.data.Dataset.from_tensor_slices(X)
        if Y is not None :
            datasetY = tf.data.Dataset.from_tensor_slices(Y)
            dataset = tf.data.Dataset.zip((datasetX,datasetY))
        else:
            dataset = datasetX
        if self.shuffle:
            # int(True) == 1 would give a one-element buffer, which does not
            # shuffle at all; for a plain True use the number of samples.
            if self.shuffle is True:
                shuffle_buffer = max(len(X), 1)
            else:
                shuffle_buffer = int(self.shuffle)
            dataset = dataset.shuffle(shuffle_buffer)
        dataset = dataset.batch(self.batch_size, 
            drop_remainder=self.drop_remainder)
        if self.repeat:
            dataset = dataset.repeat()
        dataset = dataset.prefetch(buffer_size=self.buffer_size)
        return dataset

In [None]:
def get_model(num_features):
    """Build and compile the feed-forward regressor.

    The stack is a linear ``Dense(num_features)`` layer, ReLU layers of 64
    and 16 units, dropout of 0.2, a ReLU layer of 8 units and a single linear
    output. It is compiled with MSE loss, the Adam optimizer and the ``mse``
    and ``msle`` metrics.

    Args:
        num_features: number of units in the first dense layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    dense = tf.keras.layers.Dense
    layer_stack = [
        dense(num_features),
        dense(64, activation="relu"),
        dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        dense(8, activation="relu"),
        dense(1),
    ]
    model = tf.keras.Sequential(layer_stack)
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=["mse", "msle"],
    )
    return model

In [None]:
def plot_history(history):
    """Plot loss (top) and mse (bottom) curves from a Keras ``History``.

    Training curves are drawn in blue and validation curves in red. A series
    is skipped when its key is missing from ``history.history`` or is empty.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists.
    """
    _, axes = plt.subplots(2, 1)
    panels = (
        (axes[0], (("loss", "Training loss"), ("val_loss", "Validation loss"))),
        (axes[1], (("mse", "Training mse"), ("val_mse", "Validation mse"))),
    )
    for axis, series in panels:
        for (key, label), color in zip(series, ("b", "r")):
            values = history.history.get(key)
            if values:
                # The axis is implied by calling plot on it; no axes= kwarg needed.
                axis.plot(values, color=color, label=label)
        axis.legend(loc='best', shadow=True)
    plt.show()

# Training model folds

In [None]:
def _score_batches(model, dataset):
    """Return (mean msle, mean r2) of ``model``, averaged over the batches of ``dataset``."""
    msle_s = []
    r2_s = []
    for batch_x, batch_y in dataset:
        pred_y = model.predict(batch_x)
        msle_s.append(msle(batch_y, pred_y))
        r2_s.append(metrics.r2_score(batch_y, pred_y))
    return np.mean(msle_s), np.mean(r2_s)


def train_loop(train_dataset, val_dataset, epochs, params, model_name, callbacks=None, verbose=True):
    """Train one model, checkpoint its best epoch and score it.

    Args:
        train_dataset: iterable of ``(X, Y)`` batches used for fitting.
        val_dataset: iterable of ``(X, Y)`` batches used for validation.
        epochs: maximum number of epochs.
        params: keyword arguments forwarded to ``get_model``.
        model_name: stem of the checkpoint file written under ``models/``.
        callbacks: optional list of extra Keras callbacks; it is not modified.
        verbose: print scores and pass the flag on to ``fit``.

    Returns:
        Dict with the fitted ``model`` and the ``train_scores`` /
        ``val_scores`` dicts (keys ``r2`` and ``msle``). The scores are means
        of per-batch values. The returned model holds the weights it had when
        ``fit`` returned; the best epoch (lowest ``val_msle``) is saved to the
        checkpoint file.
    """
    reg = get_model(**params)

    # Work on a copy: appending to the caller's list would accumulate one
    # ModelCheckpoint per call, and stale checkpoints from earlier folds would
    # keep firing and could overwrite their files with a later model's weights.
    callbacks = list(callbacks) if callbacks is not None else []

    callbacks.append(tf.keras.callbacks.ModelCheckpoint(
        os.path.join("models", f"{model_name}.h5"), monitor="val_msle", verbose=1, 
        save_best_only=True, save_freq='epoch', mode="min"
    ))
    history = reg.fit(train_dataset, epochs=epochs, 
                      validation_data=val_dataset, 
                      callbacks=callbacks, verbose=int(verbose))

    plot_history(history)

    train_msle, train_r2 = _score_batches(reg, train_dataset)
    if verbose:
        print("Training msle: ", train_msle)
        print("Training r2: ", train_r2)

    val_msle, val_r2 = _score_batches(reg, val_dataset)
    if verbose:
        print("Validation msle: ", val_msle)
        print("Validation r2: ", val_r2)

    return {"model": reg,
            "train_scores":{"r2": train_r2, "msle": train_msle},
            "val_scores":{"r2": val_r2, "msle": val_msle}
           }

In [None]:
def train_folds(cv, tf_dataset, feature_cols, target_col, not_target, num_folds, params, 
                epochs=10, callbacks=None, verbose=False):
    """Train one model per cross-validation fold and average the fold scores.

    Args:
        cv: object whose ``kfold_split(splits=...)`` yields (train, val) frames.
        tf_dataset: object whose ``create_dataset(X=..., Y=...)`` builds the
            batched datasets.
        feature_cols: list of feature column names.
        target_col: name of the column to predict.
        not_target: list of further column names appended to the inputs.
        num_folds: number of folds.
        params: keyword arguments for the model builder.
        epochs, callbacks, verbose: forwarded to ``train_loop``.

    Returns:
        Dict with the per-fold ``models`` plus fold-averaged ``train_scores``
        and ``val_scores`` (keys ``r2`` and ``msle``).
    """
    input_cols = feature_cols + not_target
    fold_results = []
    for fold, (train_, val_) in enumerate(cv.kfold_split(splits=num_folds)):
        print("Training Fold ",fold)
        train_dataset = tf_dataset.create_dataset(
            X=train_[input_cols].values, Y=train_[target_col].values)
        val_dataset = tf_dataset.create_dataset(
            X=val_[input_cols].values, Y=val_[target_col].values)

        fold_results.append(train_loop(
            train_dataset,
            val_dataset,
            epochs=epochs,
            params=params,
            model_name=f"model_{target_col}_{fold+1}",
            callbacks=callbacks,
            verbose=verbose,
        ))

    def _fold_mean(split, metric):
        # Average a single score over all folds.
        return np.mean([res[split][metric] for res in fold_results])

    return {
        "models": [res["model"] for res in fold_results],
        "train_scores": {"r2": _fold_mean("train_scores", "r2"),
                         "msle": _fold_mean("train_scores", "msle")},
        "val_scores": {"r2": _fold_mean("val_scores", "r2"),
                       "msle": _fold_mean("val_scores", "msle")},
    }

In [None]:
# Shared training hyper-parameters.
batch_size = 1024
epochs = 100
seed = 11  # NOTE(review): not referenced anywhere in the visible code
folds = 5

In [None]:
# Keyword arguments handed to get_model() for every fold model.
model_params = {"num_features": len(features)}

In [None]:
# Trained fold models, keyed by target name.
fold_models = {tar:[] for tar in targets}

In [None]:
# Dataset factory; shuffle=100 is used as the shuffle buffer size.
data_creator = TFSimpleDataset(batch_size, repeat=False, shuffle=100)

In [None]:
# One splitter instance, shared by the training runs of all three targets.
cv = CrossValidation(train_data, shuffle=True, random_state=42)

In [None]:
# Callbacks shared by every fold of every target.
callbacks = [
    # The models are compiled with optimizer="adam", whose default learning
    # rate is 1e-3. A floor of 1e-3 would leave this callback unable to lower
    # the rate at all, so the floor has to sit below the starting rate.
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=1, min_lr=1e-5, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1)
    ]

In [None]:
# Train the fold models for carbon monoxide; the remaining target columns are
# passed on as additional model inputs.
target = "target_carbon_monoxide"
# Compare strings by value: `is not` tests object identity and only works
# when the interpreter happens to share the string objects.
not_target = [t for t in targets if t != target]
results = train_folds(cv, data_creator, features, target, not_target, num_folds=folds, params=model_params, 
                      epochs=epochs, callbacks=callbacks)
fold_models[target] = results["models"]

print("="*50)
print("Training MSLE: ", results["train_scores"]["msle"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSLE: ", results["val_scores"]["msle"])
print("Validation R2: ", results["val_scores"]["r2"])

In [None]:
# Train the fold models for benzene; the remaining target columns are passed
# on as additional model inputs.
target = "target_benzene"
# Compare strings by value: `is not` tests object identity and only works
# when the interpreter happens to share the string objects.
not_target = [t for t in targets if t != target]
results = train_folds(cv, data_creator, features, target, not_target, num_folds=folds, 
                      params=model_params, 
                      epochs=epochs, callbacks=callbacks)
fold_models[target] = results["models"]

print("="*50)
print("Training MSLE: ", results["train_scores"]["msle"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSLE: ", results["val_scores"]["msle"])
print("Validation R2: ", results["val_scores"]["r2"])

In [None]:
# Train the fold models for nitrogen oxides; the remaining target columns are
# passed on as additional model inputs.
target = "target_nitrogen_oxides"
# Compare strings by value: `is not` tests object identity and only works
# when the interpreter happens to share the string objects.
not_target = [t for t in targets if t != target]
results = train_folds(cv, data_creator, features, target, not_target, num_folds=folds, 
                      params=model_params, 
                      epochs=epochs, callbacks=callbacks)
fold_models[target] = results["models"]

print("="*50)
print("Training MSLE: ", results["train_scores"]["msle"])
print("Training R2: ", results["train_scores"]["r2"])
print("Validation MSLE: ", results["val_scores"]["msle"])
print("Validation R2: ", results["val_scores"]["r2"])

In [None]:
def get_weights(predictions, targets):
    """Fit a small blending network on stacked per-fold predictions.

    Args:
        predictions: array-like with one row per sample and one column per
            fold model.
        targets: true values for the same samples.

    Returns:
        ``(model, blended)``: the fitted blender and its predictions on
        ``predictions``.
    """
    # Size the first layer from the number of prediction columns instead of a
    # hard-coded 5, so a different fold count needs no edit here. Inputs
    # without a 2-D shape keep the previous width of 5.
    shape = getattr(predictions, "shape", None)
    n_inputs = shape[1] if shape is not None and len(shape) > 1 else 5
    lnr = get_model(n_inputs)
    # Stop once the training loss stalls and roll back to the best weights.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True, 
                                                  verbose=1)]
    lnr.fit(predictions, targets, epochs=100, batch_size=128, callbacks=callbacks)
    return lnr, lnr.predict(predictions)

### load best weights

In [None]:
# Replace each fold model's current weights with the checkpoint saved for it
# during training (file name pattern: model_<target>_<fold number>.h5).
for key, models in fold_models.items():
    for fold, model in enumerate(models):
        model.load_weights(os.path.join("models", f"model_{key}_{fold+1}.h5" ))

In [None]:
# Training targets; these columns were min-max scaled in place earlier.
trainY = train_data[targets]

In [None]:
# Collects one blended prediction array per target, in `targets` order.
predictions = []

In [None]:
# Collect every fold model's training predictions for the first target and
# fit its blender on them.
preds = []
# Value comparison instead of `is not`: identity is not a reliable string test.
not_target = [t for t in targets if t != targets[0]]
trainX = train_data[features+not_target].values
for model in fold_models[targets[0]]:
    preds.append(model.predict(trainX))

preds = np.array(preds)
# transpose + squeeze turns (folds, n, 1) into (n, folds).
co_mod, preds = get_weights(preds.transpose().squeeze(), trainY[targets[0]].values)
predictions.append(preds)

In [None]:
# Collect every fold model's training predictions for the second target and
# fit its blender on them.
preds = []
# Value comparison instead of `is not`: identity is not a reliable string test.
not_target = [t for t in targets if t != targets[1]]
trainX = train_data[features+not_target].values
for model in fold_models[targets[1]]:
    preds.append(model.predict(trainX))

preds = np.array(preds)
# transpose + squeeze turns (folds, n, 1) into (n, folds).
ben_mod, preds = get_weights(preds.transpose().squeeze(), trainY[targets[1]].values)
predictions.append(preds)

In [None]:
# Collect every fold model's training predictions for the third target and
# fit its blender on them.
preds = []
# Value comparison instead of `is not`: identity is not a reliable string test.
not_target = [t for t in targets if t != targets[2]]
trainX = train_data[features+not_target].values
for model in fold_models[targets[2]]:
    preds.append(model.predict(trainX))

preds = np.array(preds)
# transpose + squeeze turns (folds, n, 1) into (n, folds).
nox_mod, preds = get_weights(preds.transpose().squeeze(), trainY[targets[2]].values)
predictions.append(preds)

In [None]:
# Arrange the per-target blended predictions as columns, one column per
# target (assuming each blender returned an (n, 1) array — TODO confirm).
predictions = np.array(predictions).transpose().squeeze()
print(predictions.shape)
print(trainY.shape)

In [None]:
# Keep magnitudes only, so no negative value reaches scoring or export.
predictions = np.abs(predictions)

In [None]:
# NOTE(review): trainY and predictions are both in min-max-scaled space here,
# so these scores are not on the original target scale.
print("R2 score: ", metrics.r2_score(trainY, predictions))
print("RMSLE score: ", msle(trainY, predictions))

In [None]:
predictions.shape

In [None]:
# Preview of the predictions mapped back through the target scaler.
scaler_target.inverse_transform(predictions)

In [None]:
# Export the (scaled) training features next to the inverse-scaled predictions.
new_df = pd.DataFrame()
new_df[features] = train_data[features]
new_df[targets] = scaler_target.inverse_transform(predictions)
new_df.to_csv(os.path.join(csv_folder, "train_predictions.csv"), index=False)

# Inference on test dataset

In [None]:
predictions = []

In [None]:
preds = []
not_target = [t for t in targets if t is not targets[0]]
testX = test_data[features+not_target].values
for model in fold_models[targets[0]]:
    preds.append(model.predict(testX))

preds = np.array(preds).transpose()

preds = co_mod.predict(preds)
predictions.append(preds)

In [None]:
preds = []
not_target = [t for t in targets if t is not targets[1]]
testX = test_data[features+not_target].values
for model in fold_models[targets[1]]:
    preds.append(model.predict(testX))

preds = np.array(preds).transpose()

preds = ben_mod.predict(preds)
predictions.append(preds)

In [None]:
preds = []
not_target = [t for t in targets if t is not targets[2]]
testX = test_data[features+not_target].values
for model in fold_models[targets[2]]:
    preds.append(model.predict(testX))

preds = np.array(preds).transpose()

preds = nox_mod.predict(preds)
predictions.append(preds)

In [None]:
# Stack the per-target predictions and move the target axis last; the
# singleton axes are dropped later by the .squeeze() calls.
predictions = np.array(predictions).transpose()

In [None]:
# Keep magnitudes only, so no negative value is written to the submission.
predictions = np.abs(predictions)

In [None]:
# The sample submission provides the row layout; its target columns are
# overwritten in the next cell.
submission_csv = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv")

In [None]:
# Map the predictions back through the target scaler and store them.
submission_csv[targets] = scaler_target.inverse_transform(predictions.squeeze())

In [None]:
# Preview of the same inverse-scaled values.
scaler_target.inverse_transform(predictions.squeeze())

In [None]:
submission_csv.to_csv("submission.csv", index=False)

In [None]:
submission_csv.head()

In [None]:
# Export the (scaled) test features next to the inverse-scaled predictions.
new_df = pd.DataFrame()
new_df[features] = test_data[features]
new_df[targets] = scaler_target.inverse_transform(predictions.squeeze())
new_df.to_csv(os.path.join(csv_folder, "test_predictions.csv"), index=False)

In [None]:
# List the saved checkpoint files (notebook shell command).
!ls models