In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, input_files in os.walk('/kaggle/input'):
    for input_name in input_files:
        print(os.path.join(dirname, input_name))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Function creation for feature engineering / evaluation

In [None]:
# Meteorological feature engineering functions, all inspired by MetPy
def sat_vapor_pressure(temperature):
    """Return the saturation vapour pressure for ``temperature``.

    Evaluates ``6.112 * exp(17.67 * T / (T + 243.5))``.
    NOTE(review): the constants match the Bolton (1980) formula, which takes
    degrees Celsius and yields hPa/mb -- confirm the units of the input column.
    Works element-wise on scalars, NumPy arrays and pandas Series.
    """
    #ref: http://wwwdca.iag.usp.br/material/hallak/AGM-5716/Artigo_Bolton1980/Bolton-MWR-1980.pdf
    exponent = (17.67 * temperature) / (temperature + 243.5)
    return 6.112 * np.exp(exponent)

def dewpoint(vapour_pressure):
    """Return the dew point for a given vapour pressure.

    Computes ``243.5 * ln(e / 6.112) / (17.67 - ln(e / 6.112))``, i.e. the
    algebraic inverse of the expression used in ``sat_vapor_pressure``.
    Works element-wise on scalars, NumPy arrays and pandas Series.
    """
    log_ratio = np.log(vapour_pressure / 6.112)
    return (243.5 * log_ratio) / (17.67 - log_ratio)

def dewpoint_rh(rh,temp):
    """Return the dew point from relative humidity and temperature.

    ``rh`` is treated as a percentage (it is divided by 100) and scales the
    saturation vapour pressure at ``temp``; the resulting vapour pressure is
    then converted to a dew point.
    """
    saturation_pressure = sat_vapor_pressure(temp)
    actual_pressure = rh / 100 * saturation_pressure
    return dewpoint(actual_pressure)

In [None]:
def SLE(pred, actual):
    """Return the element-wise squared log error, ``(ln(pred + 1) - ln(actual + 1)) ** 2``."""
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    return (log_pred - log_actual) ** 2
def RMSLE(series_sle):
    """Return the square root of the mean of the given squared log errors."""
    mean_sle = np.mean(series_sle)
    return np.sqrt(mean_sle)
def m_RMSLE(model, X, y):
    """Return the RMSLE of ``model.predict(X)`` measured against ``y``."""
    predictions = model.predict(X)
    return RMSLE(SLE(predictions, y))

## Preliminary data work

In [None]:
# Load the competition train and test sets.
train = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/test.csv")

# Keep the test timestamps; they become the first column of the submission.
test_labs = test["date_time"]

# Discard the final training row.
# NOTE(review): presumably it overlaps the first test timestamp -- confirm.
train = train.drop(index=train.tail(1).index)


In [None]:
# Normalise the targets by replacing each one with log1p(target).
# NOTE(review): the original note called the targets "left skewed"; log1p is
# normally applied to right-skewed data -- confirm against the distributions.
y = ["target_carbon_monoxide","target_benzene","target_nitrogen_oxides"]

# All three target columns are transformed in a single vectorised assignment.
train[y] = np.log1p(train[y])


In [None]:
# Stack train and test so that time-based feature engineering sees one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# original row labels are kept (no ignore_index), exactly as append did, and
# the target columns are NaN for the test rows.
stacked_df = pd.concat([train, test])

# Parse the timestamp strings into datetime64 values.
stacked_df["date_time"] = pd.to_datetime(stacked_df["date_time"], format="%Y-%m-%d %H:%M:%S")


## Engineering features from the time and temperature data led to diminished accuracy on the test data set,
# possibly as a result of overfitting, so the features below are left commented out.


#stacked_df["season"] = (stacked_df["date_time"].dt.month%12 // 3 +1).astype("float")
#stacked_df["month"] = (stacked_df["date_time"].dt.month).astype("float")

#stacked_df["is_weekend"] = (stacked_df["date_time"].dt.dayofweek > 4).astype("float")
#stacked_df["day_of_week"] = (stacked_df["date_time"].dt.dayofweek).astype("float")
#stacked_df["week"] = (stacked_df["date_time"].dt.week).astype("float")

#stacked_df["hour"] = (stacked_df["date_time"].dt.hour).astype("float")

#stacked_df['saturated_vapour_pressure'] = sat_vapor_pressure(stacked_df["deg_C"])
#stacked_df["dewpoint"] = dewpoint_rh(stacked_df["relative_humidity"],stacked_df["deg_C"])

In [None]:
# Display the column names of the stacked train + test frame.
stacked_df.columns

In [None]:
# Sensor columns that each receive a derived " trend" feature.
sensor_columns = [
    "sensor_5",
    "sensor_4",
    "sensor_3",
    "sensor_2",
    "sensor_1",
]

In [None]:
# Add one "<sensor> trend" column per sensor so the model can pick up trends in
# the target levels. Each value is the reading from 6 rows earlier minus the
# current reading (rows are presumably hourly -- TODO confirm). The first 6
# rows have no earlier reading, so the column mean stands in for it.
for sensor in sensor_columns:
    current = stacked_df[sensor]
    lagged = current.shift(periods=6).fillna(np.mean(current))
    # Every new column is inserted at position 4 of the frame.
    stacked_df.insert(4, sensor + " trend", lagged - current)

In [None]:
# Pull the three target columns out of the stacked frame and define the column
# lists that can be dropped from the feature matrices.
y = stacked_df.loc[:, ["target_carbon_monoxide","target_benzene","target_nitrogen_oxides"]]

# Drops the timestamp and every target.
drop_TB = [
    "date_time",
    "target_benzene",
    "target_carbon_monoxide",
    "target_nitrogen_oxides",
]
# Drops the timestamp and all targets except benzene.
drop = [
    "date_time",
    "target_carbon_monoxide",
    "target_nitrogen_oxides",
]
# Drops the timestamp and the nitrogen-oxides target only.
drop_NO = [
    "date_time",
    "target_nitrogen_oxides",
]

In [None]:
# Build one feature frame per target. All three drop the same columns
# (timestamp plus every target), so they start out with identical content
# while remaining independent objects.
no_df, tb_df, stacked_df = [stacked_df.drop(columns=drop_TB) for _ in range(3)]

In [None]:
# The stacked frames hold the training rows first, followed by the test rows.
# The boundary is taken from the training frame instead of being hard-coded,
# so the split stays correct if the number of training rows changes.
n_train = len(train)

# Carbon monoxide: labelled rows, then the rows to predict for the submission.
Xcm = stacked_df.iloc[:n_train]
valid_Xcm = stacked_df.iloc[n_train:]

# Nitrogen oxides.
Xno = no_df.iloc[:n_train]
valid_Xno = no_df.iloc[n_train:]

# Benzene.
Xtb = tb_df.iloc[:n_train]
valid_Xtb = tb_df.iloc[n_train:]

# Targets exist only for the training rows.
y = y.iloc[:n_train]

In [None]:
# One target Series per pollutant: carbon monoxide, nitrogen oxides, benzene.
cm, no, tb = (
    y[target_name]
    for target_name in (
        "target_carbon_monoxide",
        "target_nitrogen_oxides",
        "target_benzene",
    )
)

In [None]:
# Random hold-out split per target, using train_test_split's default test
# fraction. The same random_state is used for all three splits.
(Xtrain_tb, Xtest_tb,
 ytrain_tb, ytest_tb) = train_test_split(Xtb, tb, random_state=4)

(Xtrain_cm, Xtest_cm,
 ytrain_cm, ytest_cm) = train_test_split(Xcm, cm, random_state=4)

(Xtrain_no, Xtest_no,
 ytrain_no, ytest_no) = train_test_split(Xno, no, random_state=4)

## Model generation

Using the tried-and-tested Kaggle strategy of averaging the predictions of multiple models.

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

In [None]:
# CatBoost Pool objects: for each target, a training pool (``*_t``) and a
# hold-out pool (``test_*``) used as the evaluation set during fitting.
tb_t = Pool(data=Xtrain_tb, label=ytrain_tb)
test_tb = Pool(data=Xtest_tb, label=ytest_tb)

no_t = Pool(data=Xtrain_no, label=ytrain_no)
test_no = Pool(data=Xtest_no, label=ytest_no)

cm_t = Pool(data=Xtrain_cm, label=ytrain_cm)
test_cm = Pool(data=Xtest_cm, label=ytest_cm)



In [None]:
# The four regressors whose predictions are averaged for every target.
rf = RandomForestRegressor(random_state=4)
cbr = CatBoostRegressor(
    random_state=4,
    depth=6,
    learning_rate=0.05,
)
gbm = GradientBoostingRegressor(random_state=4)
xgd = xgb.XGBRegressor(
    n_estimators=600,
    n_jobs=8,
    learning_rate=0.1,
)

In [None]:
def m_feat_importance(model,dataframe):
    """Return a two-column frame pairing feature names with model importances.

    ``cols`` holds ``dataframe.columns`` and ``imp`` holds
    ``model.feature_importances_``; the two must have the same length.
    """
    importance_table = {
        "cols": dataframe.columns,
        "imp": model.feature_importances_,
    }
    return pd.DataFrame(importance_table)

In [None]:
from sklearn.metrics import mean_absolute_error

# Quick sanity check: fit a random forest on the carbon-monoxide train split
# and report the mean absolute error on the hold-out split. The targets were
# log1p-transformed earlier, so the error is on that scale.
# The forest is seeded like the other models; an unseeded instance here would
# make this score and every later `rf` prediction vary between runs.
rf = RandomForestRegressor(random_state=4)
rf.fit(Xtrain_cm, ytrain_cm)
predictions = rf.predict(Xtest_cm)
print("mean ab error: " + str(mean_absolute_error(ytest_cm, predictions)))

In [None]:
#predictions for tb
# Each model's benzene predictions (on the log1p scale used for training) go
# into one column of tb_preds; the columns are averaged and mapped back to the
# original scale with expm1.

# Random forest and gradient boosting are fit on every labelled row.
rf.fit(Xtb, tb)
random_forest_preds = rf.predict(valid_Xtb)
tb_preds = pd.DataFrame({"rf": random_forest_preds})

gbm.fit(Xtb, tb)
tb_preds["gbm"] = gbm.predict(valid_Xtb)

# CatBoost is fit on the train split with the hold-out pool as evaluation set;
# use_best_model keeps the iteration that scores best on that pool.
cbr.fit(tb_t, eval_set=test_tb, use_best_model=True, silent=True)
tb_preds["cb"] = cbr.predict(valid_Xtb)

# XGBoost is fit on the train split; the hold-out split is passed only as an
# evaluation set.
xgr = xgd.fit(Xtrain_tb, ytrain_tb, eval_set=[(Xtest_tb, ytest_tb)], verbose=False)
tb_preds["xgb"] = xgr.predict(valid_Xtb)

# Average across models, then undo the log1p transform.
mean_preds_tb = tb_preds.mean(axis=1)
final_preds_tb = np.expm1(mean_preds_tb)

In [None]:
# Display the per-model benzene predictions (still on the log1p scale).
tb_preds

In [None]:
#predictions for cm
# Each model's carbon-monoxide predictions (on the log1p scale used for
# training) go into one column of cm_preds; the columns are averaged and mapped
# back to the original scale with expm1.

# Random forest and gradient boosting are fit on every labelled row.
rf.fit(Xcm, cm)
random_forest_preds = rf.predict(valid_Xcm)
cm_preds = pd.DataFrame({"rf": random_forest_preds})

gbm.fit(Xcm, cm)
cm_preds["gbm"] = gbm.predict(valid_Xcm)

# CatBoost is fit on the train split with the hold-out pool as evaluation set;
# use_best_model keeps the iteration that scores best on that pool.
cbr.fit(cm_t, eval_set=test_cm, use_best_model=True, silent=True)
cm_preds["cb"] = cbr.predict(valid_Xcm)

# XGBoost is fit on the train split; the hold-out split is passed only as an
# evaluation set.
xgr = xgd.fit(Xtrain_cm, ytrain_cm, eval_set=[(Xtest_cm, ytest_cm)], verbose=False)
cm_preds["xgb"] = xgr.predict(valid_Xcm)

# Average across models, then undo the log1p transform.
mean_preds_cm = cm_preds.mean(axis=1)
final_preds_cm = np.expm1(mean_preds_cm)

In [None]:
# Display the per-model carbon-monoxide predictions (still on the log1p scale).
cm_preds

In [None]:
#predictions for no
# Each model's nitrogen-oxides predictions (on the log1p scale used for
# training) go into one column of no_preds; the columns are averaged and mapped
# back to the original scale with expm1.

# Random forest and gradient boosting are fit on every labelled row.
rf.fit(Xno, no)
random_forest_preds = rf.predict(valid_Xno)
no_preds = pd.DataFrame({"rf": random_forest_preds})

gbm.fit(Xno, no)
no_preds["gbm"] = gbm.predict(valid_Xno)

# CatBoost is fit on the train split with the hold-out pool as evaluation set;
# use_best_model keeps the iteration that scores best on that pool.
cbr.fit(no_t, eval_set=test_no, use_best_model=True, silent=True)
no_preds["cb"] = cbr.predict(valid_Xno)

# XGBoost is fit on the train split; the hold-out split is passed only as an
# evaluation set.
xgr = xgd.fit(Xtrain_no, ytrain_no, eval_set=[(Xtest_no, ytest_no)], verbose=False)
no_preds["xgb"] = xgr.predict(valid_Xno)

# Average across models, then undo the log1p transform.
mean_preds_no = no_preds.mean(axis=1)
final_preds_no = np.expm1(mean_preds_no)

In [None]:
# Display the averaged nitrogen-oxides predictions after the expm1 back-transform.
final_preds_no

In [None]:
# Assemble the submission: test timestamps plus one averaged, back-transformed
# prediction column per target.
output = pd.DataFrame(
    {
        'date_time': test_labs,
        'target_carbon_monoxide': final_preds_cm,
        'target_benzene': final_preds_tb,
        'target_nitrogen_oxides': final_preds_no,
    }
)

In [None]:
"""output['target_nitrogen_oxides'] = output['target_nitrogen_oxides']+ (300-output['target_nitrogen_oxides']).mean()
output['target_carbon_monoxide'] = output['target_carbon_monoxide']+ (2-output['target_carbon_monoxide']).mean()
output['target_benzene'] = output['target_benzene']+ (10-output['target_benzene']).mean()"""

In [None]:
# Preview the first rows of the submission frame before writing it out.
output.head()

In [None]:
# Write the submission to the working directory without the index column.
output.to_csv("avg_pred.csv", index=False)
print("Great success!")