In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition CSVs from the read-only Kaggle input directory.
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jul-2021/train.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-jul-2021/test.csv'
SAMPLE_CSV = '/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv'

# Only the training timestamps are parsed into datetimes; the test frame keeps
# date_time exactly as read (it is later copied into the submission).
train = pd.read_csv(TRAIN_CSV, parse_dates=['date_time'])
test = pd.read_csv(TEST_CSV)
sample = pd.read_csv(SAMPLE_CSV)


# Ideas
* Relative Humidity vs Absolute Humidity. The partial pressure of H2O, or the amount of H2O in the air, might be an interesting feature.
* Fourier Transform some of the data and see if that provides any interesting features
* Correlation plots vs each of the targets.


# Exploratory Data Analysis

Units for CO and benzene appear to be in ppm, based on the ranges of values reported for various cities by the EPA in the 2011-2013 time range. Nitrogen oxides are typically measured in the tens of ppb, not ppm, so the numbers may be a bit small to be ppm for nitrogen oxides. There also appears to be a difference in the amount of NO in the EPA data depending on humidity: areas with higher humidity appear to have lower amounts of NO.

In [None]:
# Summary statistics of the training frame (displayed as the cell output).
train.describe()

In [None]:
# Target names are the sample-submission columns containing 'target';
# the candidate inputs are every column of the test file.
target_mask = sample.columns.str.contains('target')
targets = list(sample.columns[target_mask])
inputs = list(test.columns)


In [None]:
# Calendar features derived from the parsed timestamp using the vectorized
# .dt accessor (avoids calling a Python lambda once per row).
timestamps = train['date_time']
train['day_name'] = timestamps.dt.dayofweek  # integer day of week (Monday=0 ... Sunday=6)
train['hour'] = timestamps.dt.hour           # hour of day
# Hour-of-week index: day of week * 24 + hour of day.
train['day_hour'] = train['day_name'] * 24 + train['hour']

In [None]:
# Pairwise plots of the hour column against each target column.
hour_target_cols = ['hour', *targets]
sns.pairplot(train[hour_target_cols])

In [None]:
# One histogram per target. Each panel gets a title and axis labels so the
# figure can be read on its own when the notebook is skimmed.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
fig, ax = plt.subplots(len(target_columns), 1, figsize=(20, 20))
for axis, column in zip(ax, target_columns):
    axis.hist(train[column], bins=20)
    axis.set(title=f'Distribution of {column}', xlabel=column, ylabel='count')
fig.tight_layout()

In [None]:
# Pairwise plots across the whole training frame, which by this point also holds
# the derived day_name / hour / day_hour columns added in an earlier cell.
sns.pairplot(train)

# Training
Starting with sensors first, since it's a smaller feature set.

In [None]:
# First pass: use only the sensor_1 .. sensor_4 columns as model inputs.
features = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4']
X, y = train[features], train[targets]

In [None]:
from sklearn.model_selection import train_test_split

# The targets are continuous, so a plain seeded random split is used here;
# iterative stratification is designed for multi-label classification targets
# and the previous split was not reproducible across runs.
# NOTE(review): test_size=0.8 holds out 80% of the rows and fits on only 20% -
# confirm this is intended rather than a swapped train/test fraction.
SPLIT_SEED = 42

# Arrays (not DataFrames) are passed so the downstream cells keep receiving
# numpy arrays, as they did before.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(), y.to_numpy(), test_size=0.8, random_state=SPLIT_SEED
)

In [None]:
# Rescale inputs and targets into the range [1, 2]; both scalers are fitted
# on the training split only.
xScaler = MinMaxScaler(feature_range=(1, 2)).fit(X_train)
yScaler = MinMaxScaler(feature_range=(1, 2)).fit(y_train)

newX = xScaler.transform(X_train)
newY = yScaler.transform(y_train)

In [None]:
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

# Scorer built from mean_squared_log_error. greater_is_better=False marks it as a
# loss, so scikit-learn negates it and the reported scores are <= 0 (closer to 0 is better).
msle_scorer = make_scorer(mean_squared_log_error,greater_is_better=False)

# Search space shared by both tuning approaches: two kernels, four values of C.
kernel_choices = ['rbf', 'sigmoid']
C_choices = [1, 10, 100, 1000]

# Plain SVR parameter names, used by the hand-written tuning loop.
parameters = [{'kernel': [name], 'C': list(C_choices)} for name in kernel_choices]

# Same grid with the 'estimator__' prefix, for an SVR wrapped in MultiOutputRegressor.
grid_parameters = [{'estimator__kernel': [name], 'estimator__C': list(C_choices)}
                   for name in kernel_choices]



In [None]:
# Grid search over kernel and C for a multi-output SVR, scored with msle_scorer
# on the scaled training targets.
clf = GridSearchCV(MultiOutputRegressor(SVR()), grid_parameters, scoring=msle_scorer)
# Alternative wiring (one grid search per target), kept for reference:
#clf = MultiOutputRegressor(GridSearchCV(SVR(), parameters, scoring=msle_scorer))

clf.fit(X=newX, y=newY)

print(clf.best_params_)

# Per-candidate report: mean CV score and twice its standard deviation.
cv_results = clf.cv_results_
for idx, params in enumerate(cv_results['params']):
    mean = cv_results['mean_test_score'][idx]
    std = cv_results['std_test_score'][idx]
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# Predict on the held-out split and map the predictions back to original target units.
y_pred_raw = yScaler.inverse_transform(clf.predict(xScaler.transform(X_test)))

# mean_squared_log_error raises on negative values, so negative predictions are
# replaced by 0 before scoring.
y_pred = np.where(y_pred_raw < 0, 0, y_pred_raw)

mean_squared_log_error(y_test, y_pred)

In [None]:
# Custom hyperparameter tuning.
# Re-scale with zero-mean / unit-variance scalers fitted on the training split;
# this replaces the scalers and scaled arrays from the grid-search section.
xScaler = StandardScaler().fit(X_train)
yScaler = StandardScaler().fit(y_train)

newX = xScaler.transform(X_train)
newY = yScaler.transform(y_train)

In [None]:
def evaluate_msle(model, x_scaler, y_scaler, X_eval, y_eval):
    """Score a fitted model on held-out data with MSLE in original target units.

    Predictions are mapped back through ``y_scaler`` and negative values are
    replaced by 0, because mean_squared_log_error raises on negative inputs.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_scaler, y_scaler : fitted scalers for the inputs and the targets.
    X_eval, y_eval : unscaled evaluation inputs and targets (numpy arrays).

    Returns
    -------
    float
        The MSLE, or ``nan`` (after printing a diagnostic) when ``y_eval``
        contains negative values and the metric cannot be computed.
    """
    raw_pred = y_scaler.inverse_transform(model.predict(x_scaler.transform(X_eval)))
    clipped_pred = np.where(raw_pred < 0, 0, raw_pred)
    if (y_eval < 0).any():
        print("Something weird happened:")
        print((y_eval < 0).any())
        print((clipped_pred < 0).any())
        # Return nan instead of leaving the caller with a stale or undefined score.
        return float('nan')
    return mean_squared_log_error(y_eval, clipped_pred)


# Fit one multi-output SVR per (kernel, C) combination and keep each fitted
# model under a '<kernel>_<C>' key so a later cell can pick the best one.
SVR_dict = dict()
for param in parameters:
    kernel = param['kernel'][0]
    for Cparam in param['C']:
        custom_reg = MultiOutputRegressor(SVR(kernel=kernel, C=Cparam))
        custom_reg.fit(newX, newY)
        SVR_dict[kernel + '_' + str(Cparam)] = custom_reg

        msle = evaluate_msle(custom_reg, xScaler, yScaler, X_test, y_test)
        print(f"{param['kernel']} with C:{Cparam} MSLE: {msle}")

# Baseline: default SVR hyperparameters, scored with the baseline model's own
# predictions (not those of the last model from the loop above).
print("No tweaks")
base = MultiOutputRegressor(SVR())
base.fit(newX, newY)
msle = evaluate_msle(base, xScaler, yScaler, X_test, y_test)
print(f"MSLE: {msle}")


In [None]:
# Debugging: refit a single rbf / C=10 model and check whether any prediction,
# once mapped back to original units, is negative.
custom_reg = MultiOutputRegressor(SVR(kernel='rbf', C=10))
custom_reg.fit(newX, newY)

scaled_X_test = xScaler.transform(X_test)
y_pred = yScaler.inverse_transform(custom_reg.predict(scaled_X_test))

(y_pred < 0).any()


In [None]:
# The best configuration in the sweep was the RBF kernel with C = 10;
# take that fitted model from the dictionary built during the sweep.
best_key = 'rbf_10'
clf = SVR_dict[best_key]

# Predictions

Ok, this is terrible and I shouldn't do it, but I'm curious to see if it makes my final predictions different. For some reason tuning the hyperparameters made for a *worse* outcome than my initial first pass. Which is super confusing to me. So I'm going to use the tuned hyperparameters with *all* of the original data.

In [None]:
# Refit the scalers and an rbf / C=10 model on the full training data
# (no held-out split), replacing the earlier scalers and model.
xScaler = StandardScaler().fit(X)
yScaler = StandardScaler().fit(y)

newX = xScaler.transform(X)
newY = yScaler.transform(y)

clf = MultiOutputRegressor(SVR(kernel='rbf', C=10))
clf.fit(newX, newY)



In [None]:

# Scale the test features, predict, and map the predictions back to target units.
testX = xScaler.transform(test[features])
testY_orig = yScaler.inverse_transform(clf.predict(testX))

# Negative predictions are replaced by 0.
is_negative = testY_orig < 0
testY = np.where(is_negative, 0, testY_orig)

In [None]:
# Assemble the submission: date_time first, then one column per target,
# indexed like the test frame.
submission_df = pd.DataFrame(testY, columns=targets, index=test.index)
submission_df.insert(0, 'date_time', test['date_time'])

In [None]:
# Write the submission to the Kaggle working directory, omitting the index column.
submission_df.to_csv('/kaggle/working/submission.csv',index=False)