**The main purpose of this notebook is to explore different types of regression models and see how they perform on the test set. As I am new to this sort of problem, I may have made a few errors or left out some important parameters during model building, so please feel free to point them out in the comments section — it will be helpful for everyone. If you have suggestions for other models to try, please comment as well.**

**Ridge -> 0.34508<br>
Lasso -> 0.33936<br>
ElasticNet -> 0.33950<br>
DecisionTreeRegressor -> 0.37486<br>
Sarima -> 0.71287<br>
XGBRegressor -> 0.21350**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three CSV files (train, test, submission template) in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

In [None]:
# Training frame: print its (rows, columns) shape, then display the first two rows.
print(train.shape)
train.head(2)

In [None]:
# Test frame: print its (rows, columns) shape, then display the first two rows.
print(test.shape)
test.head(2)

In [None]:
# Feature columns = every column of the test frame except the first one
# (presumably the first column is `date_time` — confirm against the preview above).
columns = test.columns[1:]
columns

In [None]:
# Feature matrices for train and test, restricted to the shared feature columns.
X, X_test = (frame[columns].values for frame in (train, test))

# One (n_samples, 1) column vector per pollutant target.
target_1, target_2, target_3 = (
    train[target_name].values.reshape(-1, 1)
    for target_name in (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
)

In [None]:
# scaler = StandardScaler()

# X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

In [None]:
# Display the (rows, columns) shapes of both frames side by side.
train.shape, test.shape

# Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

In [None]:
# Fit a fresh default-parameter Ridge model per pollutant and write its test
# predictions into the matching submission column.
for _column, _target in (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
):
    rr = Ridge()
    rr.fit(X, _target)
    sample_submission[_column] = rr.predict(X_test)

sample_submission.head()

# Lasso

In [None]:
from sklearn.linear_model import Lasso, LassoCV

In [None]:
# Fit a fresh default-parameter Lasso model per pollutant and write its test
# predictions into the matching submission column.
for _column, _target in (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
):
    ls = Lasso()
    ls.fit(X, _target)
    sample_submission[_column] = ls.predict(X_test)

sample_submission.head()

In [None]:
# Candidate regularisation strengths: 100 log-spaced values from 0.5e10 down to 0.5e-2.
lamdbalar = 10**np.linspace(10, -2, 100) * 0.5

# Cross-validated alpha search, one search per pollutant.
lasso_cv1, lasso_cv2, lasso_cv3 = (
    LassoCV(alphas=lamdbalar).fit(X, _cv_target)
    for _cv_target in (target_1, target_2, target_3)
)

# Refit a plain Lasso with each selected alpha and predict the test set.
for _column, _search, _target in (
    ('target_carbon_monoxide', lasso_cv1, target_1),
    ('target_benzene', lasso_cv2, target_2),
    ('target_nitrogen_oxides', lasso_cv3, target_3),
):
    ls = Lasso(alpha=_search.alpha_)
    ls.fit(X, _target)
    sample_submission[_column] = ls.predict(X_test)

sample_submission.head()

# ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [None]:
# Fit a fresh default-parameter ElasticNet model per pollutant and write its
# test predictions into the matching submission column.
for _column, _target in (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
):
    en = ElasticNet()
    en.fit(X, _target)
    sample_submission[_column] = en.predict(X_test)

sample_submission.head()

In [None]:
# Candidate regularisation strengths: 100 log-spaced values from 0.5e10 down to 0.5e-2.
lamdbalar = 10**np.linspace(10, -2, 100) * 0.5

# Cross-validated alpha search, one search per pollutant.
en_cv1, en_cv2, en_cv3 = (
    ElasticNetCV(alphas=lamdbalar).fit(X, _cv_target)
    for _cv_target in (target_1, target_2, target_3)
)

# Refit a plain ElasticNet with each selected alpha and predict the test set.
for _column, _search, _target in (
    ('target_carbon_monoxide', en_cv1, target_1),
    ('target_benzene', en_cv2, target_2),
    ('target_nitrogen_oxides', en_cv3, target_3),
):
    en = ElasticNet(alpha=_search.alpha_)
    en.fit(X, _target)
    sample_submission[_column] = en.predict(X_test)

sample_submission.head()

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from math import sqrt

# Hold-out evaluation of a decision tree, one model per pollutant.
# Every split uses the same random_state on the same X, so the three targets
# are evaluated on the same train/validation rows.
A1_train, A1_test, b1_train, b1_test = train_test_split(X, target_1, random_state=0)
A2_train, A2_test, b2_train, b2_test = train_test_split(X, target_2, random_state=0)
A3_train, A3_test, b3_train, b3_test = train_test_split(X, target_3, random_state=0)

_holdout_preds = []
_holdout_scores = []
for _A_train, _A_test, _b_train, _b_test in (
    (A1_train, A1_test, b1_train, b1_test),
    (A2_train, A2_test, b2_train, b2_test),
    (A3_train, A3_test, b3_train, b3_test),
):
    # Seed the tree: without a fixed random_state the fitted tree (and hence the
    # printed score) can differ between runs even though the split is seeded.
    dtr = DecisionTreeRegressor(random_state=0)
    dtr.fit(_A_train, _b_train)
    _valid_pred = dtr.predict(_A_test)
    _holdout_preds.append(_valid_pred)
    # RMSLE = square root of the mean squared log error on the validation rows.
    _holdout_scores.append(sqrt(mean_squared_log_error(_b_test, _valid_pred)))

out1, out2, out3 = _holdout_preds
rmsle1, rmsle2, rmsle3 = _holdout_scores

print(rmsle1)
print(rmsle2)
print(rmsle3)
# Unweighted mean of the three per-target RMSLE values.
print((rmsle1+rmsle2+rmsle3)/3)

In [None]:
# Fit one decision tree per pollutant on the full training data and write its
# test predictions into the matching submission column.
for _column, _target in (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
):
    # Fixed random_state so the submission is reproducible across kernel
    # restarts; an unseeded tree can differ from run to run.
    dtr = DecisionTreeRegressor(random_state=0)
    dtr.fit(X, _target)
    sample_submission[_column] = dtr.predict(X_test)

sample_submission.head()

# Sarima

In [None]:
pd.plotting.register_matplotlib_converters()
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import itertools
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build an hourly nitrogen-oxides series and grid-search SARIMA orders by AIC.

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `train` (chained-assignment hazard).
data = train[['date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']].copy()
# Converting to float
data['target_nitrogen_oxides'] = data['target_nitrogen_oxides'] * 1.0
data = data.set_index(pd.DatetimeIndex(data['date_time']))
data_cm = data['target_nitrogen_oxides']
# f, (ax1) = plt.subplots(1, figsize=(12, 6))
# data_cm.resample('H').sum().plot(ax = ax1)

# Resample to hourly means ('H'). The numeric target is selected BEFORE
# aggregating: `data` still carries the string `date_time` column, and taking
# the mean of a non-numeric column raises on recent pandas versions.
data_sarima = data[['target_nitrogen_oxides']].resample('H').mean()
# data_sarima.plot()

# p, d and q each take the values 0 or 1
p = d = q = range(0, 2)
# All (p, d, q) combinations
pdq = list(itertools.product(p, d, q))
# All seasonal (P, D, Q, s) combinations with a fixed seasonal period s = 12
# NOTE(review): the series is hourly — confirm that a period of 12 (not 24) is intended.
seasonal_pdq = [(sp, sd, sq, 12) for sp, sd, sq in pdq]

# Fit every order combination and record its AIC, so the search yields a
# result instead of discarding each fit.
aic_records = []
for param in pdq:
    for param_seasonal in seasonal_pdq:
        mod = sm.tsa.statespace.SARIMAX(data_sarima,
                                        order=param,
                                        seasonal_order=param_seasonal,
                                        enforce_stationarity=False,
                                        enforce_invertibility=False)

        results = mod.fit()
        aic_records.append((param, param_seasonal, results.aic))

# Candidates ranked by AIC, lowest first.
aic_table = pd.DataFrame(aic_records, columns=['order', 'seasonal_order', 'aic']).sort_values('aic')
aic_table.head()

In [None]:
# Fit a single SARIMA(1,0,1)x(1,1,1,12) model on the hourly series.
model_sarima = sm.tsa.statespace.SARIMAX(
    data_sarima,
    order=(1, 0, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
results_sarima = model_sarima.fit()
# print(results_sarima.summary().tables[1])

# Predict over a fixed window and use the predicted means as the
# nitrogen-oxides column of the submission.
pred = results_sarima.get_prediction(
    start=pd.to_datetime('2011-01-01 00:00:00'),
    end=pd.to_datetime('2011-04-04 14:00:00'),
    dynamic=False,
)
data_sarima_forecasted = pred.predicted_mean
ans = list(data_sarima_forecasted.values)
sample_submission['target_nitrogen_oxides'] = ans

In [None]:
# sample_submission.to_csv('submission_sarima.csv', index=False)

# XGBRegressor

In [None]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

In [None]:
# Re-read the raw train/test CSVs, then parse `date_time` into datetimes.
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../input/tabular-playground-series-jul-2021/train.csv",
        "../input/tabular-playground-series-jul-2021/test.csv",
    )
)
for _frame in (train, test):
    _frame["date_time"] = pd.to_datetime(_frame["date_time"], format="%Y-%m-%d %H:%M:%S")
train.info(memory_usage="deep")

# Target column names and their human-readable labels (same order).
targets = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]
target_names = ["Carbon monoxide", "Benzene", "Nitrogen oxides"]

def make_new_features(df):
    """Add calendar features derived from ``df["date_time"]``.

    The frame is modified in place and also returned, so both
    ``make_new_features(df)`` and ``df = make_new_features(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``date_time`` column.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added: ``month``, ``day_of_week``,
        ``day_of_year``, ``hour``, ``quarter``, ``week_of_year``,
        ``working_hours`` (1 for hours 8..20 inclusive, else 0) and
        ``is_weekend`` (1 when day of week >= 5, else 0).
    """
    df["month"] = df["date_time"].dt.month
    df["day_of_week"] = df["date_time"].dt.dayofweek
    df["day_of_year"] = df["date_time"].dt.dayofyear
    df["hour"] = df["date_time"].dt.hour
    df["quarter"] = df["date_time"].dt.quarter
    df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
#     df["is_winter"] = df["month"].isin([1, 2, 12])
#     df["is_sprint"] = df["month"].isin([3, 4, 5])
#     df["is_summer"] = df["month"].isin([6, 7, 8])
#     df["is_autumn"] = df["month"].isin([9, 10, 11])
    df["working_hours"] =  df["hour"].isin(np.arange(8, 21, 1)).astype("int")
    # Derive the weekend flag from the frame being processed, not from the
    # global `train`: with any other frame the old code index-aligned train's
    # values onto it (wrong or missing flags) or failed if `train` was undefined.
    df["is_weekend"] = (df["date_time"].dt.dayofweek >= 5).astype("int")
    return df

# Snapshot of `train` taken BEFORE feature engineering. make_new_features adds
# its columns to the frame it receives, so `train_copy` keeps only the original
# columns while `train` gains the calendar features. The order of these two
# statements therefore matters.
train_copy = train.copy()
# test_copy = test.copy()
train = make_new_features(train)
# test = make_new_features(test)

In [None]:
# The months will be used for folds split (row 7110 is excluded here too, so
# `months` has the same rows as X below).
months = train_copy["date_time"].drop(index=[7110]).dt.month

# Prediction frame starts with the original (datetime) test timestamps.
preds = pd.DataFrame({"date_time": test["date_time"].copy()})

# Replace the datetimes by float epoch seconds in both frames.
for _frame in (train_copy, test):
    _frame["date_time"] = _frame["date_time"].astype("datetime64[ns]").astype(np.int64) / 10**9

# Dropping the last row as noise
X = train_copy.drop(index=[7110])
targets = X[["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]].copy()
X = X.drop(columns=["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"])
# Models are trained on log1p-transformed targets.
y = np.log1p(targets)
X_test = test

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, ignoring entries where ``y_true`` is 0.

    Computes ``sqrt(mean((1 - y_pred / y_true) ** 2))`` over the positions
    whose true value is non-zero.
    """
    nonzero = y_true != 0
    ratio = y_pred[nonzero] / y_true[nonzero]
    return np.sqrt(np.mean((1 - ratio) ** 2))

def rmspe_xgb(y_pred, y_true):
    """Custom evaluation metric: RMSPE on the original (expm1) scale.

    ``y_true`` is an object exposing ``get_label()``; both the labels and
    ``y_pred`` are mapped back with ``np.expm1`` before scoring.
    Returns the pair ``("rmspe", value)``.
    """
    labels = y_true.get_label()
    return "rmspe", rmspe(np.expm1(labels), np.expm1(y_pred))

In [None]:
%%time

# Cross-validated XGBoost training: one model per fold per target.
# Test predictions are averaged over the folds; out-of-fold predictions are
# scored on the original (expm1) scale.
all_fi = []
splits = 10

for i, target in enumerate(targets.columns):
    print(f"\nTraining for {target}...")
    # Folds are stratified by calendar month of each row.
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X.shape[0],))
    model_preds = 0
    model_fi = 0
    for num, (train_idx, valid_idx) in enumerate(skf.split(X, months)):
        # split() yields positional indices, so select rows by position (.iloc)
        # rather than by label; this does not depend on X's index labels.
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y[target].iloc[train_idx], y[target].iloc[valid_idx]
        # random_state=i: each target gets its own model seed.
        model = xgb.XGBRegressor(n_estimators=5000, objective="reg:squarederror", max_depth=10,
                           learning_rate=0.03, colsample_bytree=0.7, subsample=0.9,
                           random_state=i, tree_method="gpu_hist")
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
            eval_metric=rmspe_xgb, early_stopping_rounds=100, verbose=False)
        # Targets were log1p-transformed, so map predictions back with expm1.
        model_preds += np.expm1(model.predict(X_test)) / splits
        model_fi += model.feature_importances_
        oof_preds[valid_idx] = np.expm1(model.predict(X_valid))
        # RMSLE is the square root of the mean squared log error; the previous
        # code printed the un-rooted MSLE under the "RMSLE" label.
        fold_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), oof_preds[valid_idx]))
        print(f"Fold {num} RMSLE: {fold_rmsle}")
    overall_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y[target]), oof_preds))
    print(f"\nOverall RMSLE: {overall_rmsle}")
    preds[target] = model_preds
    all_fi.append(model_fi)

In [None]:
# Write the prediction frame to CSV (without the index column) and preview it.
preds.to_csv('submission_xgbregressortuned.csv', index=False)
preds.head()