In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
import math

In [None]:
#1. load
#2. explore
#3. prepare 
#4. train
#5. predict
#6. visualize
#7. evaluate

In [None]:
# Read the competition's training table from the Kaggle input directory.
_train_csv_path = "../input/tabular-playground-series-jul-2021/train.csv"
dataset_train = pd.read_csv(_train_csv_path)

In [None]:
# Show the loaded training frame as the cell output for a first look at its columns.
dataset_train

In [None]:
# Quick visual overview of one target column, plotted against the frame's index.
carbon_monoxide_overview = dataset_train["target_carbon_monoxide"]
plt.plot(carbon_monoxide_overview)

In [None]:
# Copy the row index into an explicit "time-idx" column.
# The column is listed in COLS_TO_DROP further down, so it is removed again
# before the feature matrices are built.
dataset_train["time-idx"] = dataset_train.index

In [None]:
# Per-column flag: True where the column contains at least one missing value.
dataset_train.isna().any()

In [None]:
######################################## 
######################################## creating and preparing datasets 


In [None]:
# Column groups used by the later cells:
# COLS_TO_DROP are excluded from the feature matrices, TARGET_COLS are the
# three regression outputs.
COLS_TO_DROP = ["date_time", "time-idx"]
TARGET_COLS = [
    "target_carbon_monoxide",
    "target_benzene",
    "target_nitrogen_oxides",
]

# Fraction of rows, counted from the top of the frame, that goes into training.
train_size_percent = 0.7
TRAINING_SIZE = int(train_size_percent * len(dataset_train))

# Positional split: the first TRAINING_SIZE rows train the model,
# the remaining rows are held out for evaluation.
train_df, test_df = dataset_train.iloc[:TRAINING_SIZE], dataset_train.iloc[TRAINING_SIZE:]

# Report the held-out shape first, then the training shape.
for split_frame in (test_df, train_df):
    print(split_frame.shape)

In [None]:
# ---------------------------------------------------------------------------
# Feature matrices (X_*) and target frames (y_*) for both splits
# ---------------------------------------------------------------------------
# NOTE(review): `target` is not referenced by any other cell shown here.
target = "target_carbon_monoxide"

# Everything that must not end up in the feature matrix.
non_feature_cols = COLS_TO_DROP + TARGET_COLS

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df.loc[:, TARGET_COLS]

X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df.loc[:, TARGET_COLS]

# Preview the held-out features.
X_test.head()

In [None]:
# List the remaining feature columns to confirm that the dropped columns
# (COLS_TO_DROP and TARGET_COLS) are no longer present.
X_train.columns

In [None]:
# ---------------------------------------------------------------------------
# Gradient boosting regression on all target columns
# ---------------------------------------------------------------------------

# Base estimator, seeded so repeated runs give the same fit.
gradient_booster = GradientBoostingRegressor(random_state=0)

# Wrap the booster in a multi-output regressor (one fitted copy of the base
# estimator per target column) and fit it; fit() returns the fitted wrapper.
multi_regressor = MultiOutputRegressor(estimator=gradient_booster).fit(X_train, y_train)

# Predictions for the held-out rows; columns follow the order of y_train.
y_pred = multi_regressor.predict(X_test)

In [None]:
# Show the predictions returned by multi_regressor.predict(X_test).
y_pred

In [None]:
# Shape of the prediction array, for comparison with the held-out split.
y_pred.shape


In [None]:
# Wrap each prediction column in its own DataFrame so that an index can be
# attached for plotting. Positions 0, 1, 2 are unpacked into the carbon
# monoxide, benzene and nitrogen oxides frames respectively.
pred_carbon_monoxide, pred_benzene, pred_nitrogen_oxides = (
    pd.DataFrame(y_pred[:, col_pos]) for col_pos in range(3)
)

In [None]:
# Sanity check of the split sizes before the predictions are re-indexed.
print(f"Original data length {len(dataset_train)}")
print(f"training data length {len(train_df)}")

print(f"prediction data length {len(pred_carbon_monoxide)}")
print(f"test data data length {len(test_df)}")
print(y_test.index)

# Give every prediction frame the index of the held-out targets so the
# predictions line up with those rows when plotted.
for pred_frame in (pred_carbon_monoxide, pred_benzene, pred_nitrogen_oxides):
    pred_frame.index = y_test.index

In [None]:
##############################plotting predictions


In [None]:
# Training targets and the predictions for the held-out rows on one figure.
for plotted_series, legend_label in (
    (y_train["target_carbon_monoxide"], "training data"),
    (pred_carbon_monoxide, "predicted data"),
):
    plt.plot(plotted_series, label=legend_label)
plt.title("Carbon Monoxide")
plt.legend()
plt.show()


In [None]:
# Training targets and the predictions for the held-out rows on one figure.
for plotted_series, legend_label in (
    (y_train["target_benzene"], "training data"),
    (pred_benzene, "predicted data"),
):
    plt.plot(plotted_series, label=legend_label)
plt.title("Benzene")
plt.legend()
plt.show()


In [None]:
# Training targets and the predictions for the held-out rows on one figure.
for plotted_series, legend_label in (
    (y_train["target_nitrogen_oxides"], "training data"),
    (pred_nitrogen_oxides, "predicted data"),
):
    plt.plot(plotted_series, label=legend_label)
plt.title("Nitrogen Oxides")
plt.legend()
plt.show()


In [None]:
# ---------------------------------------------------------------------------
# Evaluating error [RMSLE] on the held-out split
# ---------------------------------------------------------------------------
def _rmsle(actual, predicted):
    """Return the square root of sklearn's mean squared logarithmic error."""
    return np.sqrt(mean_squared_log_error(actual, predicted))


# y_test was built from TARGET_COLS, so selecting by name picks the same
# columns as positions 0, 1 and 2.
rmsle_carbon_monoxide = _rmsle(y_test["target_carbon_monoxide"], pred_carbon_monoxide)
rmsle_benzene = _rmsle(y_test["target_benzene"], pred_benzene)
rmsle_nitrogen_oxides = _rmsle(y_test["target_nitrogen_oxides"], pred_nitrogen_oxides)

# External source used for this formula: [https://www.kaggle.com/questions-and-answers/60012]

In [None]:
# RMSLE of the carbon monoxide predictions on the held-out rows.
rmsle_carbon_monoxide

In [None]:
# RMSLE of the benzene predictions on the held-out rows.
rmsle_benzene

In [None]:
# RMSLE of the nitrogen oxides predictions on the held-out rows.
rmsle_nitrogen_oxides

In [None]:
# ---------------------------------------------------------------------------
# Model concept finished.
# Predicting for the submission file -->
# ---------------------------------------------------------------------------

# Competition test table; it is the input to the final prediction step.
_submission_csv_path = "../input/tabular-playground-series-jul-2021/test.csv"
submission_X = pd.read_csv(_submission_csv_path)
submission_X.head()

In [None]:
# Features and targets taken from every row of dataset_train (no hold-out).
_non_feature_cols = COLS_TO_DROP + TARGET_COLS
orig_train_X = dataset_train.drop(columns=_non_feature_cols)
orig_train_y = dataset_train.loc[:, TARGET_COLS]


In [None]:
###################### creating final model

In [None]:
# Final model: same setup as the validation model, but fitted on every row of
# the training table and used to predict the submission rows.

# Base estimator for the final fit.
final_gradient_booster = GradientBoostingRegressor(random_state=1)

# Instantiate the multi-output regressor around the *final* booster.
# (The validation-stage `gradient_booster` was passed here before, which left
# `final_gradient_booster` unused.)
final_multi_regressor = MultiOutputRegressor(final_gradient_booster)

# Fit on the full training data.
final_multi_regressor.fit(orig_train_X, orig_train_y)

# Predict with the model fitted above.
# (`multi_regressor`, trained on only the first 70 % of rows, was used here
# before, so the full-data fit never influenced the submission.)
final_y_pred = final_multi_regressor.predict(submission_X.drop("date_time", axis=1))

In [None]:
# Convert the prediction array into a DataFrame for the submission file.
# The name final_y_pred is rebound, so from here on it refers to the frame.
final_y_pred = pd.DataFrame(final_y_pred)

In [None]:
# Copy the date_time column of the submission rows into the prediction frame
# (it was dropped from the features before predicting).
final_y_pred["date_time"] = submission_X.date_time

In [None]:
# Set the column names: the prediction columns take the names in TARGET_COLS,
# followed by "date_time" for the column added last.
final_y_pred.columns = TARGET_COLS+["date_time"]
# Display the finished submission frame.
final_y_pred

In [None]:
# Write the submission frame to a CSV file in the working directory;
# index=False keeps the row index out of the file.
final_y_pred.to_csv("./final_y_pred.csv",index=False)