In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the competition's labelled training set and the unlabelled test set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics for the numeric columns.
train.describe()

In [None]:
# Column dtypes as read from the CSV (before any conversion).
train.dtypes

In [None]:
# Parse the timestamp column into pandas datetimes (overwrites the column in place;
# the `test` frame is left untouched).
train['date_time'] = pd.to_datetime(train['date_time'])

In [None]:
# Re-check dtypes to confirm the `date_time` conversion took effect.
train.dtypes

In [None]:
import seaborn as sns
# Temperature (`deg_C`) against time.
sns.scatterplot(x = 'date_time', y = 'deg_C', data = train)

In [None]:
# Relative humidity against time.
sns.scatterplot(x = 'date_time', y = 'relative_humidity', data = train)

In [None]:
# Absolute humidity against time.
sns.scatterplot(x = 'date_time', y = 'absolute_humidity', data = train)

In [None]:
# Kernel density estimate of the carbon monoxide target.
sns.displot(x = 'target_carbon_monoxide', data = train, kind = 'kde')

In [None]:
# Kernel density estimate of the benzene target.
sns.displot(x = 'target_benzene', data = train, kind = 'kde')

In [None]:
# Kernel density estimate of the nitrogen oxides target.
sns.displot(x = 'target_nitrogen_oxides', data = train, kind = 'kde')

In [None]:
# Pairwise plots over the training frame; alpha=0.1 makes points translucent
# so dense regions stay readable.
g = sns.pairplot(train, plot_kws=dict(alpha= 0.1))

In [None]:
# The three pollutant columns are the prediction targets; everything else is a feature.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
y = train[target_cols].copy()
X = train.drop(columns=target_cols).copy()

In [None]:
# Remove the timestamp so only numeric features remain.
# errors='ignore' keeps this cell idempotent: X is reassigned in place, so a
# second run would otherwise raise KeyError because the column is already gone.
X = X.drop(columns='date_time', errors='ignore')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# One Series per pollutant (CO, benzene, nitrogen oxides) for each side of the split,
# because every model below is fitted on a single target.
pollutant_cols = ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')

y_train_co, y_train_bz, y_train_no = (y_train[col] for col in pollutant_cols)
y_test_co, y_test_bz, y_test_no = (y_test[col] for col in pollutant_cols)

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, StackingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb



In [None]:
# Learn the scaling statistics from the training split only, then apply them to it.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# One ordinary least-squares model per pollutant, all on the same scaled features.
co_lm = LinearRegression().fit(X_train_scaled, y_train_co)
bz_lm = LinearRegression().fit(X_train_scaled, y_train_bz)
no_lm = LinearRegression().fit(X_train_scaled, y_train_no)
no_lm  # cell output: the last fitted estimator, as before

In [None]:
# Scale the held-out features with the scaler fitted on the training split,
# then report each linear model's score (R^2) on the held-out rows.
X_test_scaled = scaler.transform(X_test)
linear_scores = {
    'CO Model Score: ': co_lm.score(X_test_scaled, y_test_co),
    'BZ Model Score: ': bz_lm.score(X_test_scaled, y_test_bz),
    'NO Model Score: ': no_lm.score(X_test_scaled, y_test_no),
}
for label, r2 in linear_scores.items():
    print(label + str(r2))

In [None]:
# Same linear models, but fitted on the natural log of each target.
# NOTE(review): np.log assumes every target value is strictly positive - confirm.
co_log_lm = LinearRegression().fit(X_train_scaled, np.log(y_train_co))
bz_log_lm = LinearRegression().fit(X_train_scaled, np.log(y_train_bz))
no_log_lm = LinearRegression().fit(X_train_scaled, np.log(y_train_no))
no_log_lm  # cell output: the last fitted estimator, as before

In [None]:

# Held-out score (R^2) of each log-target model, measured on the log scale.
logged_scores = {
    'CO Model Logged Score: ': co_log_lm.score(X_test_scaled, np.log(y_test_co)),
    'BZ Model Logged Score: ': bz_log_lm.score(X_test_scaled, np.log(y_test_bz)),
    'NO Model Logged Score: ': no_log_lm.score(X_test_scaled, np.log(y_test_no)),
}
for label, r2 in logged_scores.items():
    print(label + str(r2))

In [None]:
# Competition test features: drop the timestamp (the models were trained without it)
# and scale with the scaler fitted on the training split.
valid = test.drop(columns=['date_time'])
valid_scaled = scaler.transform(valid)

In [None]:
# Predict on the log scale and exponentiate to return to the original units.
target_carbon_monoxide, target_benzene = (
    np.exp(log_model.predict(valid_scaled)) for log_model in (co_log_lm, bz_log_lm)
)
# The NOx log-scale predictions are kept under their own name as well.
target_nitrogen_oxides_logged = no_log_lm.predict(valid_scaled)
target_nitrogen_oxides = np.exp(target_nitrogen_oxides_logged)

The linear models on log-transformed targets get a score of 0.343... We can do better.

In [None]:
n_folds = 5

def rmsle_cv(model, train, y_train):
    """Cross-validated root-mean-squared error of `model`.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    train : feature frame; its `.values` array is what gets split and fitted.
    y_train : target values aligned with `train`.

    Returns
    -------
    numpy array with one RMSE per fold (`n_folds` entries).

    Note: despite the name, no log transform is applied here; the result is a
    plain RMSE on whatever scale `y_train` is given in.
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it returns the
    # integer 5, which makes cross_val_score fall back to an unshuffled split
    # and silently discards shuffle=True / random_state=42.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)



In [None]:
# Lasso on robust-scaled features for the CO target: fit on the full training
# split, then report the cross-validated error (mean and spread across folds).
lasso_co = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
lasso_co.fit(X_train, y_train_co)

co_score = rmsle_cv(lasso_co, X_train, y_train_co)
summary_template = "\nLasso score: {:.4f} ({:.4f})\n"
print(summary_template.format(co_score.mean(), co_score.std()))

In [None]:
rf = RandomForestRegressor(random_state = 42)

# Hyper-parameter grid shared by the CO, benzene and NOx searches.
params_rf = {
    'n_estimators': [400, 500],
    'max_depth': [20, 30],
    # 1.0 means "consider every feature at each split", which is what 'auto'
    # meant for regressors. 'auto' was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes a third of the grid fail to fit.
    'max_features': ['log2', 1.0, 'sqrt'],
    'min_samples_leaf': [2, 5, 10],
}

# Exhaustive 5-fold search for the CO target, selecting on (negated) MSE.
grid_rf_co = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

grid_rf_co.fit(X_train, y_train_co)



In [None]:
# Same random-forest grid search as for CO, now for the benzene target.
rf_bz = RandomForestRegressor(random_state=42)
bz_search_options = dict(
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_rf_bz = GridSearchCV(estimator=rf_bz, **bz_search_options)

grid_rf_bz.fit(X_train, y_train_bz)

In [None]:
# Same random-forest grid search, now for the nitrogen oxides target.
rf_no = RandomForestRegressor(random_state=42)
grid_rf_no = GridSearchCV(
    estimator=rf_no,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train_no)
grid_rf_no  # cell output: the fitted search object, as before

In [None]:
# Pull the refitted best forest out of each finished grid search.
best_rf_co, best_rf_bz, best_rf_no = (
    search.best_estimator_ for search in (grid_rf_co, grid_rf_bz, grid_rf_no)
)


In [None]:
# Report R^2 for every tuned forest. Bare expressions would only display the
# last one, and the training-set score alone is optimistic for a forest, so the
# held-out split is printed next to it for each target.
for rf_label, rf_model, y_fit, y_holdout in (
    ('CO', best_rf_co, y_train_co, y_test_co),
    ('BZ', best_rf_bz, y_train_bz, y_test_bz),
    ('NO', best_rf_no, y_train_no, y_test_no),
):
    print('{} RF R^2 - train: {:.4f}, held-out: {:.4f}'.format(
        rf_label,
        rf_model.score(X_train, y_fit),
        rf_model.score(X_test, y_holdout),
    ))

In [None]:
# Predict each pollutant for the competition rows. The forests were trained on
# unscaled features, so the unscaled `valid` frame is the matching input.
y_predict_co, y_predict_bz, y_predict_no = (
    forest.predict(valid) for forest in (best_rf_co, best_rf_bz, best_rf_no)
)

In [None]:
# Submission layout: the original timestamp column followed by one column per
# predicted pollutant, in this order.
submission_df = test[['date_time']].assign(
    target_carbon_monoxide=y_predict_co,
    target_benzene=y_predict_bz,
    target_nitrogen_oxides=y_predict_no,
)

In [None]:
# Display the assembled submission frame for a visual check.
submission_df

In [None]:
# Write the submission to the working directory without the pandas index column.
submission_df.to_csv('submission.csv', index = False)