In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Single place to point the notebook at a different copy of the competition data.
DATA_DIR = '../input/tabular-playground-series-jul-2021'

# Training rows, rows to predict, and the submission template
# (the template's column names are reused when the submission is built).
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Overview of the training frame: column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Same overview for the test frame, to compare its columns with the training frame.
test_df.info()

In [None]:
# Overview of the submission template whose column names are reused for the final output.
sample_df.info()

In [None]:
# Pairwise correlation heatmap of the training columns.
plt.figure(figsize=(15, 10))
# Keep only numeric columns before correlating: recent pandas versions no longer
# drop non-numeric columns (e.g. a timestamp read as text) silently and raise instead.
numeric_train = train_df.select_dtypes(include='number')
sns.heatmap(numeric_train.corr())

In [None]:
# One stacked panel per column, skipping column 0, plotted against the row index.
fig, axs = plt.subplots(11, 1, figsize=(20, 35))
# Pair each axis with its column directly; the manual counter the loop used to
# carry was overwritten by the for-statement on every iteration and had no effect.
for ax, column in zip(axs, train_df.columns[1:]):
    ax.plot(train_df[column])
    ax.set_title(column)

In [None]:
# Positional split of the raw frames (column 0 is skipped in both;
# presumably the timestamp -- confirm against the .info() output above).

# Everything after column 0 of the test frame is used as test features.
X_test = test_df.iloc[:, 1:]

# The last three training columns are the regression targets ...
y_train = train_df.iloc[:, -3:]
# ... and columns 1 through 8 are the predictors.
X_train = train_df.iloc[:, 1:9]

In [None]:
# Candidate regressors that are compared on a hold-out split for every target.
# The tree-based learners get a fixed seed so that repeated runs of the notebook
# produce the same fitted models and therefore comparable RMSE values.
MODEL_SEED = 42

lasso_reg = Lasso()
ridge_reg = Ridge()
rf_reg = RandomForestRegressor(random_state=MODEL_SEED)
xgb_reg = XGBRegressor(random_state=MODEL_SEED)
lgbm_reg = LGBMRegressor(random_state=MODEL_SEED)

# Order matters only for the order in which results are printed.
models = [lasso_reg, ridge_reg, rf_reg, xgb_reg, lgbm_reg]

1. **target_carbon_monoxide**

In [None]:
# Diagnostic OLS fit of the first target on all predictors.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended; otherwise the coefficients and p-values describe a regression
# forced through the origin.
ols_1 = sm.OLS(y_train.iloc[:, 0], sm.add_constant(X_train)).fit()
ols_1.summary()

In [None]:
# 70/30 hold-out split for model comparison. The seed is fixed so that the
# RMSE values printed below are reproducible across kernel restarts.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)

In [None]:
# Fit every candidate on the training part and report its hold-out RMSE
# for the carbon monoxide target.
for candidate in models:
    candidate.fit(X_train_part, y_train_part['target_carbon_monoxide'])
    valid_pred = candidate.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid['target_carbon_monoxide'], valid_pred))
    print("####", type(candidate).__name__, "'s target_carbon_monoxide RMSE : ", rmse, '\n')

In [None]:
# Refit LightGBM on the full training data for the first target,
# then predict the test rows.
lgbm_reg.fit(X_train, y_train.iloc[:, 0])
cm_preds = lgbm_reg.predict(X_test)

2. **target_benzene**

In [None]:
# Diagnostic OLS fit of the second target on all predictors.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended; otherwise the coefficients and p-values describe a regression
# forced through the origin.
ols_2 = sm.OLS(y_train.iloc[:, 1], sm.add_constant(X_train)).fit()
ols_2.summary()

In [None]:
# Benzene feature set: all predictors except absolute_humidity
# (presumably dropped based on the OLS summary above -- confirm).
X_tr_bz = X_train.drop(columns=['absolute_humidity'])
X_test_bz = X_test.drop(columns=['absolute_humidity'])

# 70/30 hold-out split; the seed is fixed so the RMSE comparison is reproducible.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_tr_bz, y_train, test_size=0.3, random_state=42
)

In [None]:
# Fit every candidate on the training part and report its hold-out RMSE
# for the benzene target.
for candidate in models:
    candidate.fit(X_train_part, y_train_part['target_benzene'])
    valid_pred = candidate.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid['target_benzene'], valid_pred))
    print("####", type(candidate).__name__, "'s target_benzene RMSE : ", rmse, '\n')

In [None]:
# Refit LightGBM on the full reduced feature set for the second target,
# then predict the test rows from the matching reduced test features.
lgbm_reg.fit(X_tr_bz, y_train.iloc[:, 1])
bz_preds = lgbm_reg.predict(X_test_bz)

3. **target_nitrogen_oxides**

In [None]:
# Diagnostic OLS fit of the third target on all predictors.
# statsmodels does not add an intercept on its own, so a constant column is
# prepended; otherwise the coefficients and p-values describe a regression
# forced through the origin.
ols_3 = sm.OLS(y_train.iloc[:, 2], sm.add_constant(X_train)).fit()
ols_3.summary()

In [None]:
# Nitrogen oxides feature set: all predictors except deg_C and absolute_humidity
# (presumably dropped based on the OLS summary above -- confirm).
dropped_no_cols = ['deg_C', 'absolute_humidity']
X_tr_no = X_train.drop(columns=dropped_no_cols)
X_test_no = X_test.drop(columns=dropped_no_cols)

# 70/30 hold-out split; the seed is fixed so the RMSE comparison is reproducible.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_tr_no, y_train, test_size=0.3, random_state=42
)

In [None]:
# Fit every candidate on the training part and report its hold-out RMSE
# for the nitrogen oxides target.
for candidate in models:
    candidate.fit(X_train_part, y_train_part['target_nitrogen_oxides'])
    valid_pred = candidate.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid['target_nitrogen_oxides'], valid_pred))
    print("####", type(candidate).__name__, "'s target_nitrogen_oxides RMSE : ", rmse, '\n')

In [None]:
# Refit LightGBM on the full reduced feature set for the third target,
# then predict the test rows from the matching reduced test features.
lgbm_reg.fit(X_tr_no, y_train.iloc[:, 2])
no_preds = lgbm_reg.predict(X_test_no)

In [None]:
# Assemble the submission frame, borrowing the column names (and their order)
# from the sample submission so the output matches the expected header.
submission_cols = sample_df.columns
predictions = pd.DataFrame(
    {
        submission_cols[0]: test_df['date_time'],
        submission_cols[1]: cm_preds,
        submission_cols[2]: bz_preds,
        submission_cols[3]: no_preds,
    }
)

In [None]:
# Quick visual check of the first rows of the submission frame before writing it out.
predictions.head()

In [None]:
# Write the submission file to the working directory; index=False keeps the
# row index out of the CSV so only the four submission columns are written.
predictions.to_csv('submission.csv', index = False)