In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import calendar
import xgboost

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneOut, LeaveOneGroupOut, StratifiedKFold
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, MultiTaskElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_log_error

# Never truncate the text shown inside a DataFrame cell.
pd.options.display.max_colwidth = None

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
!pip install catboost
from catboost import CatBoostRegressor

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    )
)

for caption, frame in (('Shape of train: ', train), ('Shape of test: ', test)):
    print(caption, frame.shape)

Let's understand the terminology first:

- Humidity: the amount of water (moisture) present in the air in the form of water vapour.
- Relative humidity: the amount of moisture in the air, expressed as a percentage of the maximum amount the air can hold at a specific temperature.
- Absolute humidity: the amount of moisture in the air regardless of temperature, expressed in grams of moisture per cubic metre of air (g/m³).

In [None]:
# Keep untouched snapshots of both frames before any column is modified.
train_copy, test_copy = train.copy(), test.copy()

In [None]:
# Convert 'relative_humidity' from a percentage to a fraction in both frames.
# Running this cell twice divides twice, so run it once per kernel session.
for frame in (train, test):
    frame['relative_humidity'] = frame['relative_humidity'].div(100)

In [None]:
# Check for outliers: one box plot per numeric input column.
# The Series is passed as `x=` on purpose: seaborn 0.11 warns about a positional
# Series, and seaborn >= 0.12 treats the first positional argument as `data`,
# which changes how the plot is drawn. The keyword form is stable across releases.
outlier_check_columns = [
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'relative_humidity', 'absolute_humidity', 'deg_C',
]

plt.figure(figsize=(25,20))
for panel, column in enumerate(outlier_check_columns, start=1):
    # 4x4 grid; only the first eight slots are filled.
    plt.subplot(4, 4, panel)
    sns.boxplot(x=train[column])

In [None]:
# Show the training rows that lie above each cut-off, one table per column,
# with a blank line between consecutive tables.
outlier_cutoffs = [
    ('sensor_2', 2250),
    ('sensor_3', 2400),
    ('sensor_4', 2800),
    ('absolute_humidity', 2.2),
]
for position, (column, cutoff) in enumerate(outlier_cutoffs):
    if position:
        print()
    display(train[train[column] > cutoff])

In [None]:
# Row labels to remove from the training frame.
# NOTE(review): the labels are hard-coded; presumably they were read off the
# outlier tables displayed earlier - confirm if the input data changes.
index = [6160, 5520, 4462, 6586, 6587, 6589, 6590, 6592]
train = train.drop(index=index)

In [None]:
# Check distribution of target variables: histogram with a KDE overlay per target.
# A loop replaces three copy-pasted panels and avoids the stray Axes repr that a
# trailing plotting call leaves as cell output.
plt.figure(figsize=(13,10))
for panel, column in enumerate(
    ['target_benzene', 'target_carbon_monoxide', 'target_nitrogen_oxides'], start=1
):
    plt.subplot(2, 2, panel)  # 2x2 grid, the fourth slot stays empty
    sns.histplot(train[column], kde=True)

In [None]:
# Convert distribution: preview each target after its candidate transform
# (square root for benzene and carbon monoxide, natural log for nitrogen oxides).
# A loop replaces three copy-pasted panels and avoids the stray Axes repr output.
preview_transforms = [
    ('target_benzene', np.sqrt),
    ('target_carbon_monoxide', np.sqrt),
    ('target_nitrogen_oxides', np.log),
]

plt.figure(figsize=(13,10))
for panel, (column, transform) in enumerate(preview_transforms, start=1):
    plt.subplot(2, 2, panel)  # 2x2 grid, the fourth slot stays empty
    sns.histplot(transform(train[column]), kde=True, color='Green')

In [None]:
# Overwrite each target with its transformed version: sqrt for benzene and
# carbon monoxide, natural log for nitrogen oxides. Predictions therefore have
# to be squared / exponentiated to get back to the original scale.
# Running this cell twice applies the transform twice.
for column, transform in (
    ('target_benzene', np.sqrt),
    ('target_carbon_monoxide', np.sqrt),
    ('target_nitrogen_oxides', np.log),
):
    train[column] = transform(train[column])

In [None]:
# Collect the three target columns into their own frame.
# The column order here fixes the column order of every model's predictions.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
target = pd.DataFrame(train[target_columns])

In [None]:
# Leave only the feature columns in the training frame.
train = train.drop(
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

In [None]:
# Report the shape of both frames.
for caption, frame in (('Shape of train: ', train), ('Shape of test: ', test)):
    print(caption, frame.shape)

In [None]:
# Stack the training rows on top of the test rows so that feature engineering
# is applied to both at once. Original row labels are kept, so labels repeat.
data = pd.concat(objs=[train, test], axis=0)
data.shape

In [None]:
# Work with date column: parse the timestamp, then derive calendar features.
timestamps = pd.to_datetime(data['date_time'], format = '%Y-%m-%d %H:%M:%S')
data['date_time'] = timestamps
data['hour'] = timestamps.dt.hour
# 1 when the hour is between 8 and 20 inclusive, else 0.
data['working_hours'] = data['hour'].isin(np.arange(8,21,1)).astype('int')
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
data['is_weekend'] = (timestamps.dt.dayofweek>=5).astype('int')
# Vectorised accessor instead of a per-row lambda; yields English day names.
data['Day_of_Week'] = timestamps.dt.day_name()
# Ratio of absolute humidity (x100) to relative humidity.
data['SMC'] = (data['absolute_humidity']*100)/data['relative_humidity']

In [None]:
# Peek at the first three rows of the combined frame.
data.head(n=3)

In [None]:
# The raw timestamp and the helper 'hour' column are not used as features.
data = data.drop(columns=['date_time', 'hour'])

In [None]:
# Names of the numeric columns, in their original order.
# Uses the public select_dtypes API instead of the private _get_numeric_data();
# 'bool' is included because the private helper also counts boolean columns as numeric.
data_num_cols = data.select_dtypes(include=['number', 'bool']).columns
data_num_cols

In [None]:
# Every column that is not numeric is treated as categorical.
all_columns = data.columns
data_cat_cols = all_columns.difference(data_num_cols)
data_cat_cols

In [None]:
# Split the combined frame into its numeric and categorical parts.
data_num_data = data[data_num_cols]
data_cat_data = data[data_cat_cols]

for caption, part in (
    ("Shape of num data:", data_num_data),
    ("Shape of cat data:", data_cat_data),
):
    print(caption, part.shape)

In [None]:
# Scale the numeric columns with RobustScaler and wrap the resulting array in
# a DataFrame again (this gives it a fresh 0..n-1 index).
# NOTE(review): the scaler is fitted on train and test rows together.
s_scaler = RobustScaler()
data_num_data_s = pd.DataFrame(
    s_scaler.fit_transform(data_num_data),
    columns=data_num_cols,
)

In [None]:
# One-hot encode the categorical columns and preview the result.
data_cat_data = pd.get_dummies(data=data_cat_data)
data_cat_data.head(n=5)

In [None]:

# Give both parts the same 0..n-1 index so the column-wise concat lines the
# rows up by position rather than by the repeated original labels.
data_num_data_s = data_num_data_s.reset_index(drop=True)
data_cat_data = data_cat_data.reset_index(drop=True)

data_new = pd.concat(objs=[data_num_data_s, data_cat_data], axis=1)

In [None]:
# Split the combined feature matrix back into its train and test parts.
# `data_new` holds the training rows first, followed by the test rows, so the
# boundary is simply the number of training rows. Deriving it from `train`
# instead of hard-coding row labels keeps the split correct when the set of
# dropped outlier rows changes.
n_train_rows = len(train)
train_new = data_new.iloc[:n_train_rows]
test_new = data_new.iloc[n_train_rows:]

# Fail loudly if features, targets and test rows are no longer aligned.
assert len(train_new) == len(target), (len(train_new), len(target))
assert len(test_new) == len(test), (len(test_new), len(test))

print("Shape of train data:", train_new.shape)
print("Shape of test data:", test_new.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation; the fixed seed makes the
# split repeatable.
trainx, valx, trainy, valy = train_test_split(
    train_new,
    target,
    test_size=0.25,
    random_state=1234,
)
for part in (trainx, valx):
    print(part.shape)

## XGBoost

In [None]:
# Fit one regressor per target column through MultiOutputRegressor.
# NOTE(review): XGBRFRegressor is xgboost's random-forest variant, not the
# gradient-boosted XGBRegressor - confirm this is the intended model.
xgb = xgboost.XGBRFRegressor()
xgb_m = MultiOutputRegressor(estimator=xgb)
xgb_m.fit(X=trainx, y=trainy)

In [None]:
# Predict on the training and validation splits (values are still in the
# transformed target space).
pred_train_xgb, pred_val_xgb = (xgb_m.predict(part) for part in (trainx, valx))

In [None]:
# Root mean squared log error on both splits. Predictions go through abs()
# first so that no negative value reaches the log-based metric.
# NOTE(review): both the targets and the predictions are still sqrt/log
# transformed here, so this is not the RMSLE on the original scale.
RMSLE_train_xgb, RMSLE_val_xgb = (
    np.sqrt(mean_squared_log_error(actual, np.abs(predicted)))
    for actual, predicted in ((trainy, pred_train_xgb), (valy, pred_val_xgb))
)

RMSLE_val_xgb

In [None]:
# Predict the (transformed) targets for the test features.
pred_test_xgb = xgb_m.predict(X=test_new)

In [None]:
# Label the prediction array with the target names, in the order used for fitting.
pred_test_xgb = pd.DataFrame(
    data=pred_test_xgb,
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
)
pred_test_xgb.head(n=5)

In [None]:
# Build the XGBoost submission by undoing the target transforms:
# carbon monoxide and benzene were square-rooted, so they are squared;
# nitrogen oxides was log-transformed, so it is exponentiated (cubing the
# prediction is not the inverse of a log and yields wrong-scale values).
submission_xgb = pd.DataFrame(test_copy[['date_time']])
submission_xgb['target_carbon_monoxide'] = np.square(pred_test_xgb['target_carbon_monoxide'])
submission_xgb['target_benzene'] = np.square(pred_test_xgb['target_benzene'])
submission_xgb['target_nitrogen_oxides'] = np.exp(pred_test_xgb['target_nitrogen_oxides'])
submission_xgb.head()

## CatBoost

In [None]:
# Fit one default CatBoost regressor per target column through MultiOutputRegressor.
cat = CatBoostRegressor()
cat_m = MultiOutputRegressor(estimator=cat)
cat_m.fit(X=trainx, y=trainy)

In [None]:
# Predict on the training and validation splits (values are still in the
# transformed target space).
pred_train_cat, pred_val_cat = (cat_m.predict(part) for part in (trainx, valx))

In [None]:
# Root mean squared log error on both splits. Predictions go through abs()
# first so that no negative value reaches the log-based metric.
# NOTE(review): both the targets and the predictions are still sqrt/log
# transformed here, so this is not the RMSLE on the original scale.
RMSLE_train_cat, RMSLE_val_cat = (
    np.sqrt(mean_squared_log_error(actual, np.abs(predicted)))
    for actual, predicted in ((trainy, pred_train_cat), (valy, pred_val_cat))
)

RMSLE_val_cat

In [None]:
# Predict the (transformed) targets for the test features.
pred_test_cat = cat_m.predict(X=test_new)

In [None]:
# Label the prediction array with the target names, in the order used for fitting.
pred_test_cat = pd.DataFrame(
    data=pred_test_cat,
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
)
pred_test_cat.head(n=5)

In [None]:
# Build the CatBoost submission: start from the test timestamps and map each
# prediction back to the original scale (square undoes sqrt, exp undoes log).
submission_cat = pd.DataFrame(test_copy[['date_time']])
inverse_transforms = {
    'target_carbon_monoxide': np.square,
    'target_benzene': np.square,
    'target_nitrogen_oxides': np.exp,
}
for column, invert in inverse_transforms.items():
    submission_cat[column] = invert(pred_test_cat[column])
submission_cat.head()

## AdaBoost Regressor

In [None]:
# Fit one AdaBoost regressor per target column through MultiOutputRegressor.
# The wrapper must receive `ada`; wrapping the CatBoost estimator here would
# train a second CatBoost model and leave the AdaBoost regressor unused.
ada = AdaBoostRegressor()
ada_m = MultiOutputRegressor(ada)
ada_m.fit(trainx, trainy)

In [None]:
# Predict on the training and validation splits (values are still in the
# transformed target space).
pred_train_ada, pred_val_ada = (ada_m.predict(part) for part in (trainx, valx))

In [None]:
# Root mean squared log error on both splits. Predictions go through abs()
# first so that no negative value reaches the log-based metric.
# NOTE(review): both the targets and the predictions are still sqrt/log
# transformed here, so this is not the RMSLE on the original scale.
RMSLE_train_ada, RMSLE_val_ada = (
    np.sqrt(mean_squared_log_error(actual, np.abs(predicted)))
    for actual, predicted in ((trainy, pred_train_ada), (valy, pred_val_ada))
)

RMSLE_val_ada

In [None]:
# Predict the (transformed) targets for the test features.
pred_test_ada = ada_m.predict(X=test_new)

In [None]:
# Label the prediction array with the target names, in the order used for fitting.
pred_test_ada = pd.DataFrame(
    data=pred_test_ada,
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
)
pred_test_ada.head(n=5)

In [None]:
# Build the AdaBoost submission: start from the test timestamps and map each
# prediction back to the original scale (square undoes sqrt, exp undoes log).
submission_ada = pd.DataFrame(test_copy[['date_time']])
inverse_transforms = {
    'target_carbon_monoxide': np.square,
    'target_benzene': np.square,
    'target_nitrogen_oxides': np.exp,
}
for column, invert in inverse_transforms.items():
    submission_ada[column] = invert(pred_test_ada[column])
submission_ada.head()

## RandomForest Regressor

In [None]:
# Fit a default random forest on all three targets at once; no
# MultiOutputRegressor wrapper is used for this model.
rfr = RandomForestRegressor()
rfr.fit(trainx, trainy)

In [None]:
# Predict on the training and validation splits (values are still in the
# transformed target space).
pred_train_rfr, pred_val_rfr = (rfr.predict(part) for part in (trainx, valx))

In [None]:
# Root mean squared log error on both splits. Predictions go through abs()
# first so that no negative value reaches the log-based metric.
# NOTE(review): both the targets and the predictions are still sqrt/log
# transformed here, so this is not the RMSLE on the original scale.
RMSLE_train_rfr, RMSLE_val_rfr = (
    np.sqrt(mean_squared_log_error(actual, np.abs(predicted)))
    for actual, predicted in ((trainy, pred_train_rfr), (valy, pred_val_rfr))
)

RMSLE_val_rfr

In [None]:
# Predict the (transformed) targets for the test features.
pred_test_rfr = rfr.predict(X=test_new)

In [None]:
# Label the prediction array with the target names, in the order used for fitting.
pred_test_rfr = pd.DataFrame(
    data=pred_test_rfr,
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
)
pred_test_rfr.head(n=5)

In [None]:
# Build the random-forest submission: start from the test timestamps and map
# each prediction back to the original scale (square undoes sqrt, exp undoes log).
submission_rfr = pd.DataFrame(test_copy[['date_time']])
inverse_transforms = {
    'target_carbon_monoxide': np.square,
    'target_benzene': np.square,
    'target_nitrogen_oxides': np.exp,
}
for column, invert in inverse_transforms.items():
    submission_rfr[column] = invert(pred_test_rfr[column])
submission_rfr.head()

## LassoCV

In [None]:
# Fit a multi-task Lasso with cross-validated regularisation on all three
# targets at once, using the estimator's default settings.
lasso_model = MultiTaskLassoCV()
lasso_model.fit(X=trainx, y=trainy)

In [None]:
# Predict on the training and validation splits (values are still in the
# transformed target space).
pred_train_lso, pred_val_lso = (lasso_model.predict(part) for part in (trainx, valx))

In [None]:
# Root mean squared log error on both splits. Predictions go through abs()
# first so that no negative value reaches the log-based metric.
# NOTE(review): both the targets and the predictions are still sqrt/log
# transformed here, so this is not the RMSLE on the original scale.
RMSLE_train_lso, RMSLE_val_lso = (
    np.sqrt(mean_squared_log_error(actual, np.abs(predicted)))
    for actual, predicted in ((trainy, pred_train_lso), (valy, pred_val_lso))
)

RMSLE_val_lso

In [None]:
# Predict the (transformed) targets for the test features.
pred_test_lso = lasso_model.predict(X=test_new)

In [None]:
# Label the Lasso prediction array with the target names, in the order used
# for fitting. The frame must be built from the Lasso predictions themselves;
# building it from the random-forest predictions would make the Lasso
# submission a copy of the random-forest one.
pred_test_lso = pd.DataFrame(pred_test_lso, columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])
pred_test_lso.head()

In [None]:
# Build the Lasso submission: start from the test timestamps and map each
# prediction back to the original scale (square undoes sqrt, exp undoes log).
submission_lso = pd.DataFrame(test_copy[['date_time']])
inverse_transforms = {
    'target_carbon_monoxide': np.square,
    'target_benzene': np.square,
    'target_nitrogen_oxides': np.exp,
}
for column, invert in inverse_transforms.items():
    submission_lso[column] = invert(pred_test_lso[column])
submission_lso.head()

## Submission

In [None]:
# Blend the five per-model submissions with an unweighted mean, one target at
# a time, next to the test timestamps.
Submission = pd.DataFrame(test_copy[['date_time']])
blend_members = [submission_ada, submission_cat, submission_lso, submission_rfr, submission_xgb]
for column in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'):
    Submission[column] = sum(member[column] for member in blend_members) / len(blend_members)
Submission.head()

In [None]:
# Final look at the first rows of the blended submission.
Submission.head(n=5)

In [None]:
# Write the blended submission to the working directory without the index column.
Submission.to_csv(path_or_buf='Sub.csv', index=False)