In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, entries in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, entry) for entry in entries):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the competition's train and test splits.
train_csv="../input/tabular-playground-series-jul-2021/train.csv"
test_csv="../input/tabular-playground-series-jul-2021/test.csv"
df_train,df_test=pd.read_csv(train_csv),pd.read_csv(test_csv)

# Data Exploration:

## 1. Description and Statistics:

In [None]:
# Report the row and column counts of each split.
train_rows,train_cols=df_train.shape
test_rows,test_cols=df_test.shape
print(f"The train dataset has {train_rows} entries and {train_cols} columns.\n")
print(f"The test dataset has {test_rows} entries and {test_cols} columns.")

In [None]:
df_train.info()

There are 9 features and 3 target variables in the train set.

In [None]:
df_train.describe()

The table above shows basic summary statistics of the train set (count, mean, standard deviation, quartiles, etc.) for every numeric column.

## 2. Missing Values Handling:

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

There are no missing values in the train set and test set.

## 3. Outlier Detection:

In [None]:
# One boxplot per numeric feature, filled row by row on a 4x2 grid, to eyeball
# outliers.
fig,ax=plt.subplots(4,2,figsize=(30,20))
boxplot_features=['deg_C','relative_humidity','absolute_humidity','sensor_1',
                  'sensor_2','sensor_3','sensor_4','sensor_5']
for panel,feature in zip(ax.flat,boxplot_features):
    sns.boxplot(x=feature, data=df_train,ax=panel)
    panel.set_title(feature)
fig.suptitle("Outlier Check",size=30)

The boxplots above show outliers in the "sensor" variables. Since all the "sensor" variables are experimentally determined quantities, the outliers present in them are *NATURAL OUTLIERS* and cannot be removed.

To find the number of outliers in each column (values more than 3 standard deviations away from the column mean):

In [None]:
def outlier_no(df,col):
    """Count the rows of ``df`` whose ``col`` value lies within mean ± 3·std.

    Note that this is the number of *non*-outlier rows; callers subtract it
    from the total row count to get the number of outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col : str
        Name of the column.

    Returns
    -------
    int
        Number of rows inside the closed interval
        ``[mean - 3*std, mean + 3*std]``.
    """
    values=df[col]
    center=values.mean()
    spread=values.std()
    # Both bounds are inclusive, matching the <= / >= comparison.
    inside_band=values.between(center-3*spread,center+3*spread)
    return int(inside_band.sum())

In [None]:
# Outliers per column = total rows minus the rows outlier_no reports as lying
# inside the mean ± 3·std band.
cols=['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1',
       'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
total_rows=df_train.shape[0]
for column in cols:
    inside_band=outlier_no(df_train,column)
    print(column," has", str(total_rows-inside_band))

## 4. Univariate Analysis:

Probability Distribution Curve:

In [None]:
# Kernel density estimate of every numeric feature, filled row by row on a
# 4x2 grid.
fig,ax=plt.subplots(4,2,figsize=(30,20))
kde_features=['deg_C','relative_humidity','absolute_humidity','sensor_1',
              'sensor_2','sensor_3','sensor_4','sensor_5']
for panel,feature in zip(ax.flat,kde_features):
    sns.kdeplot(x=feature, data=df_train,ax=panel)
    # Title every panel (sensor_5 included) so each plot stands on its own.
    panel.set_title(feature)

# Basic Feature Additions:

In [None]:
# Convert the "date_time" column of both splits to pandas datetimes.
for frame in (df_train,df_test):
    frame["date_time"]=pd.to_datetime(frame["date_time"])

Adding corresponding day,month,hour and day of week of each date/time entry.

In [None]:
# Derive the calendar features from "date_time" with the vectorized .dt
# accessor rather than a per-row Python lambda.
for frame in (df_train,df_test):
    timestamps=frame["date_time"]
    frame["month"]=timestamps.dt.month
    frame["day"]=timestamps.dt.day
    frame["hour"]=timestamps.dt.hour
    frame["dayofweek"]=timestamps.dt.dayofweek   # Monday=0 ... Sunday=6

Season ---->

         Spring--For the months MARCH,APRIL,MAY
         Summer--For the months JUNE,JULY,AUGUST
         Autumn--For the months SEPTEMBER,OCTOBER,NOVEMBER
         Winter--For the months DECEMBER,JANUARY,FEBRUARY
Time of Day ---->

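         Morning is from 6 AM to 11:59 AM (sunrise typically occurs around 6 AM).
         Afternoon is from 12:00 PM to around 5:00 PM (17:00).
         Evening is from 5:01 PM (17:01) to 8 PM (20:00).
         Night is from sunset to sunrise, so from 8:01 PM (20:01) until 5:59 AM.
         In the code below these are assigned from the hour value: 6-11 Morning, 12-17 Afternoon, 18-20 Evening, otherwise Night.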

In [None]:
# Adding Season for train and test.
# A single vectorized lookup from month number to season name; this does not
# depend on the frames having a 0..n-1 index and avoids one .loc write per row.
SEASON_BY_MONTH={
    3:"Spring",4:"Spring",5:"Spring",
    6:"Summer",7:"Summer",8:"Summer",
    9:"Autumn",10:"Autumn",11:"Autumn",
    12:"Winter",1:"Winter",2:"Winter",
}
df_train["Season"]=df_train["month"].map(SEASON_BY_MONTH)
df_test["Season"]=df_test["month"].map(SEASON_BY_MONTH)


In [None]:
# Adding TimeofDay for train and test.
# Each row is labelled from its "hour" value in one vectorized pass:
# 6-11 Morning, 12-17 Afternoon, 18-20 Evening, anything else Night.
for frame in (df_train,df_test):
    hour=frame["hour"]
    frame["TimeofDay"]=np.select(
        [
            hour.isin(np.arange(6,12)),
            hour.isin(np.arange(12,18)),
            hour.isin(np.arange(18,21)),
        ],
        ["Morning","Afternoon","Evening"],
        default="Night",
    )


Weekday/Weekend :

In [None]:
def funct_week(n):
    """Label a day-of-week number as "Weekday" or "Weekend".

    Values equal to 0, 1, 2, 3 or 4 give "Weekday"; every other value gives
    "Weekend".
    """
    return "Weekday" if n in (0,1,2,3,4) else "Weekend"

In [None]:
# Build the label from "date_time" rather than from the "dayofweek" column
# itself: transforming the column in place would map the already-written
# "Weekday"/"Weekend" strings to "Weekend" on a second run of this cell.
df_train["dayofweek"]=df_train["date_time"].dt.dayofweek.map(funct_week)
df_test["dayofweek"]=df_test["date_time"].dt.dayofweek.map(funct_week)

Dew Temp (approximated as $T - \frac{100 - RH}{5}$, where $T$ is `deg_C` and $RH$ is `relative_humidity`):

In [None]:
# Dew-temperature feature: temperature minus one fifth of the humidity
# deficit (100 - relative humidity).
for frame in (df_train,df_test):
    humidity_deficit=100-frame["relative_humidity"]
    frame["dewtemp"]=frame["deg_C"]-(humidity_deficit/5)

In [None]:
df_train.head()

# Removing unwanted Features:

In [None]:
# Remove the helper calendar columns from both splits.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
for frame in (df_train,df_test):
    frame.drop(columns=["month","day"],inplace=True,errors="ignore")

In [None]:
# Correlate only the numeric columns: the frame also holds datetime and string
# columns, which corr() refuses to process on recent pandas versions.
df_train.select_dtypes(include="number").corr()

# Visualizing the target variable:

In [None]:
# Density of each raw target, one panel per target.
# fill=True is the current seaborn spelling of the deprecated shade=True.
fig,ax=plt.subplots(1,3,figsize=(30,5))
raw_targets=["target_carbon_monoxide","target_benzene","target_nitrogen_oxides"]
for panel,target in zip(ax,raw_targets):
    sns.kdeplot(x=df_train[target],ax=panel,fill=True)
    panel.set_title(target)

Since all Target variables are Right-Skewed(Log-Normal), we apply LOG-TRANSFORMATION:

## Log Transformation of Target variables:

In [None]:
# Log-transform each target into a new "log_*" column.
# Carbon monoxide uses a plain log; benzene and nitrogen oxides use log(1 + x).
# The submission cell inverts these with exp and exp(.) - 1 respectively, so
# any change here must be mirrored there.
# NOTE(review): the plain log assumes every carbon monoxide value is strictly
# positive — confirm against the data.
df_train["log_carbon_monoxide"]=np.log(df_train["target_carbon_monoxide"])
df_train["log_benzene"]=np.log(1+df_train["target_benzene"])
df_train["log_nitrogen_oxides"]=np.log(1+df_train["target_nitrogen_oxides"])

In [None]:
# Density of each log-transformed target, one panel per target.
# fill=True is the current seaborn spelling of the deprecated shade=True.
fig,ax=plt.subplots(1,3,figsize=(30,5))
log_targets=["log_carbon_monoxide","log_benzene","log_nitrogen_oxides"]
for panel,target in zip(ax,log_targets):
    sns.kdeplot(x=df_train[target],ax=panel,fill=True)
    panel.set_title(target)

Now Target variables almost look Normally Distributed. 

In [None]:
# The raw targets are superseded by their "log_*" versions.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df_train.drop(columns=["target_carbon_monoxide","target_benzene","target_nitrogen_oxides"],inplace=True,errors="ignore")

# Bivariate Analysis

## Time Series Analysis:

Categorisation based on "Season":

In [None]:
plt.figure(figsize=(30,5))
sns.lineplot(x="date_time",y="deg_C",data=df_train,hue="Season")

In [None]:
# One Season-coloured time series per feature and log-target, stacked
# vertically.
fig,ax=plt.subplots(10,1,figsize=(30,25))
season_series=["relative_humidity","absolute_humidity",
               "sensor_1","sensor_2","sensor_3","sensor_4","sensor_5",
               "log_carbon_monoxide","log_benzene","log_nitrogen_oxides"]
for panel,series in zip(ax,season_series):
    sns.lineplot(x="date_time",y=series,data=df_train,hue="Season",ax=panel)

plt.suptitle("Time Series Analysis",size=30)


Categorisation based on "TimeOfDay":

In [None]:
plt.figure(figsize=(30,10))
sns.lineplot(x="date_time",y="deg_C",data=df_train,hue="TimeofDay")

In [None]:
# One TimeofDay-coloured time series per feature and log-target, stacked
# vertically. Looping over the column names replaces ten copy-pasted calls and
# leaves no stray Axes repr as the cell's text output.
fig,ax=plt.subplots(10,1,figsize=(30,50))
time_of_day_series=["relative_humidity","absolute_humidity",
                    "sensor_1","sensor_2","sensor_3","sensor_4","sensor_5",
                    "log_carbon_monoxide","log_benzene","log_nitrogen_oxides"]
for panel,series in zip(ax,time_of_day_series):
    sns.lineplot(x="date_time",y=series,data=df_train,hue="TimeofDay",ax=panel)


## Pairplot Visualization:

In [None]:
df_train.columns

In [None]:
# Pairwise plots of the sensors and log-targets, coloured by Season.
season_pair_columns=['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
                     'log_carbon_monoxide','log_benzene', 'log_nitrogen_oxides']
x=df_train[season_pair_columns+['Season']]
sns.pairplot(x, hue ='Season')
plt.show()

In [None]:
# Pairwise plots of the sensors and log-targets, coloured by TimeofDay.
time_pair_columns=['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
                   'log_carbon_monoxide','log_benzene', 'log_nitrogen_oxides']
x=df_train[time_pair_columns+['TimeofDay']]
sns.pairplot(x, hue ='TimeofDay')
plt.show()

In [None]:
# Pairwise plots of the sensors and log-targets, coloured by dayofweek.
week_pair_columns=['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
                   'log_carbon_monoxide','log_benzene', 'log_nitrogen_oxides']
x=df_train[week_pair_columns+['dayofweek']]
sns.pairplot(x, hue ='dayofweek')
plt.show()

# Label Encoding Categorical Variables:

In [None]:
# Separate the training frame into the feature matrix and the three
# log-targets; the timestamp column is excluded from both feature sets.
log_target_columns=["log_benzene","log_nitrogen_oxides","log_carbon_monoxide"]
X=df_train.drop(columns=log_target_columns+["date_time"])
test=df_test.drop(columns=["date_time"])
y_CO,y_C6H6,y_NO=(df_train[name] for name in
                  ("log_carbon_monoxide","log_benzene","log_nitrogen_oxides"))

In [None]:
# Learn the Season label mapping on the training features, then apply the
# same mapping to both splits.
le=LabelEncoder().fit(X["Season"])
X["Season"]=le.transform(X["Season"])
test["Season"]=le.transform(df_test["Season"])

In [None]:
# Learn the TimeofDay label mapping on the training features, then apply the
# same mapping to both splits.
le=LabelEncoder().fit(X["TimeofDay"])
X["TimeofDay"]=le.transform(X["TimeofDay"])
test["TimeofDay"]=le.transform(df_test["TimeofDay"])

In [None]:
# Learn the dayofweek label mapping on the training features, then apply the
# same mapping to both splits.
le=LabelEncoder().fit(X["dayofweek"])
X["dayofweek"]=le.transform(X["dayofweek"])
test["dayofweek"]=le.transform(df_test["dayofweek"])

In [None]:
X.head()

# Standard scaling the data:

In [None]:
# Fit the scaler on the training features only, then standardize both splits
# with those training statistics.
sc=StandardScaler().fit(X)
X=sc.transform(X)
test=sc.transform(test)

# Model Selection

In [None]:
def fun_best(X,y,cv=5,scoring="neg_mean_squared_error"):
    """Cross-validate a fixed set of default-parameter regressors on (X, y).

    Parameters
    ----------
    X : feature matrix, passed straight to ``cross_val_score``.
    y : target values aligned with ``X``.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (default 5).
    scoring : scorer name forwarded to ``cross_val_score``. Fold scores are
        multiplied by -1 before averaging, so this should be one of
        scikit-learn's negated-error scorers (default
        ``"neg_mean_squared_error"``).

    Returns
    -------
    dict
        Model display name -> mean of the sign-flipped fold scores, in the
        order the models were evaluated.
    """
    # Evaluation order is kept fixed; dicts preserve insertion order.
    candidates={
        "Linear Regression":LinearRegression(),
        "Lasso":Lasso(),
        "Ridge":Ridge(),
        "Random Forest Regressor":RandomForestRegressor(),
        "XGB Regressor":XGBRegressor(),
        "LGBM Regressor":LGBMRegressor(),
        # verbose=0 stops CatBoost printing a log line per boosting iteration
        # for every fold; it does not affect the fitted model.
        "Cat Boost Regressor":CatBoostRegressor(verbose=0),
        "Gradient Boosting Regressor":GradientBoostingRegressor(),
    }

    mean_scores={}
    for label,estimator in candidates.items():
        fold_scores=-1*cross_val_score(estimator,X,y,cv=cv,scoring=scoring)
        mean_scores[label]=fold_scores.mean()
    return mean_scores



## 1. Predicting CO:

In [None]:
result=fun_best(X,y_CO)


In [None]:
print(result)

**LGBM wins!**

## 2. Predicting C6H6:

In [None]:
result=fun_best(X,y_C6H6)
print(result)

**LGBM wins!**

## 3. Predicting NO:

In [None]:
result=fun_best(X,y_NO)
print(result)

**CatBoost Regressor wins!**

# Final Predictions:

## 1. With Base Parameters:

Since **LGBM Regressor** and **Cat Boost Regressor** have given the best results, we will use them to predict our test results.

In [None]:
# Baseline CO model: LightGBM with default parameters, fitted on all training
# rows and applied to the test features.
lg=LGBMRegressor().fit(X,y_CO)
y_pred_CO=lg.predict(test)


In [None]:
# Baseline C6H6 model: LightGBM with default parameters, fitted on all
# training rows and applied to the test features.
lg=LGBMRegressor().fit(X,y_C6H6)
y_pred_C6H6=lg.predict(test)

In [None]:
# Baseline NO model: CatBoost with default parameters, fitted on all training
# rows and applied to the test features.
cb=CatBoostRegressor().fit(X,y_NO)
y_pred_NO=cb.predict(test)

In [None]:
#res=pd.DataFrame({"date_time":df_test["date_time"],"target_carbon_monoxide":np.exp(y_pred_CO),"target_benzene":(np.exp(y_pred_C6H6)-1),"target_nitrogen_oxides":(np.exp(y_pred_NO)-1)})

In [None]:
#res.to_csv("Result.csv",index=False)

## 2. Parameter tuning:

### (i)For CO:

In [None]:
# Candidate values for the randomized hyperparameter search on the CO model.
# NOTE(review): learning_rate is not sorted (0.025 precedes 0.02). This is
# presumably harmless, but reordering the list may change which combinations a
# seeded search draws — confirm before tidying.
n_estimators=[950,1000,1010,1020,1030,1040,1050,1060,1070,1080,1090,1100,1150,1200,1250]
max_depth=[2,3,4,5,6,7,8,9,10]
learning_rate=[0.01,0.015,0.025,0.02,0.03,0.035,0.04,0.045,0.05,0.055,0.06]
min_child_weight=[1,2,3,4,5,6,7,8]

# Keys are the LGBMRegressor constructor arguments that each list is tried for.
hyperparameter_grid={
    "n_estimators":n_estimators,
    "max_depth": max_depth,
    "learning_rate" : learning_rate,
    "min_child_weight" : min_child_weight
    }

In [None]:
# Randomized search over the LightGBM grid: 50 sampled settings, 5-fold CV,
# scored by negative RMSE, seeded for repeatable sampling.
model=LGBMRegressor()
search_options=dict(cv=5,n_iter=50,scoring="neg_root_mean_squared_error",
                    n_jobs=4,verbose=5,return_train_score=True,random_state=1)
random_cv=RandomizedSearchCV(estimator=model,
                             param_distributions=hyperparameter_grid,
                             **search_options)

In [None]:
random_cv.fit(X,y_CO)

In [None]:
random_cv.best_params_

In [None]:
random_cv.best_estimator_

In [None]:
# Score LightGBM with the hand-entered hyperparameters below using 5-fold CV
# mean squared error on the CO target.
lgbm=LGBMRegressor(n_estimators=1010,learning_rate=0.01,
                   max_depth=6,min_child_weight=5)
fold_neg_mse=cross_val_score(lgbm,X,y_CO,cv=5,scoring="neg_mean_squared_error")
score_lg=-1*fold_neg_mse
print(score_lg.mean())



In [None]:
# Refit the tuned CO model on all training rows; these predictions replace
# the baseline y_pred_CO.
y_pred_CO=lgbm.fit(X,y_CO).predict(test)

### (ii)For C6H6:

In [None]:
# Candidate values for the randomized hyperparameter search on the C6H6 model
# (same values as the grid used for CO).
# NOTE(review): learning_rate is not sorted (0.025 precedes 0.02). This is
# presumably harmless, but reordering the list may change which combinations a
# seeded search draws — confirm before tidying.
n_estimators=[950,1000,1010,1020,1030,1040,1050,1060,1070,1080,1090,1100,1150,1200,1250]
max_depth=[2,3,4,5,6,7,8,9,10]
learning_rate=[0.01,0.015,0.025,0.02,0.03,0.035,0.04,0.045,0.05,0.055,0.06]
min_child_weight=[1,2,3,4,5,6,7,8]

# Keys are the LGBMRegressor constructor arguments that each list is tried for.
hyperparameter_grid={
    "n_estimators":n_estimators,
    "max_depth": max_depth,
    "learning_rate" : learning_rate,
    "min_child_weight" : min_child_weight
    }

In [None]:
# Randomized search over the LightGBM grid: 50 sampled settings, 5-fold CV,
# scored by negative RMSE, seeded for repeatable sampling.
model=LGBMRegressor()
search_options=dict(cv=5,n_iter=50,scoring="neg_root_mean_squared_error",
                    n_jobs=4,verbose=5,return_train_score=True,random_state=1)
random_cv=RandomizedSearchCV(estimator=model,
                             param_distributions=hyperparameter_grid,
                             **search_options)

In [None]:
random_cv.fit(X,y_C6H6)

In [None]:
random_cv.best_params_

In [None]:
random_cv.best_estimator_

In [None]:
# Score LightGBM with the hand-entered hyperparameters below using 5-fold CV
# mean squared error on the C6H6 target.
lgbm=LGBMRegressor(n_estimators=1040,learning_rate=0.06,
                   max_depth=2,min_child_weight=2)
fold_neg_mse=cross_val_score(lgbm,X,y_C6H6,cv=5,scoring="neg_mean_squared_error")
score_lgbm=-1*fold_neg_mse
print(score_lgbm.mean())



In [None]:
# Refit the tuned C6H6 model on all training rows; these predictions replace
# the baseline y_pred_C6H6.
y_pred_C6H6=lgbm.fit(X,y_C6H6).predict(test)

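### (iii) For NO:

No tuning is performed for NO; the submission below uses the base-parameter CatBoost predictions (`y_pred_NO`) computed earlier.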

In [None]:
# Assemble the submission frame, undoing the log transforms: exp for carbon
# monoxide, exp(.) - 1 for benzene and nitrogen oxides.
submission_columns={
    "date_time":df_test["date_time"],
    "target_carbon_monoxide":np.exp(y_pred_CO),
    "target_benzene":(np.exp(y_pred_C6H6)-1),
    "target_nitrogen_oxides":(np.exp(y_pred_NO)-1),
}
res=pd.DataFrame(submission_columns)

In [None]:
res.to_csv("Result.csv",index=False)