In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import statsmodels.api as sm

%matplotlib inline

In [None]:
# Importing the dataframe: load the competition training set and parse
# `date_time` as datetimes while reading, so the time-based plots below get a
# proper datetime axis. The path starts with a single slash, like the other
# dataset paths used in this notebook.
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
    parse_dates=["date_time"],
)
df.head()

In [None]:
# Viewing the information: print the training frame's column summary
# (pandas `DataFrame.info`).
df.info()

In [None]:
# Describing the dataframe: summary statistics, transposed so that each
# original column is shown as one row.
df.describe().transpose()

In [None]:
# Creating list of target and features.
# Base column groups:
target = ["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"]
sensor = [f"sensor_{number}" for number in range(1, 6)]
humid = ["relative_humidity", "absolute_humidity"]

# Derived groups, composed from the base groups above:
# the three targets followed by the five sensor columns
value = target + sensor
# the three targets plus the temperature column
temp_value = target + ["deg_C"]
# the timestamp followed by every non-target column
features = ["date_time", "deg_C"] + humid + sensor

In [None]:
# Plotting the variation of gases with time: one wide line chart per target gas.
for t in target:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title("Variation of gases with time")
    sns.lineplot(data=df, x="date_time", y=df[t], ax=ax)

In [None]:
# Pairplot of the three target gas columns against each other.
sns.pairplot(data=df[target])
plt.show()

The histogram on the diagonal allows us to see the distribution of a single variable while the scatter plots on the upper and lower triangles show the relationship (or lack thereof) between two variables.

In [None]:
# Sensor 1 data for the gases and their value over time:
# one scatter plot per target gas, with the gas value mapped to the point colour.
for t in target:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title("Variation gases according to sensor 1")
    sns.scatterplot(data=df, x="date_time", y="sensor_1", hue=df[t], ax=ax)

From the above graphs we can see that the concentration of nitrogen oxides increased between November and December 2010.

In [None]:
# Sensor value over time: one wide line chart per sensor column.
for s in sensor:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title("Sensors value over time")
    sns.lineplot(data=df, x="date_time", y=df[s], ax=ax)

In [None]:
# Pair plot showing relationship between different sensors,
# with histograms on the diagonal.
sns.pairplot(data=df[sensor], diag_kind="hist")
plt.show()

In [None]:
# Correlation matrix shows how the sensor value relates with target.
# Pairwise correlations of the target and sensor columns, drawn as an
# annotated cluster map.
seasonality_corr = df[value].corr()
sns.clustermap(data=seasonality_corr, annot=True)
plt.show();

The benzene values are highly correlated with the values of sensor 2.

In [None]:
# Humidity plot: one wide line chart per humidity column.
for h in humid:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title("Humidity over time")
    sns.lineplot(data=df, x="date_time", y=df[h], ax=ax)

The months of April and May have lower humidity values, while the rainy season, from June to September, has higher humidity values.

In [None]:
# Pairwise view of the two humidity columns: KDE curves on the diagonal, then
# 3-level KDE contours overlaid on both the lower and the upper triangle.
g = sns.pairplot(df[["relative_humidity", "absolute_humidity"]], diag_kind="kde")
for overlay in (g.map_lower, g.map_upper):
    overlay(sns.kdeplot, levels=3, color=".2")

plt.show();

The contour levels show how the density is distributed.

In [None]:
# Temperature variation with relative humidity: relative humidity drives both
# the colour and the size of each point.
fig, ax = plt.subplots(figsize=(16, 4))
sns.scatterplot(
    data=df,
    x="date_time",
    y="deg_C",
    hue="relative_humidity",
    size="relative_humidity",
    ax=ax,
)
ax.set_title("Temperature variation over time")
ax.set_xlabel("Date")
ax.set_ylabel("Deg_C");

Comparing the humidity and temperature graphs, we can say that when the humidity is higher, the corresponding temperature is lower.

In [None]:
# Temperature variation with target gases: Pearson correlations between the
# temperature column and the three targets, drawn as an annotated cluster map.
temp_corr = df[temp_value].corr(method='pearson')
sns.clustermap(data=temp_corr, annot=True)
plt.show();

From the above, we can say that there is no correlation between the temperature and the target gases.

In [None]:
# Move the timestamp column into the index; the cells below decompose the
# series and slice them by date string.
# The guard makes this cell safe to re-run: once `date_time` is the index it is
# no longer a column, and an unconditional set_index would raise KeyError.
if "date_time" in df.columns:
    df = df.set_index("date_time")
df.head()

In [None]:
# Seasonality trend of different gases: plot the seasonal component of each
# target over the full date range.
for t in target:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title(t)
    decomposition = sm.tsa.seasonal_decompose(df[t])
    decomposition.seasonal.plot(ax=ax, color="#FFA07A");

The above graph is too dense to reveal any seasonal pattern, so the next graph shows only one month of data.

In [None]:
# Seasonal component of each target, restricted to 2010-03-10 .. 2010-04-10.
for t in target:
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.set_title(t)
    decomposition = sm.tsa.seasonal_decompose(df[t])
    one_month = decomposition.seasonal["2010-03-10":"2010-04-10"]
    one_month.plot(ax=ax, color="LightCoral");

The y-axis limits differ for each gas, with nitrogen oxides having the largest. There is also a repetitive pattern for all the gases.

In [None]:
# Collect the seasonal component of every target, keyed by column name.
seasonality_dict = {}
for t in target:
    decompositions = sm.tsa.seasonal_decompose(df[t])
    seasonality_dict[t] = decompositions.seasonal

rcParams["figure.figsize"] = (16, 4)

# Plot the 2010-03-10 .. 2010-03-20 window, one subplot per target.
seasonal_frame = pd.DataFrame(seasonality_dict)
seasonal_frame.loc["2010-03-10":"2010-03-20"].plot(subplots=True, layout=(3, 1));


In [None]:
# Collect the trend component of every target (missing values are dropped
# before decomposing), keyed by column name.
trend_dict = {}

for ts in ("target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"):
    decomposition = sm.tsa.seasonal_decompose(df[ts].dropna())
    # Store back the results
    trend_dict[ts] = decomposition.trend

rcParams["figure.figsize"] = (16, 4)
trend_frame = pd.DataFrame(trend_dict)
trend_frame.plot(subplots=True, layout=(3, 1));

In [None]:
# Collect the residual component of every target (missing values are dropped
# before decomposing), keyed by column name.
residual_dict = {}

for ts in ("target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"):
    decomposition = sm.tsa.seasonal_decompose(df[ts].dropna())
    residual_dict[ts] = decomposition.resid

rcParams["figure.figsize"] = (16, 7)

residual_frame = pd.DataFrame(residual_dict)
residual_frame.plot(subplots=True, layout=(3, 1));

In [None]:
# Model inputs: temperature, both humidity columns and the five sensor columns.
X = df.loc[:, [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
]]
# Model outputs: the three target gas columns.
y = df.loc[:, ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]

In [None]:
# Pull each target out as its own NumPy array.
# The arrays are read from `df` rather than from `y`: `y` is rebound to a plain
# list at the end of this cell, so reading `y.loc` here made the cell fail with
# AttributeError whenever it was run a second time.
y_CO = np.array(df['target_carbon_monoxide'])  # first column for prediction
y_benzene = np.array(df['target_benzene'])
y_NO2 = np.array(df['target_nitrogen_oxides'])

# Same order as the target columns: carbon monoxide, benzene, nitrogen oxides.
y = [y_CO, y_benzene, y_NO2]

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score


# Hold out 20% of the rows for validation. Only the carbon-monoxide target is
# tuned in this cell.
# NOTE(review): train_test_split shuffles rows by default, so validation rows
# are interleaved in time with training rows - confirm this is acceptable for
# time-ordered data.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_CO, test_size=0.2, random_state=1)

lgbm = LGBMRegressor()
# Search grid: 6 * 3 * 3 * 4 = 216 parameter combinations, each evaluated with
# 2-fold cross-validation on the training split.
params = {
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'learning_rate': [0.1, 0.03, 0.003],
    'max_depth': [-1, 3, 5],
    'n_estimators': [50, 100, 200, 500],
}
grid = GridSearchCV(estimator=lgbm, param_grid=params, cv=2, n_jobs=-1)
grid.fit(X_train, y_train)

# Score the fitted search object on the held-out validation split.
y_pred = grid.predict(X_valid)
score = r2_score(y_valid, y_pred)

print(" Results from Grid Search: ")
print("-" * 20)
print("\n The best estimator across all searched parameters:\n", grid.best_estimator_)
print("\n The best score across all searched parameters:\n", grid.best_score_)
print("\n The best parameters across all searched parameters:\n", grid.best_params_)
# "\n" (newline escape) instead of the literal "/n" that was printed before.
print("\n The r2 score is: \n", score)

In [None]:
# Validation predictions and true values side by side, as a plain dict.
dict_compare = dict(y_pred=y_pred, y_valid=y_valid)
dict_compare

In [None]:
# Validation predictions and true values side by side, one row per validation sample.
df_compare = pd.DataFrame(
    data={
        "y_pred": y_pred,
        "y_valid": y_valid,
    }
)
df_compare

In [None]:
# Load the test set, parsing `date_time` as datetimes and using it as the index.
df_test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jul-2021/test.csv",
    index_col="date_time",
    parse_dates=["date_time"],
)

In [None]:
# Use every test-set column as a model input. `df_test` is read with
# index_col="date_time", so the timestamp is already the index and there is no
# leading column to skip; slicing with [1:] silently dropped `deg_C`, which was
# part of the feature set used for the grid search.
columns = df_test.columns
X = df[columns].values
X_test = df_test[columns].values

# Since we are to predict 3 targets, we set up one target array per gas.
# NOTE(review): each array is reshaped to a column vector of shape (n, 1).
# LightGBM's scikit-learn API documents `y` as 1-D, so the reshape is probably
# unnecessary - confirm before removing it.
target_CO = df['target_carbon_monoxide'].values.reshape(-1, 1)
target_benzene = df['target_benzene'].values.reshape(-1, 1)
target_NO2 = df['target_nitrogen_oxides'].values.reshape(-1, 1)

# Submission template whose target columns are filled in by the next cell.
sub = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv")
sub.head()

In [None]:
# One regressor object with fixed hyper-parameters, refitted for each target in
# turn; each fit's test-set predictions go into the matching submission column.
lgbm = LGBMRegressor(learning_rate=0.1, max_depth=5, n_estimators=500, num_leaves=7)

for column_name, target_values in (
    ('target_carbon_monoxide', target_CO),
    ('target_benzene', target_benzene),
    ('target_nitrogen_oxides', target_NO2),
):
    lgbm.fit(X, target_values)
    sub[column_name] = lgbm.predict(X_test)

sub.head()

In [None]:
# Write the submission file, leaving out the row index.
sub.to_csv(path_or_buf='Final Submission LGBMBoost.csv', index=False)

In [None]:
# Show the (rows, columns) shape of the submission frame.
sub.shape

Please share any suggestions for improving this notebook, and please upvote it if you like it.