In [None]:
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd 
from sklearn import preprocessing
import sklearn.metrics as metrics
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / seaborn.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the train and test splits of the competition data into DataFrames.
train, test = map(pd.read_csv, (
    '../input/tabular-playground-series-jul-2021/train.csv',
    '../input/tabular-playground-series-jul-2021/test.csv',
))

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# (rows, columns) of the test data.
test.shape

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

### There are no missing values

In [None]:
# Summary statistics of the numeric training columns.
train.describe()

## Univariate Analysis

In [None]:
# Distribution of deg_C in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['deg_C'], kde=True, stat='density', color='red')
ax.set_xlabel('Degree in celcius', size=15)

In [None]:
# Distribution of relative_humidity in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['relative_humidity'], kde=True, stat='density', color='red')
ax.set_xlabel('Relative Humidity', size=15)

In [None]:
# Distribution of absolute_humidity in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['absolute_humidity'], kde=True, stat='density', color='red')
ax.set_xlabel('Absolute Humidity', size=15)

In [None]:
# Distribution of sensor_1 in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['sensor_1'], kde=True, stat='density', color='red')
ax.set_xlabel('Data of sensor 1', size=15)

In [None]:
# Distribution of sensor_2 in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['sensor_2'], kde=True, stat='density', color='red')
ax.set_xlabel('Data of sensor 2', size=15)

In [None]:
# Distribution of sensor_3 in the training set (histogram + KDE).
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['sensor_3'], kde=True, stat='density', color='red')
ax.set_xlabel('Data of sensor 3', size=15)

In [None]:
# Distribution of sensor_4 in the training set (histogram + KDE).
# The x-label previously read "sensor 5" although sensor_4 is the column plotted.
# sns.distplot is deprecated and slated for removal; histplot with kde=True and
# stat='density' is the documented replacement.
plt.rcParams['figure.figsize'] = (20, 10)
ax = sns.histplot(train['sensor_4'], kde=True, stat='density', color='red')
ax.set_xlabel('Data of sensor 4', size=15)

In [None]:
# Column groups: the three pollutant targets and the eight raw input features.
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
features = ['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

# Model inputs: strip the targets and the raw timestamp from train, and the
# raw timestamp from test.
new_train = train.drop(columns=['target_benzene', 'date_time', 'target_carbon_monoxide', 'target_nitrogen_oxides'])
new_test = test.drop(columns='date_time')

### Outliers in train set

In [None]:
import matplotlib.pyplot as plt

# One box plot per input feature of the training set, laid out on a 4x2 grid.
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(4, 2)
fig.subplots_adjust(
    left=0.08, right=0.98, bottom=0.05, top=0.9,
    hspace=0.9, wspace=0.3,
)

# Source column -> panel title (insertion order fixes the panel order).
panel_titles = {
    'deg_C': 'Degree celsuis',
    'relative_humidity': 'Relative Humidity',
    'absolute_humidity': 'Absolute Humidity',
    'sensor_1': 'Sensor 1 Data',
    'sensor_2': 'Sensor 2 Data',
    'sensor_3': 'Sensor 3 Data',
    'sensor_4': 'Sensor 4 Data',
    'sensor_5': 'Sensor 5 Data',
}
df = pd.DataFrame(data={title: train[source] for source, title in panel_titles.items()})

for panel, column in zip(axes.flatten(), df.columns):
    box_ax = df.boxplot(column, ax=panel)

plt.tight_layout()

plt.show()

### Outliers in Test set

In [None]:
import matplotlib.pyplot as plt

# One box plot per input feature of the test set, laid out on a 4x2 grid.
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(4, 2)
fig.subplots_adjust(
    left=0.08, right=0.98, bottom=0.05, top=0.9,
    hspace=0.9, wspace=0.3,
)

# Source column -> panel title (insertion order fixes the panel order).
panel_titles = {
    'deg_C': 'Degree celsuis',
    'relative_humidity': 'Relative Humidity',
    'absolute_humidity': 'Absolute Humidity',
    'sensor_1': 'Sensor 1 Data',
    'sensor_2': 'Sensor 2 Data',
    'sensor_3': 'Sensor 3 Data',
    'sensor_4': 'Sensor 4 Data',
    'sensor_5': 'Sensor 5 Data',
}
df = pd.DataFrame(data={title: test[source] for source, title in panel_titles.items()})

for panel, column in zip(axes.flatten(), df.columns):
    box_ax = df.boxplot(column, ax=panel)

plt.tight_layout()

plt.show()

In [None]:
# Train (palegreen) vs. test (lightcoral) density of deg_C on shared axes.
kde_ax = sns.kdeplot(train['deg_C'], color='palegreen')
sns.kdeplot(test['deg_C'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of absolute_humidity on shared axes.
kde_ax = sns.kdeplot(train['absolute_humidity'], color='palegreen')
sns.kdeplot(test['absolute_humidity'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of relative_humidity on shared axes.
kde_ax = sns.kdeplot(train['relative_humidity'], color='palegreen')
sns.kdeplot(test['relative_humidity'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of sensor_1 on shared axes.
kde_ax = sns.kdeplot(train['sensor_1'], color='palegreen')
sns.kdeplot(test['sensor_1'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of sensor_2 on shared axes.
kde_ax = sns.kdeplot(train['sensor_2'], color='palegreen')
sns.kdeplot(test['sensor_2'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of sensor_3 on shared axes.
kde_ax = sns.kdeplot(train['sensor_3'], color='palegreen')
sns.kdeplot(test['sensor_3'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of sensor_4 on shared axes.
kde_ax = sns.kdeplot(train['sensor_4'], color='palegreen')
sns.kdeplot(test['sensor_4'], color='lightcoral', ax=kde_ax)

In [None]:
# Train (palegreen) vs. test (lightcoral) density of sensor_5 on shared axes.
kde_ax = sns.kdeplot(train['sensor_5'], color='palegreen')
sns.kdeplot(test['sensor_5'], color='lightcoral', ax=kde_ax)

In [None]:
# Parse the timestamps (unparseable values become NaT), then plot the
# carbon monoxide target against time.
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')
time_ax = sns.scatterplot(data=train, x='date_time', y='target_carbon_monoxide')
time_ax.set_title("carbon monoxide distribution over time", size=20)

In [None]:
# Parse the timestamps (unparseable values become NaT), then plot the
# benzene target against time.
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')
time_ax = sns.scatterplot(data=train, x='date_time', y='target_benzene')
time_ax.set_title("Benzene distribution over time", size=20)

In [None]:
# Parse the timestamps (unparseable values become NaT), then plot the
# nitrogen oxides target against time.
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')
time_ax = sns.scatterplot(data=train, x='date_time', y='target_nitrogen_oxides')
time_ax.set_title("Nitrogen Oxides distribution over time", size=20)

### Nitrogen oxides have increased significantly over time, whereas carbon monoxide and benzene don't show much difference

In [None]:
# Carbon monoxide target as a function of temperature (deg_C).
temp_ax = sns.lineplot(data=train, x='deg_C', y='target_carbon_monoxide')
temp_ax.set_title("Relationship between carbon monoxide and degree", size=15)

In [None]:
# Benzene target as a function of temperature (deg_C).
temp_ax = sns.lineplot(data=train, x='deg_C', y='target_benzene')
temp_ax.set_title("Relationship between benzene and degree", size=15)

In [None]:
# Nitrogen oxides target as a function of temperature (deg_C).
temp_ax = sns.lineplot(data=train, x='deg_C', y='target_nitrogen_oxides')
temp_ax.set_title("Relationship between nitrogen oxides and degree", size=15)

In [None]:
# Correlation heatmap of the numeric training columns.
# Restrict to numeric dtypes explicitly: by this point `date_time` has been
# parsed to datetime64, and pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() -- it raises instead.
plt.figure(figsize=(12, 8))
corr_train = train.select_dtypes(include='number').corr()
sns.heatmap(corr_train, annot=True);

In [None]:
# Correlation heatmap of the numeric test columns.
# Restrict to numeric dtypes explicitly: `date_time` is a non-numeric column,
# and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() -- it raises instead.
plt.figure(figsize=(12, 8))
corr_test = test.select_dtypes(include='number').corr()
sns.heatmap(corr_test, annot=True);

In [None]:
# Derive calendar and temperature features on the training frame (in place).
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')

# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the replacement. It returns a nullable UInt32 column,
# so cast back to a plain NumPy dtype (float only if NaT produced missing
# weeks) to keep `.values` a numeric array downstream.
iso_week = train['date_time'].dt.isocalendar().week
train['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
train['month'] = train['date_time'].dt.month
train['hour'] = train['date_time'].dt.hour

train['fer_C'] = (train['deg_C'] * (9 / 5)) + 32  # temperature in Fahrenheit
train['kel_C'] = train['deg_C'] + 273.15          # temperature in Kelvin
# Dew-point estimate T - (100 - RH) / 5.
# NOTE(review): looks like the rule-of-thumb approximation -- confirm intended formula.
train['dew_point'] = train['deg_C'] - ((100 - train['relative_humidity']) / 5)

In [None]:
# Derive calendar and temperature features on the test frame (in place).
test['date_time'] = pd.to_datetime(test['date_time'], errors='coerce')

# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the replacement. It returns a nullable UInt32 column,
# so cast back to a plain NumPy dtype (float only if NaT produced missing
# weeks) to keep `.values` a numeric array downstream.
iso_week = test['date_time'].dt.isocalendar().week
test['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
test['month'] = test['date_time'].dt.month
test['hour'] = test['date_time'].dt.hour

test['fer_C'] = (test['deg_C'] * (9 / 5)) + 32  # temperature in Fahrenheit
test['kel_C'] = test['deg_C'] + 273.15          # temperature in Kelvin
# Dew-point estimate T - (100 - RH) / 5.
# NOTE(review): looks like the rule-of-thumb approximation -- confirm intended formula.
test['dew_point'] = test['deg_C'] - ((100 - test['relative_humidity']) / 5)

In [None]:
# Frame holding only the target columns, in the order given by `targets`.
y = train[targets]

In [None]:
# Split the target frame into one NumPy array per column:
# y1 = first column, y2 = second column, y3 = third column.
first_col, second_col, third_col = list(y.columns)[:3]

y1 = np.array(y[first_col])
y2 = np.array(y[second_col])
y3 = np.array(y[third_col])

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import GridSearchCV

# Tune LightGBM hyper-parameters for the first target (y1) on the inputs in
# `new_train`. 20% of the rows are held out; the grid search itself only sees
# the remaining 80% and uses 2-fold cross-validation.
X_train, X_valid, y_train, y_valid = train_test_split(new_train, y1 , test_size=0.2, random_state=1)
lgbm = LGBMRegressor()
params = {
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'learning_rate': [0.1, 0.03, 0.003],
    'max_depth': [-1, 3, 5],
    'n_estimators': [50, 100, 200, 500],
}
grid= GridSearchCV(estimator=lgbm, param_grid = params, cv = 2, n_jobs=-1)
grid.fit(X_train, y_train)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

# Previously the hold-out split (and the math / msle imports) were never used,
# so the tuned model was only ever scored on cross-validation folds. Score the
# refitted best estimator on the unseen 20% as well.
# Predictions are clipped at 0 because the log error is undefined for negative
# values. NOTE(review): assumes the target itself is non-negative -- confirm.
valid_pred = np.clip(grid.predict(X_valid), 0, None)
print("\n RMSLE of the best estimator on the hold-out split:\n", math.sqrt(msle(y_valid, valid_pred)))

In [None]:
# Feature columns: every test column except the raw timestamp.
# Dropping by name instead of slicing `[1:]` removes the silent assumption that
# `date_time` is the first column; a KeyError is raised if it is missing.
columns = test.columns.drop('date_time')
X = train[columns].values
X_test = test[columns].values

# One array per target, stored as (n_samples, 1) column vectors.
# NOTE(review): scikit-learn style estimators expect a 1-D y, so flatten these
# (e.g. with .ravel()) before passing them to fit().
target_1 = train['target_carbon_monoxide'].values.reshape(-1,1)
target_2 = train['target_benzene'].values.reshape(-1,1)
target_3 = train['target_nitrogen_oxides'].values.reshape(-1,1)

# Submission template whose target columns get filled with predictions.
sub = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv")
sub.head()

In [None]:
# Fit one LightGBM model per target with fixed hyper-parameters and write the
# test predictions into the matching submission column. The same estimator
# object is refitted for each target, so after the loop `lgbm` holds the model
# for the last target.
lgbm = LGBMRegressor(learning_rate= 0.1, max_depth= 5, n_estimators= 500, num_leaves= 7)

for target_name, target_values in (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
):
    # The targets are stored as (n, 1) column vectors; the scikit-learn API
    # expects a 1-D y, so flatten them before fitting.
    lgbm.fit(X, np.ravel(target_values))
    sub[target_name] = lgbm.predict(X_test)

sub.head()

In [None]:
# Write the filled-in submission frame to CSV without the index column.
sub.to_csv('submission with LGBMBoost.csv', index=False)