In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,KFold
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths=(
    os.path.join(folder,name)
    for folder,_subdirs,names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn from here on.
plt.style.use('ggplot')

In [None]:
# Load the competition files. The input directory is kept in a single constant
# so the notebook can be pointed at another location by editing one line.
DATA_DIR='../input/tabular-playground-series-jul-2021'
train=pd.read_csv(f'{DATA_DIR}/train.csv')
test=pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission=pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# First and last timestamp in train. date_time still holds the raw values read from
# the CSV at this point; it is only converted to datetime in a later cell.
train['date_time'].min(),train['date_time'].max()

In [None]:
# First and last timestamp in test. date_time still holds the raw values read from
# the CSV at this point; it is only converted to datetime in a later cell.
test['date_time'].min(),test['date_time'].max()

* Train data runs from Mar 2010 to Jan 2011. Only one data point falls in Jan 2011, and a closer look shows that the same data point is repeated in the test set. Let's remove that row from the train data and use only the 2010 data for training.
* Test data covers Jan 2011 to Apr 2011.

In [None]:
# Parse the train timestamps into datetimes using the explicit 'YYYY-MM-DD HH:MM:SS' format.
train['date_time']=pd.to_datetime(train['date_time'],format='%Y-%m-%d %H:%M:%S')

In [None]:
# Parse the test timestamps into datetimes using the explicit 'YYYY-MM-DD HH:MM:SS' format.
test['date_time']=pd.to_datetime(test['date_time'],format='%Y-%m-%d %H:%M:%S')

In [None]:
# (rows, columns) of train before the 2011-01-01 row is dropped.
train.shape

In [None]:
# Keep every row whose timestamp is not exactly 2011-01-01 (midnight), then
# renumber the index from 0 so positional and label indexing agree afterwards.
keep_mask=train['date_time']!='2011-01-01'
train=train[keep_mask].reset_index(drop=True)

In [None]:
# (rows, columns) of train after the 2011-01-01 row has been dropped.
train.shape

In [None]:
# Histograms of train.columns[1:9] on a 4x2 grid, filled column-major
# (the first four columns go down the left side, the next four down the right).
fig,ax=plt.subplots(4,2,figsize=(20,15))
# The vertical spacing is a property of the whole figure: set it once, not once per panel.
fig.subplots_adjust(hspace=0.45)
for panel,col in zip(ax.T.ravel(),train.columns[1:9]):
    panel.hist(train[col],bins=40,color='darkblue',label=f'{col}')
    panel.set_title(f'Distribution of {col}',fontsize=15)
    panel.set_xlabel(f'{col}')
    panel.set_ylabel('Dist')

* The distribution of deg_C peaks between 20 and 30 deg.
* Relative humidity dips at about 40%, with two peaks at approximately 30% and 45%.
* Absolute humidity peaks at about 0.25 g/m3 (I have assumed the unit is g/m3; the data info does not explicitly mention any units).
* The distributions of sensor_1, 2, 3 and 5 appear to be left skewed, whereas sensor_4 looks normal with outliers at 500.

In [None]:
# Histograms of train.columns[9:12], one panel per column stacked vertically.
# NOTE(review): these are presumably the three target columns -- confirm against train.columns.
fig,ax=plt.subplots(3,1,figsize=(10,15))
# The vertical spacing is a property of the whole figure: set it once, not once per panel.
fig.subplots_adjust(hspace=0.45)
for panel,col in zip(ax,train.columns[9:12]):
    panel.hist(train[col],bins=40,color='darkblue',label=f'{col}')
    panel.set_title(f'Distribution of {col}',fontsize=15)
    panel.set_xlabel(f'{col}')
    panel.set_ylabel('Dist')

In [None]:
# Daily mean temperature: index by timestamp, keep deg_C, then average per calendar day.
train.set_index('date_time')[['deg_C']].resample('D').mean()

In [None]:
# Following code is inspired from - https://www.kaggle.com/nroman/eda-for-ashrae
# Line plot of the daily mean deg_C over the training period.
fig,ax=plt.subplots(1,1,figsize=(15,10))
train_daily_deg_c=train.set_index('date_time')['deg_C'].resample('D').mean()
# The series is averaged per day, so the label says "by day" (it used to say "by hour").
train_daily_deg_c.plot(ax=ax,label='by day(train)',alpha=1,color='blue')
ax.set_ylabel('deg C',fontsize=10)
ax.set_title('Trend of Mean deg_C by Day',fontsize=12)
# Trailing semicolon keeps the Text repr of the last call out of the cell output.
ax.set_xlabel('');

Looking at the day-wise trend, we see sudden peaks and dips on certain days across the months. While temperatures stay above 20 deg after late May, they dip below 15 deg after November, but there is a sudden increase in mid-December.

In [None]:
# Line plot of the daily mean deg_C over the test period.
fig,ax=plt.subplots(1,1,figsize=(15,10))
test_daily_deg_c=test.set_index('date_time')['deg_C'].resample('D').mean()
# This figure shows the test set: the label and title were copied from the train plot
# and wrongly said "train"; they now identify the data actually plotted.
test_daily_deg_c.plot(ax=ax,label='by day(test)',alpha=1,color='blue')
ax.set_ylabel('deg C',fontsize=10)
ax.set_title('Trend of Mean deg_C by Day (test)',fontsize=12)
# Trailing semicolon keeps the Text repr of the last call out of the cell output.
ax.set_xlabel('');

For the test set, temperatures start at approximately 3 deg and rise above 15 deg; this can be compared with the month of March in the train set.

Let's build a quick baseline model and then come back to the EDA to look for something useful.

In [None]:
## Quick modelling:
n_folds=5
# Using random gbm params.
# NOTE: 'n_estimators' is deliberately NOT set here. It is an alias of num_iterations,
# and when it is present in params LightGBM uses it instead of the num_boost_round
# argument of lgb.train(). The previous 'n_estimators':30 therefore capped every model
# at 30 boosting rounds despite num_boost_round=3000 plus early stopping in the training cell.
params={"objective":'regression',
        'learning_rate':0.06,
        'num_leaves':2**7-1,
        'min_child_samples':8,
        'n_jobs':-1,
        'max_depth':-1,
        'metric':'l2',
        'tree_learner':'serial',
        'bagging_fraction':0.8,
        'bagging_freq':5,
        'seed':42}
# Targets are the columns whose name starts with 'target'.
target=[c for c in train.columns if c.startswith('target')]
# Predictors are every remaining column except the timestamp; deriving the drop list
# from `target` keeps the two definitions from drifting apart.
pred_cols=train.columns.drop(['date_time']+target)

In [None]:
# Move all target columns to the log1p scale in one vectorized assignment
# (predictions are mapped back with expm1 in the training cell).
# Not idempotent: re-running this cell applies log1p again.
train[target]=np.log1p(train[target])

In [None]:
# This cell previously held an unfinished statement (`temp[t]=`, with `temp` undefined),
# which is a SyntaxError and stops "Run All". It now shows summary statistics of the
# target columns as they stand after the log1p cell, as a quick sanity check.
train[target].describe()

In [None]:
# K-fold training: for every target, fit one LightGBM model per fold, collect
# out-of-fold predictions, average the test predictions over folds and record
# feature importances. Targets are on the log1p scale at this point.
folds=KFold(n_splits=n_folds,shuffle=False)
oof_preds=np.zeros((train.shape[0],len(target)))
subs=np.zeros((test.shape[0],len(target)))
feature_importance=pd.DataFrame()
importance_frames=[]  # one frame per (target, fold); concatenated once after the loops
for g,t in enumerate(target):
    print(f'********Starting training for target {t}***********')
    val_scores=[0]*n_folds
    for i,(trn_idx,val_idx) in enumerate(folds.split(train)):
        # KFold yields positional indices, so rows are selected with .iloc.
        trn_X,trn_Y=train[pred_cols].iloc[trn_idx],train[t].iloc[trn_idx]
        val_X,val_Y=train[pred_cols].iloc[val_idx],train[t].iloc[val_idx]

        trn_df=lgb.Dataset(trn_X,label=trn_Y)
        val_df=lgb.Dataset(val_X,label=val_Y)

        watchlist=[trn_df,val_df]

        # NOTE(review): LightGBM >= 4.0 no longer accepts early_stopping_rounds / verbose_eval
        # here; on those versions pass callbacks=[lgb.early_stopping(30), lgb.log_evaluation(30)].
        model=lgb.train(params,
                        train_set=trn_df,
                        num_boost_round=3000,
                        early_stopping_rounds=30,
                        valid_sets=watchlist,
                        verbose_eval=30)

        # Build a fresh frame for this fold instead of overwriting one shared frame,
        # so the importances of every fold are kept (previously only the last fold survived).
        feature_importance=pd.DataFrame({'target':t,
                                         'columns':list(pred_cols),
                                         'folds':i,
                                         'imp':model.feature_importance()})
        importance_frames.append(feature_importance)

        preds=model.predict(val_X,num_iteration=model.best_iteration)
        val_scores[i]=np.sqrt(model.best_score['valid_1']['l2'])
        # Index rows and column in ONE step: oof_preds[val_idx][:,g]=... assigned into a
        # temporary copy produced by the fancy index and left oof_preds all zeros.
        oof_preds[val_idx,g]=preds
        subs[:,g]+=model.predict(test[pred_cols],num_iteration=model.best_iteration)
        # The targets are already log1p-transformed, so RMSLE on the original scale is the
        # plain RMSE here; mean_squared_log_error would take the log a second time.
        rmsle=np.sqrt(np.mean(np.square(val_Y.to_numpy()-preds)))
        print(f'Fold {i+1} Loss {rmsle} Best Score {val_scores[i]}')
    # Average the accumulated fold predictions, then map back from the log1p scale.
    subs[:,g]/=n_folds
    subs[:,g]=np.expm1(subs[:,g])

    print(f'Mean RMSLE {np.mean(val_scores)} std {np.std(val_scores)}')
    print('**************')
# Single concatenation of all per-fold frames (avoids growing a DataFrame inside the loop).
feature_importance_df=pd.concat(importance_frames,axis=0,ignore_index=True)
        

In [None]:
# Display the feature-importance table accumulated by the training cell.
feature_importance_df

In [None]:
# Summary statistics of `feature_importance`, i.e. the frame left over from the last
# fold trained. NOTE(review): `feature_importance_df` may have been intended here -- confirm.
feature_importance.describe()

In [None]:
# (test rows, number of targets) of the averaged test predictions.
subs.shape

In [None]:
# Overwrite every column after the first with the predictions in `subs`.
# Assumes those columns are in the same order as `target` (the column order of `subs`) -- TODO confirm.
sample_submission.iloc[:,1:]=subs

In [None]:
# Preview the filled-in submission.
sample_submission.head()

In [None]:
# Write the submission to 'sample_submission.csv' in the current directory, without the index column.
sample_submission.to_csv('sample_submission.csv',index=False)

**Work in progress**