In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### A basic random forest baseline for the air-pollution data: day-of-month, day-of-week and hour features extracted from the timestamp are used, together with the other columns, to predict the targets

### Load Dataset

In [None]:
import pandas as pd
# Directory holding the competition files; the trailing slash lets file names be appended directly.
path = '/kaggle/input/tabular-playground-series-jul-2021/'
train_csv = path + 'train.csv'
# Parse 'date_time' as datetimes at load time so the calendar parts can be extracted later.
df = pd.read_csv(train_csv, parse_dates=['date_time'])

In [None]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

### Check data consistency

In [None]:
# Count missing values per column.
df.isna().sum(axis=0)

### Data visualization

In [None]:
# Pairwise scatter plots of the columns; the trailing ';' hides the returned axes array.
pd.plotting.scatter_matrix(
    df,
    alpha=0.2,
    figsize=(20, 20),
);

### Date_time data extraction

In [None]:
import re
import numpy as np

#### Create new features from the timestamp: day of month, day of week and (optionally) hour
def add_datepart(df, field_name, drop=True, time=False):
    """Expand a datetime column into calendar-part feature columns.

    Adds a ``Day`` column (day of month) and a ``Dayofweek`` column
    (pandas convention: Monday=0 ... Sunday=6), plus an ``Hour`` column when
    ``time`` is true. The values come from the matching ``Series.dt``
    accessors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified **in place**; pass ``df.copy()`` if
        the original must stay untouched.
    field_name : str
        Name of the column holding the timestamps. It is converted with
        ``pd.to_datetime`` first, so string columns are accepted.
    drop : bool, default True
        Remove ``field_name`` from ``df`` once the parts are extracted.
    time : bool, default False
        Also extract the hour of day.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    # `infer_datetime_format` is deprecated (pandas >= 2.0 infers the format
    # by default and warns when the flag is passed), so it is not used here.
    df[field_name] = pd.to_datetime(df[field_name])
    field = df[field_name]

    attr = ['Day', 'Dayofweek']
    if time:
        attr.append('Hour')

    # Each feature name maps to the lower-cased `.dt` accessor of the same name.
    for n in attr:
        df[n] = getattr(field.dt, n.lower())

    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df

In [None]:
# Work on a copy: add_datepart modifies its argument in place and drops
# 'date_time', so passing `df` itself makes this cell fail with a KeyError when
# it is re-run and silently changes the raw frame displayed earlier.
df_processed = add_datepart(df.copy(), 'date_time', time=True)
## Convert targets to log1p as per competition requirement
for target_col in ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']:
    df_processed[target_col] = np.log1p(df_processed[target_col])

### Train test split

In [None]:
#### Positional split: the first 7000 rows are used for training, the remaining rows are held out
N_TRAIN_ROWS = 7000
df_processed_train = df_processed.iloc[:N_TRAIN_ROWS]
df_processed_test = df_processed.iloc[N_TRAIN_ROWS:]

In [None]:
# The three columns to predict; every other column is used as a feature.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Training features / targets.
X_train = df_processed_train.drop(columns=target_cols)
Y_train = df_processed_train[target_cols]

# Held-out features / targets used for validation.
X_valid = df_processed_test.drop(columns=target_cols)
Y_valid = df_processed_test[target_cols]

### Model Building

In [None]:
import math

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
def rmse(x, y):
    """Return the root-mean-square difference between ``x`` and ``y`` as a float.

    ``x`` and ``y`` must support element-wise subtraction and squaring, and the
    result must expose ``.mean()`` (e.g. numpy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    mean_squared_error = squared_error.mean()
    return math.sqrt(mean_squared_error)

def _print_score(m, target):
    """Print the train/validation scores of model ``m`` for one target column.

    The printed list is, in order: train RMSE, validation RMSE,
    ``m.score`` on the training split, ``m.score`` on the validation split
    (R^2 for scikit-learn regressors) and, when the model exposes it,
    ``m.oob_score_``.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_valid`` and
    ``Y_valid`` frames; ``target`` is the column of ``Y_train`` / ``Y_valid``
    the model was fitted on.
    """
    y_train = Y_train[target]
    y_valid = Y_valid[target]
    res = [
        rmse(m.predict(X_train), y_train),
        rmse(m.predict(X_valid), y_valid),
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    # Only models fitted with oob_score=True carry an out-of-bag score.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)


def print_score_co(m):
    """Print the scores of ``m`` against the carbon monoxide target."""
    _print_score(m, 'target_carbon_monoxide')


def print_score_b(m):
    """Print the scores of ``m`` against the benzene target."""
    _print_score(m, 'target_benzene')


def print_score_no(m):
    """Print the scores of ``m`` against the nitrogen oxides target."""
    _print_score(m, 'target_nitrogen_oxides')
  
# Carbon monoxide model: 200 trees, each split considers half of the features and
# every leaf keeps at least 3 samples. n_jobs=-1 uses all available cores and
# oob_score=True makes the out-of-bag score available as m_co.oob_score_.
# random_state is fixed so that re-running the notebook reproduces the same scores.
m_co = RandomForestRegressor(n_jobs=-1, oob_score=True, max_features=0.5, min_samples_leaf=3, n_estimators=200, random_state=42)
m_co.fit(X_train, Y_train.target_carbon_monoxide)
print_score_co(m_co)

In [None]:
# Benzene model: 200 trees, each split considers half of the features and
# every leaf keeps at least 3 samples. n_jobs=-1 uses all available cores and
# oob_score=True makes the out-of-bag score available as m_b.oob_score_.
# random_state is fixed so that re-running the notebook reproduces the same scores.
m_b = RandomForestRegressor(n_jobs=-1, oob_score=True, max_features=0.5, min_samples_leaf=3, n_estimators=200, random_state=42)
m_b.fit(X_train, Y_train.target_benzene)
print_score_b(m_b)

In [None]:
# Nitrogen oxides model: 200 trees, each split considers half of the features and
# every leaf keeps at least 3 samples. n_jobs=-1 uses all available cores and
# oob_score=True makes the out-of-bag score available as m_no.oob_score_.
# random_state is fixed so that re-running the notebook reproduces the same scores.
m_no = RandomForestRegressor(n_jobs=-1, oob_score=True, max_features=0.5, min_samples_leaf=3, n_estimators=200, random_state=42)
m_no.fit(X_train, Y_train.target_nitrogen_oxides)
print_score_no(m_no)

### RMSLE calculation

In [None]:
# Mean column-wise RMSLE on the held-out rows, computed from the fitted models
# instead of numbers copied by hand from an earlier run. The targets were
# log1p-transformed before the split, so each model's validation RMSE is already
# that column's RMSLE; the competition score is the plain mean of the three
# (not the root of their summed squares).
valid_rmsle = [
    rmse(m_co.predict(X_valid), Y_valid.target_carbon_monoxide),
    rmse(m_b.predict(X_valid), Y_valid.target_benzene),
    rmse(m_no.predict(X_valid), Y_valid.target_nitrogen_oxides),
]
sum(valid_rmsle) / len(valid_rmsle)