## Tabular Playground July 2021

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Display the result of every expression statement in a cell, not only the
# last one. Later cells rely on this to show several scores from one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the competition training set and preview the first rows.
# NOTE(review): parse_dates=True is passed without an index_col or a list of
# columns — presumably no column is actually parsed as a datetime here;
# confirm with df.info() and pass the column name explicitly if needed.
df=pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/train.csv', parse_dates=True)
df.head()

#### Data Summary Statistics

In [None]:
# Print a concise structural summary of the training frame.
df.info()

In [None]:
# Show descriptive statistics for the training frame.
df.describe()

Sampling into train and validation sets

In [None]:
# Shuffle a copy of the full training frame (seeded, so the split is reproducible).
df2 = df.copy().sample(n=len(df), random_state=42)

# Hold out 30% of the shuffled rows for validation; everything else is training data.
df_valid = df2.sample(frac=0.3, random_state=42)
df_train = df2.drop(index=df_valid.index)

# Feature columns fed to every model below.
col_2_use = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]


Splitting target variables and features

In [None]:
# The three pollutant columns that are predicted jointly.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Feature matrices for the training and validation splits.
X_train = df_train.loc[:, col_2_use]
X_valid = df_valid.loc[:, col_2_use]

# Matching multi-column target frames.
y_train_all = df_train.loc[:, target_cols]
y_valid_all = df_valid.loc[:, target_cols]

# Sanity check: row counts must agree between features and targets.
print('Training shapes:', X_train.shape, y_train_all.shape)
print('Validation shapes:', X_valid.shape, y_valid_all.shape)

### Modeling 

#### 1. RandomForestRegressor

In [None]:
# Fit one shallow random forest directly on all three target columns
# (no per-target wrapper is used here), then predict the validation split.
rf_all=RandomForestRegressor(max_depth = 5, n_estimators=100, random_state = 42)
rf_all.fit(X_train, y_train_all)
predict_rf=rf_all.predict(X_valid)
# Validation score first, then training score; both are displayed because
# ast_node_interactivity is set to "all". score() is R^2 for sklearn regressors.
rf_all.score(X_valid,y_valid_all)
rf_all.score(X_train, y_train_all)

In [None]:
# Validation RMSE of the random forest: square root of the MSE computed
# over all target columns at once.
mse_rf = mean_squared_error(y_valid_all, predict_rf)
rmse_rf = np.sqrt(mse_rf)
print(rmse_rf)

#### 2. XGBRegressor

In [None]:
# Base gradient-boosted regressor, using the same depth / tree count / seed
# as the random forest above.
xg_all=XGBRegressor(max_depth = 5, n_estimators=100, random_state = 42)
# Wrap it in MultiOutputRegressor so the three target columns are handled
# through a single fit/predict interface.
wrapper = MultiOutputRegressor(xg_all)

wrapper.fit(X_train, y_train_all)
predict_xg=wrapper.predict(X_valid)

In [None]:
# Validation score first, then training score, for the XGBoost wrapper
# (both displayed thanks to ast_node_interactivity = "all").
wrapper.score(X_valid,y_valid_all)
wrapper.score(X_train, y_train_all)

In [None]:
# Validation RMSE of the XGBoost wrapper: square root of the MSE computed
# over all target columns at once.
mse_xg = mean_squared_error(y_valid_all, predict_xg)
rmse_xg = np.sqrt(mse_xg)
print(rmse_xg)

#### Normalising for KNN

In [None]:
# Fit the scaler on the training features only, so no information from the
# validation split leaks into the scaling parameters.
norm = MinMaxScaler().fit(X_train)

# Transform the training data with the fitted scaler.
X_train_norm = norm.transform(X_train)

# Transform the validation data with the same (training-fitted) scaler.
X_valid_norm = norm.transform(X_valid)

In [None]:
# Paired feature sets for the KNN comparison below:
# index 0 = original (unscaled) features, index 1 = min-max-normalised features.
trainX = [X_train, X_train_norm]
testX = [X_valid, X_valid_norm]

#### 3. KNN

In [None]:
# KNN: fit the same 7-neighbour regressor once per feature variant
# (original, then normalised), printing the validation score of each and
# collecting the validation RMSE for the comparison table.
rmse = []
for X_fit, X_eval in zip(trainX, testX):
    knn = KNeighborsRegressor(n_neighbors=7)
    knn.fit(X_fit, y_train_all)
    pred = knn.predict(X_eval)
    print(knn.score(X_eval, y_valid_all))
    rmse.append(np.sqrt(mean_squared_error(y_valid_all, pred)))

print(rmse)


In [None]:
# Tabulate the KNN validation RMSE for the unscaled and scaled feature sets.
df_knn = pd.DataFrame(rmse, index=['Original', 'Normalized'], columns=['RMSE'])
df_knn

In [None]:
# Final KNN model, refit explicitly on the normalised features (index 1 of
# trainX / testX) so the model used for the submission is unambiguous.
knn = KNeighborsRegressor(n_neighbors=7)
# Fit on the normalised training features.
knn.fit(trainX[1],y_train_all)
# Predict the normalised validation features and report the validation score.
pred = knn.predict(testX[1])
print(knn.score(testX[1], y_valid_all))

In [None]:
# Load the competition test set and preview the first rows.
# NOTE(review): parse_dates=True is passed without an index_col or a list of
# columns — presumably 'date_time' is left unparsed; confirm if datetime
# handling is ever needed.
df_test=pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv', parse_dates=True)
df_test.head()

In [None]:
# Load the sample submission to inspect the expected output layout.
# NOTE(review): this displays the whole frame rather than a .head() preview.
df_submission=pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv', parse_dates=True)
df_submission

#### Prediction and submission

In [None]:
# Select the model features from the test set and scale them with the
# scaler that was fitted on the training split.
X_test = df_test[col_2_use]
X_test_norm = norm.transform(X_test)

# Predict all three targets with the KNN model and assemble the submission
# frame, indexed by the test rows' date_time values.
pred_test = knn.predict(X_test_norm)
submission = pd.DataFrame(
    pred_test,
    index=df_test['date_time'],
    columns=["target_carbon_monoxide", "target_benzene", "target_nitrogen_oxides"],
)

In [None]:
# Write the predictions (with the date_time index as the first column) to the
# Kaggle working directory.
submission.to_csv('/kaggle/working/submission.csv')

#### Please provide valuable feedback, comments and guidance that can help me improve my approach and skills.