In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read data into dataframes

In [None]:
# date_time is used as the (parsed) index rather than as a feature: the author
# treats it as a unique row identifier with no predictive power.
csv_options = {'parse_dates': True, 'index_col': ['date_time']}
train_data = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv', **csv_options)
test_data = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv', **csv_options)

## Exploratory Data Analysis (Learn about the features)

In [None]:
# Display the training frame loaded above to eyeball its rows and columns
train_data

In [None]:
# Summary statistics for every column.
# Author's reading of this table: no missing values and no outliers.
train_data.describe()

In [None]:
# List the column names available in the training frame
train_data.columns

In [None]:
# Check that the features roughly follow a normal distribution.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is its replacement (histogram plus density curve).
# One loop over the columns replaces eight copy-pasted plotting cells.
dist_columns = ['deg_C', 'relative_humidity', 'absolute_humidity',
                'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(14, 16))
for ax, column in zip(axes.ravel(), dist_columns):
    sns.histplot(train_data[column], kde=True, stat='density', ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()

In [None]:
# See trends of features over time.
# One titled panel per feature on a shared time axis, drawn in a loop instead
# of three copy-pasted cells, so the series can be compared directly.
trend_columns = ['deg_C', 'relative_humidity', 'absolute_humidity']
fig, axes = plt.subplots(nrows=len(trend_columns), ncols=1, figsize=(14, 12), sharex=True)
for ax, column in zip(axes, trend_columns):
    sns.lineplot(x=train_data.index, y=train_data[column], ax=ax)
    ax.set_title(f'{column} over time')
fig.tight_layout()

In [None]:
# Conclusion: No need for any data cleaning based on the graphs plotted

## Split to validation and training data

In [None]:
from sklearn.model_selection import train_test_split

# Model input columns and the three pollutant target columns
features = ['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
labels = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Fixed seed so that "Restart & Run All" reproduces the same split; without it
# the validation scores used for hyperparameter tuning change on every run.
SPLIT_SEED = 42

X = train_data[features]
y = train_data[labels]

# Split data to training and validation sets.
# train_test_split shuffles the rows itself (shuffle=True), so train_data does
# not need to be shuffled and overwritten beforehand.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, random_state=SPLIT_SEED)

In [None]:
# (rows, columns) of the training split
X_train.shape

## Creating XGBoost models

In [None]:
from xgboost import XGBRegressor

# One regressor per pollutant. All three share the tree count and depth and
# differ only in learning rate (hyperparameters adjusted according to mse).
shared_params = dict(n_estimators=500, max_depth=50)
model_carbon = XGBRegressor(learning_rate=0.01, **shared_params)
model_benzene = XGBRegressor(learning_rate=0.03, **shared_params)
model_nitrogen = XGBRegressor(learning_rate=0.1, **shared_params)

# Fit each model on its own target column of the training labels
model_carbon.fit(X_train, y_train['target_carbon_monoxide'])
model_benzene.fit(X_train, y_train['target_benzene'])
model_nitrogen.fit(X_train, y_train['target_nitrogen_oxides'])

## Testing the results on validation data

In [None]:
# Predict each pollutant for the held-out validation rows
predicted_carbon, predicted_benzene, predicted_nitrogen = (
    fitted_model.predict(X_valid)
    for fitted_model in (model_carbon, model_benzene, model_nitrogen)
)

In [None]:
# Display the true validation labels for comparison with the predictions
y_valid

In [None]:
# Display the nitrogen-oxides predictions for the validation rows
predicted_nitrogen

In [None]:
def mean_squared_error(predicted_numbers, label_name, actual=None):
    """Return the mean squared error between predictions and one label column.

    Parameters
    ----------
    predicted_numbers : array-like
        Predicted values, one per label row, in the same row order.
    label_name : str
        Key/column of the label data to compare against.
    actual : mapping or DataFrame, optional
        Label data that is indexed with ``label_name``. Defaults to the
        notebook-level ``y_valid`` frame, so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    numpy.float64
        Mean of the squared differences between labels and predictions.

    Raises
    ------
    ValueError
        If there are no label rows, or if the number of predictions differs
        from the number of label rows.
    """
    if actual is None:
        # Resolved at call time so the function follows the current split.
        actual = y_valid
    observed = np.asarray(actual[label_name], dtype=float)
    predicted = np.asarray(predicted_numbers, dtype=float)
    if observed.size == 0:
        raise ValueError("cannot compute the mean squared error of zero rows")
    if predicted.shape != observed.shape:
        # Guard against silent truncation or broadcasting of mismatched inputs.
        raise ValueError(
            f"got {predicted.shape} predictions for {observed.shape} label rows"
        )
    # Vectorised replacement for the row-by-row Python loop over .iloc.
    return np.mean((observed - predicted) ** 2)

In [None]:
# Validation MSE of the carbon monoxide model
mean_squared_error(predicted_carbon, 'target_carbon_monoxide')

In [None]:
# Validation MSE of the benzene model
mean_squared_error(predicted_benzene, 'target_benzene')

In [None]:
# Validation MSE of the nitrogen oxides model
mean_squared_error(predicted_nitrogen, 'target_nitrogen_oxides')

In [None]:
# Use these validation MSE values to go back and tune the hyperparameters in the model-creation cell

## Creating the submission csv

In [None]:
# The submission needs a date_time column, but date_time is currently the
# index of test_data, so keep a reference to that index for the submission frame.
date_time_col = test_data.index
date_time_col

In [None]:
# Create the submission dataframe, starting with a single date_time column
# built from the test index; the prediction columns are added afterwards.
submission_df = pd.DataFrame(data=date_time_col, columns=['date_time'])
submission_df

In [None]:
# Select the training feature columns explicitly so each model receives exactly
# the columns (and column order) it was fitted on, instead of relying on the
# test CSV happening to contain only those columns in that order.
X_test = test_data[features]
submission_df['target_carbon_monoxide'] = model_carbon.predict(X_test)
submission_df['target_benzene'] = model_benzene.predict(X_test)
submission_df['target_nitrogen_oxides'] = model_nitrogen.predict(X_test)
submission_df

In [None]:
# Write the submission frame to CSV in the working directory;
# index=False leaves the row index out of the file.
submission_df.to_csv('pollution_submission.csv', index=False)