In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from model import XGBoostModel

from sklearn.model_selection import train_test_split

In [2]:
# Read the training table into `dataset` and the table to be predicted on
# into `test`. Both paths are relative to the notebook's working directory.
dataset = pd.read_csv('Train_Cleaned_KNN_Filtered.csv')
test = pd.read_csv('Test_Cleaned_KNN.csv')

### dataset hyperparameter search

In [3]:
# Z-score the two coordinate columns in place: subtract the column mean and
# divide by the column standard deviation.
# NOTE(review): the statistics come from the full `dataset`, i.e. before the
# train/validation split performed further down.
for coord in ('LAT', 'LON'):
    centred = dataset[coord] - dataset[coord].mean()
    dataset[coord] = centred / dataset[coord].std()

In [4]:
# Columns used as model inputs; every other column of `dataset` is listed
# for removal.
keep = {'NO2_trop', 'LAT', 'LON', 'LST', 'NO2_total', 'AAI', 'NO2_strat', 'CloudFraction'}
drop_columns = [column for column in dataset.columns if column not in keep]

In [5]:
# Feature frame: `dataset` minus everything named in `drop_columns`.
X = dataset.drop(columns=drop_columns)
# Regression target column.
y = dataset.loc[:, 'GT_NO2']

In [6]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Convert the training split to float32 NumPy arrays.
# np.asarray accepts pandas objects and ndarrays alike, so re-running this
# cell after X_train / y_train have already been converted is harmless.
# Calling `.to_numpy()` on an ndarray would raise AttributeError instead.
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)

print(X_train.shape)
print(y_train.shape)

(65157, 8)
(65157,)


In [8]:
# Convert the held-out split to float32 NumPy arrays.
# np.asarray accepts pandas objects and ndarrays alike, so re-running this
# cell after X_test / y_test have already been converted is harmless.
# Calling `.to_numpy()` on an ndarray would raise AttributeError instead.
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)

print(X_test.shape)
print(y_test.shape)

(16290, 8)
(16290,)


In [9]:
# Instantiate the project's XGBoost wrapper, giving it the path passed as
# `model_save_path`.
# NOTE(review): this rebinds `xgb`, which until now referred to the xgboost
# module imported at the top of the notebook; the module is no longer
# reachable under that name after this cell runs.
xgb = XGBoostModel(model_save_path='exp_xgboost_model.json')

In [10]:
# Hyperparameter grid handed to the grid search: each key maps to the list
# of candidate values to try for that setting.
params = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1, 0.2],
)

In [11]:
# Run the wrapper's grid search over `params`; the returned object is used
# below as the fitted model (`model.predict`).
# NOTE(review): the held-out split (X_test, y_test) is passed into the search.
# If gridsearch_exp selects the best parameters by their score on that split,
# the printed RMSE is an optimistic estimate — confirm in model.py.
model = xgb.gridsearch_exp(X_train, y_train, X_test, y_test, param_grid=params)

Best model parameters:
n_estimators: 150
max_depth: 7
learning_rate: 0.2
subsample: 1.0
colsample_bytree: 1.0
gamma: 0.1
RMSE: 8.542101860046387


### test

In [12]:
# Preview the first rows of `test` as loaded, before the cells below modify it.
test.head()

Unnamed: 0,ID_Zindi,Date,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,ID_U4KWPK,2019-01-01,45.582894,8.842165,0.0,282.98,-0.470822,0.153694,2.3e-05,0.000171,0.000148,14427.42478
1,ID_NKPFFW,2019-01-01,45.151743,10.781408,4.023135,280.917143,0.044013,0.891932,2.4e-05,0.000292,0.000201,14441.18578
2,ID_GHSZ6K,2019-01-01,45.186329,9.146666,0.0,281.394286,-0.198272,0.678858,2.3e-05,0.000149,0.000171,14440.8584
3,ID_P4U5WU,2019-01-01,45.836941,12.510362,0.0,283.374286,-0.229512,0.398208,2.3e-05,0.00012,0.000119,14434.0479
4,ID_QGSNTZ,2019-01-01,45.131947,10.015742,1.928031,281.562857,0.132952,0.756917,2.4e-05,0.000266,0.000251,14443.09006


In [13]:
# Standardise the test coordinates with the TRAINING set's mean and std.
# The model was fitted on LAT/LON z-scored with training statistics, so the
# test rows must go through the same transform. Scaling them with the test
# set's own mean/std would map the same coordinate to a different feature value.
#
# `dataset['LAT']` / `dataset['LON']` were overwritten by their z-scores
# earlier, so the original statistics are recomputed from the raw training
# CSV. The raw test coordinates are re-read too, which makes this cell
# idempotent: re-running it never standardises already-standardised values.
coord_cols = ['LAT', 'LON']
train_coords = pd.read_csv('Train_Cleaned_KNN_Filtered.csv', usecols=coord_cols)
test_coords = pd.read_csv('Test_Cleaned_KNN.csv', usecols=coord_cols)

for col in coord_cols:
    scaled = (test_coords[col] - train_coords[col].mean()) / train_coords[col].std()
    # Assign positionally so the result does not depend on index alignment.
    test[col] = scaled.to_numpy()

In [14]:
# Columns used as model inputs; every other column of `test` is listed for
# removal.
keep = {'NO2_trop', 'LAT', 'LON', 'LST', 'NO2_total', 'AAI', 'NO2_strat', 'CloudFraction'}
drop_columns = [column for column in test.columns if column not in keep]

In [15]:
# Row identifiers for the output file, as strings, in row order. Iterating
# the column directly replaces the much slower row-by-row `iterrows()` loop.
id_string = [str(value) for value in test['ID_Zindi']]

# Select the feature columns by the training frame's own column list, so
# x_test has the same columns in the same order the model was fitted on,
# whatever the column order of the test CSV.
x_test = test[list(X.columns)].to_numpy().astype(np.float32)

print(x_test.shape)


(6576, 8)


In [16]:
# Predict with the model returned by the grid search on the test feature array.
predictions = model.predict(x_test)

In [17]:
# Sanity check: the length should equal the number of rows in x_test.
predictions.shape

(6576,)

In [18]:
# Two-column output table: row identifier and predicted value.
# `.tolist()` turns the prediction array into plain Python numbers first.
submission_columns = dict(
    ID=id_string,
    Predicted_NO2=predictions.tolist(),
)
df = pd.DataFrame(submission_columns)

In [19]:
# Write the predictions to disk; index=False leaves the row index out of the file.
df.to_csv('test_predictions_xgb.csv', index=False)

In [61]:
### all columns: 9.502021054
### 'NO2_trop', 'LAT', 'LST', 'LON', 'NO2_total': 10.68113867
### 'NO2_trop', 'normLAT', 'normLON': 10.4919
### 'NO2_trop', 'normLAT', 'normLON', 'LST', 'NO2_total', 'AAI', 'NO2_strat', 'CloudFraction': 9.39

# n_estimators: 150
# max_depth: 7
# learning_rate: 0.2
# subsample: 1.0
# colsample_bytree: 1.0
# gamma: 0.1
# RMSE: 8.542101860046387