In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import xgboost as xgb

from copy import deepcopy

from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

import os
import math

ModuleNotFoundError: No module named 'bayes_opt'

In [2]:
TRAIN_PATH = 'C:/Users/User/Documents/New folder/Desktop/train.csv'
TEST_PATH = 'C:/Users/User/Documents/New folder/Desktop/train.csv'
SUB_PATH = 'SampleSubmission.csv'


In [5]:
train_data = pd.read_csv(TRAIN_PATH).set_index("ID")
test_data = pd.read_csv(TEST_PATH).set_index("ID")
# sub = pd.read_csv(SUB_PATH).set_index("ID")

In [31]:
train_data.head()

Unnamed: 0_level_0,Datetime,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity,Offset_fault
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_QF0ZTQJ2SF5Q,2021-11-03 04:06:31,52.58,49.52,17.4,96.0,0
ID_4GTK689CNX5S,2021-11-08 18:43:23,35.25,33.4,25.0,75.0,0
ID_DL7VVKW9U7XQ,2021-11-07 09:50:33,19.18,23.5,24.9,75.0,0
ID_6XQOMBXM2DG3,2022-01-01 18:55:15,19.4,15.48,24.9,70.0,0
ID_UQZW9ISJY9QE,2021-11-05 22:23:48,38.3,34.77,20.9,89.0,0


In [4]:
train_data.isnull().sum()

Datetime                0
Sensor1_PM2.5        3614
Sensor2_PM2.5        3614
Temperature          3549
Relative_Humidity    3549
Offset_fault            0
dtype: int64

In [9]:
train_data.shape

(297177, 6)

In [10]:
# train_data.dropna(axis=0, inplace=True)
# train_data.isnull().sum()


# test_data.dropna(axis=0, inplace=True)


In [11]:
train_data.shape


(297177, 6)

In [6]:
train_data['Datetime'] = pd.to_datetime(train_data['Datetime'], errors='coerce')
train_data.head()

test_data['Datetime'] = pd.to_datetime(test_data['Datetime'], errors='coerce')


In [7]:
train_data['hour'] = train_data['Datetime'].apply(lambda x: x.hour)
train_data.head()

test_data['hour'] = test_data['Datetime'].apply(lambda x: x.hour)


In [8]:
max_value = 23
train_data["hour_sin"] = train_data["hour"].apply(lambda x: np.sin((2 * np.pi * x) / max_value))
train_data["hour_cos"] = train_data["hour"].apply(lambda x: np.cos((2 * np.pi * x) / max_value))

test_data["hour_sin"] = test_data["hour"].apply(lambda x: np.sin((2 * np.pi * x) / max_value))
test_data["hour_cos"] = test_data["hour"].apply(lambda x: np.cos((2 * np.pi * x) / max_value))

train_data.head()


Unnamed: 0_level_0,Datetime,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity,Offset_fault,hour,hour_sin,hour_cos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_QF0ZTQJ2SF5Q,2021-11-03 04:06:31,52.58,49.52,17.4,96.0,0,4,0.887885,0.460065
ID_4GTK689CNX5S,2021-11-08 18:43:23,35.25,33.4,25.0,75.0,0,18,-0.979084,0.203456
ID_DL7VVKW9U7XQ,2021-11-07 09:50:33,19.18,23.5,24.9,75.0,0,9,0.631088,-0.775711
ID_6XQOMBXM2DG3,2022-01-01 18:55:15,19.4,15.48,24.9,70.0,0,18,-0.979084,0.203456
ID_UQZW9ISJY9QE,2021-11-05 22:23:48,38.3,34.77,20.9,89.0,0,22,-0.269797,0.962917


In [9]:
train_data.drop(["hour", "Datetime"], axis=1, inplace=True)
train_data.head()
test_data.drop(["hour", "Datetime"], axis=1, inplace=True)


In [13]:
numeric_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("scale", MinMaxScaler()),
    ]
)


In [27]:
X = train_data.drop('Offset_fault', axis=1)
y = train_data['Offset_fault']

num_cols = X.select_dtypes(include="number").columns

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
    ]
)

X_processed = full_processor.fit_transform(X)
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)
)

test_processed = full_processor.transform(test_data)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_processed, y_processed, stratify=y_processed, train_size=0.8, test_size=0.2
)


ValueError: X has 6 features, but ColumnTransformer is expecting 5 features as input.

In [36]:
print(test_data.shape)
test_data
X

(297177, 6)


Unnamed: 0_level_0,Temperature,Relative_Humidity,hour_sin,hour_cos,proportion
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ID_QF0ZTQJ2SF5Q,17.4,96.0,0.887885,0.460065,0.059941
ID_4GTK689CNX5S,25.0,75.0,-0.979084,0.203456,0.053897
ID_DL7VVKW9U7XQ,24.9,75.0,0.631088,-0.775711,0.202437
ID_6XQOMBXM2DG3,24.9,70.0,-0.979084,0.203456,0.224771
ID_UQZW9ISJY9QE,20.9,89.0,-0.269797,0.962917,0.096620
...,...,...,...,...,...
ID_AQFTZIKVX2QL,28.6,62.0,-0.136167,-0.990686,0.036942
ID_2X0AJPWOMJNA,21.6,86.0,0.269797,0.962917,0.209146
ID_0OAQMV7USTI4,20.2,94.0,0.269797,0.962917,0.170862
ID_Y8FDAO06VXGJ,20.6,93.0,-0.269797,0.962917,0.172603


In [62]:
pbounds = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (1, 12),
    'subsample': (0.0, 1.0),  # Change for big datasets
    'colsample': (0.0, 1.0),  # Change for datasets with lots of features
    'gamma': (0, 10),
    "min_child_weight": (3, 10),
    "scale_pos_weight": (3, 10),
    "colsample_bytree": (0.0, 1)
}


def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample,
                        gamma,
                        min_child_weight,
                        scale_pos_weight,
                        colsample_bytree):

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = xgb.XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        gamma=gamma,
        min_child_weight=min_child_weight,
        scale_pos_weight=scale_pos_weight,
        colsample_bytree=colsample_bytree,
        use_label_encoder=False)
    return np.mean(cross_val_score(clf, X_train, y_train.ravel(), cv=5, scoring='neg_mean_squared_error'))


optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=pbounds,
    random_state=100
)

logger = JSONLogger(path="logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

NameError: name 'BayesianOptimization' is not defined

In [105]:
if os.path.isfile('logs.json'):
    load_logs(optimizer, logs=["./logs.json"])

optimizer.maximize(
    init_points=5,
    n_iter=5,
)










In [112]:
param_grid = optimizer.max['params']
param_grid['max_depth'] = int(np.round(param_grid['max_depth']))
param_grid['n_estimators'] = int(np.round(param_grid['n_estimators']))

param_grid


{'colsample': 0.25242635344484043,
 'colsample_bytree': 0.7956625084732873,
 'gamma': 0.1525497124633901,
 'learning_rate': 0.602854943159208,
 'max_depth': 8,
 'min_child_weight': 3.736033797884394,
 'n_estimators': 444,
 'scale_pos_weight': 3.2553323961479825,
 'subsample': 0.8904115634420757}

In [107]:
final_cl = xgb.XGBClassifier(**param_grid, random_state=100)

final_cl.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "colsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.25242635344484043,
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.7956625084732873, enable_categorical=False,
              gamma=0.1525497124633901, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.602854943159208,
              max_delta_step=0, max_depth=8, min_child_weight=3.736033797884394,
              missing=nan, monotone_constraints='()', n_estimators=444,
              n_jobs=4, num_parallel_tree=1, predictor='auto', random_state=100,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3.2553323961479825,
              subsample=0.8904115634420757, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [108]:
from sklearn.metrics import mean_squared_error


preds = final_cl.predict(X_valid)

metrics.mean_squared_error(y_valid, preds)


0.04090113735783027

In [110]:
final_cl.fit(X_processed, y_processed)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "colsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.25242635344484043,
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.7956625084732873, enable_categorical=False,
              gamma=0.1525497124633901, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.602854943159208,
              max_delta_step=0, max_depth=8, min_child_weight=3.736033797884394,
              missing=nan, monotone_constraints='()', n_estimators=444,
              n_jobs=4, num_parallel_tree=1, predictor='auto', random_state=100,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3.2553323961479825,
              subsample=0.8904115634420757, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [111]:
y_hat = final_cl.predict(test_processed)
sub["Offset_fault"] = y_hat
sub.to_csv("sub.csv")

In [43]:
train_data

Unnamed: 0_level_0,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity,Offset_fault,hour_sin,hour_cos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ID_QF0ZTQJ2SF5Q,52.58,49.52,17.4,96.0,0,0.887885,0.460065
ID_4GTK689CNX5S,35.25,33.40,25.0,75.0,0,-0.979084,0.203456
ID_DL7VVKW9U7XQ,19.18,23.50,24.9,75.0,0,0.631088,-0.775711
ID_6XQOMBXM2DG3,19.40,15.48,24.9,70.0,0,-0.979084,0.203456
ID_UQZW9ISJY9QE,38.30,34.77,20.9,89.0,0,-0.269797,0.962917
...,...,...,...,...,...,...,...
ID_AQFTZIKVX2QL,36.40,37.77,28.6,62.0,0,-0.136167,-0.990686
ID_2X0AJPWOMJNA,50.12,40.63,21.6,86.0,1,0.269797,0.962917
ID_0OAQMV7USTI4,53.68,45.23,20.2,94.0,1,0.269797,0.962917
ID_Y8FDAO06VXGJ,96.45,114.67,20.6,93.0,0,-0.269797,0.962917


In [16]:
train_data['Sensors_difference'] = train_data['Sensor1_PM2.5']-train_data['Sensor2_PM2.5']
test_data['Sensors_difference'] = test_data['Sensor1_PM2.5']-test_data['Sensor2_PM2.5']


In [17]:
train_data['proportion']=abs(train_data['Sensors_difference']/((train_data['Sensor1_PM2.5']+train_data['Sensor2_PM2.5'])/2))
test_data['proportion']=abs(test_data['Sensors_difference']/((test_data['Sensor1_PM2.5']+test_data['Sensor2_PM2.5'])/2))

In [18]:
train_data

Unnamed: 0_level_0,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity,Offset_fault,hour_sin,hour_cos,Sensors_difference,proportion
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_QF0ZTQJ2SF5Q,52.58,49.52,17.4,96.0,0,0.887885,0.460065,3.06,0.059941
ID_4GTK689CNX5S,35.25,33.40,25.0,75.0,0,-0.979084,0.203456,1.85,0.053897
ID_DL7VVKW9U7XQ,19.18,23.50,24.9,75.0,0,0.631088,-0.775711,-4.32,0.202437
ID_6XQOMBXM2DG3,19.40,15.48,24.9,70.0,0,-0.979084,0.203456,3.92,0.224771
ID_UQZW9ISJY9QE,38.30,34.77,20.9,89.0,0,-0.269797,0.962917,3.53,0.096620
...,...,...,...,...,...,...,...,...,...
ID_AQFTZIKVX2QL,36.40,37.77,28.6,62.0,0,-0.136167,-0.990686,-1.37,0.036942
ID_2X0AJPWOMJNA,50.12,40.63,21.6,86.0,1,0.269797,0.962917,9.49,0.209146
ID_0OAQMV7USTI4,53.68,45.23,20.2,94.0,1,0.269797,0.962917,8.45,0.170862
ID_Y8FDAO06VXGJ,96.45,114.67,20.6,93.0,0,-0.269797,0.962917,-18.22,0.172603


In [19]:
train_data.drop('Sensors_difference',axis=1,inplace=True)
test_data.drop('Sensors_difference',axis=1,inplace=True)



In [20]:
train_data.drop('Sensor1_PM2.5',axis=1,inplace=True)
test_data.drop('Sensor1_PM2.5',axis=1,inplace=True)

In [24]:
train_data.drop('Sensor2_PM2.5',axis=1,inplace=True)
test_data.drop('Sensor2_PM2.5',axis=1,inplace=True)

KeyError: "['Sensor2_PM2.5'] not found in axis"

In [25]:
test_data

Unnamed: 0_level_0,Temperature,Relative_Humidity,Offset_fault,hour_sin,hour_cos,proportion
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_QF0ZTQJ2SF5Q,17.4,96.0,0,0.887885,0.460065,0.059941
ID_4GTK689CNX5S,25.0,75.0,0,-0.979084,0.203456,0.053897
ID_DL7VVKW9U7XQ,24.9,75.0,0,0.631088,-0.775711,0.202437
ID_6XQOMBXM2DG3,24.9,70.0,0,-0.979084,0.203456,0.224771
ID_UQZW9ISJY9QE,20.9,89.0,0,-0.269797,0.962917,0.096620
...,...,...,...,...,...,...
ID_AQFTZIKVX2QL,28.6,62.0,0,-0.136167,-0.990686,0.036942
ID_2X0AJPWOMJNA,21.6,86.0,1,0.269797,0.962917,0.209146
ID_0OAQMV7USTI4,20.2,94.0,1,0.269797,0.962917,0.170862
ID_Y8FDAO06VXGJ,20.6,93.0,0,-0.269797,0.962917,0.172603


In [28]:
test_data.drop('Offset_fault',inplace=True)

KeyError: "['Offset_fault'] not found in axis"

In [69]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297177 entries, ID_QF0ZTQJ2SF5Q to ID_3XEXH72VWK4J
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Temperature        293628 non-null  float64
 1   Relative_Humidity  293628 non-null  float64
 2   Offset_fault       297177 non-null  int64  
 3   hour_sin           297177 non-null  float64
 4   hour_cos           297177 non-null  float64
 5   proportion         293563 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 25.9+ MB
