In [None]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook are addressed under that mount point (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split

# Load data and select features

In [None]:
# Read the training and test tables from the mounted Drive folder.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train_v1.csv',
        '/content/drive/MyDrive/test_v1.csv',
    )
]

In [None]:
# Model inputs are every column of `train` except the label column and 'Index'.
# NOTE(review): 'Index' is presumably a row identifier — confirm.
target = 'Anomaly'
features = train.columns.tolist()
features.remove(target)   # list.remove raises ValueError if the column is absent
features.remove('Index')
features

['Station',
 'Lag1',
 'Lag2',
 'Lag5',
 'Lag10',
 'Lag30',
 'Lag50',
 'diff1',
 'diff2',
 'diff5',
 'diff30',
 'diff50',
 'RollingMean_2',
 'RollingSD_2',
 'RollingMean_3',
 'RollingSD_3',
 'RollingMean_4',
 'RollingSD_4',
 'RollingMean_5',
 'RollingSD_5',
 'RollingMean_25',
 'RollingSD_25',
 'RollingMean_35',
 'RollingSD_35',
 'RollingMean_50',
 'RollingSD_50',
 'RollingMean_100',
 'RollingSD_100',
 'RollingMean_200',
 'RollingSD_200',
 'RollingMean_400',
 'RollingSD_400',
 'RollingMean_800',
 'RollingSD_800',
 'meanaddsd',
 'meandiffsd']

In [None]:
# Booster settings for the native xgb.train API: binary logistic objective,
# AUC as the evaluation metric, depth-6 trees, learning rate 0.1, fixed seed.
# NOTE(review): 'gpu_hist' requires a GPU runtime and is deprecated in newer
# XGBoost releases (tree_method='hist' plus device='cuda') — confirm the version.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
    learning_rate=0.1,
    seed=42,
    tree_method='gpu_hist',
)

# Wrap the selected feature columns and the label column in DMatrix objects.
dtrain = xgb.DMatrix(data=train[features], label=train[target])
dtest = xgb.DMatrix(data=test[features], label=test[target])




In [None]:
# Baseline booster: 100 boosting rounds on the training DMatrix.
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [None]:
# Booster scores for the test DMatrix (with the binary:logistic objective
# these are presumably positive-class probabilities), rounded to the nearest
# integer to obtain hard 0/1 labels.
y_pred = bst.predict(dtest)
y_pred_labels = np.rint(y_pred)


In [None]:
# Score the baseline booster's hard labels against the test labels.
# zero_division=0 is passed to all three metrics (originally only precision
# had it) so a degenerate prediction yields 0 without an
# UndefinedMetricWarning; the returned values are unchanged.
precision = precision_score(test[target], y_pred_labels, zero_division=0)
recall = recall_score(test[target], y_pred_labels, zero_division=0)
f1 = f1_score(test[target], y_pred_labels, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 0.5986196424874043
Recall: 0.6816245697310721
F1 Score: 0.6374312843578211


In [None]:

# Base scikit-learn-style classifier handed to the grid search; every other
# hyper-parameter is left at its default and varied by the search grid.
xgb_clf = xgb.XGBClassifier(
    tree_method='gpu_hist',
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)

In [None]:
# First search grid: 3 * 3 * 2 * 2 * 2 * 2 = 144 hyper-parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[9, 12, 15],
    learning_rate=[0.02, 0.04],
    subsample=[0.6, 1.0],
    colsample_bytree=[0.6, 1.0],
    gamma=[0.1, 0.2],
)


In [None]:
# Exhaustive search over `param_grid`, scored by F1 and validated with a
# 5-split TimeSeriesSplit instead of shuffled K-fold.
# NOTE(review): n_jobs=-1 lets candidate fits run in parallel while the
# estimator uses tree_method='gpu_hist', so the fits presumably share one
# GPU — confirm memory headroom, or set n_jobs=1.
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='f1',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the training features and labels.
grid_search.fit(X=train[features], y=train[target])

Fitting 5 folds for each of 144 candidates, totalling 720 fits




In [None]:
# Keep the winning estimator and its hyper-parameters from the first search.
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_


In [None]:
# Display the estimator taken from grid_search.best_estimator_.
best_model

In [None]:
# Display the hyper-parameter combination taken from grid_search.best_params_.
best_params

{'colsample_bytree': 1.0,
 'gamma': 0.1,
 'learning_rate': 0.02,
 'max_depth': 12,
 'n_estimators': 50,
 'subsample': 0.6}

In [None]:
# Predict labels for the test frame with the best estimator of the first search.
test_predictions = best_model.predict(test[features])

# zero_division=0 is passed to all three metrics (originally only precision
# had it) so a degenerate prediction yields 0 without an
# UndefinedMetricWarning; the returned values are unchanged.
precision = precision_score(test[target], test_predictions, zero_division=0)
recall = recall_score(test[target], test_predictions, zero_division=0)
f1 = f1_score(test[target], test_predictions, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 0.6219464711956798
Recall: 0.5774484070226176
F1 Score: 0.598871990480382


In [None]:

# Second search: max_depth=12 and gamma=1 are fixed on the estimator, and the
# grid varies n_estimators, learning_rate and subsample (3 * 3 * 2 = 18
# candidates) under a 3-split TimeSeriesSplit, scored by F1.
# Note: this rebinds xgb_clf, param_grid, tscv and grid_search.
xgb_clf = xgb.XGBClassifier(
    gamma=1,
    max_depth=12,
    tree_method='gpu_hist',
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.0005, 0.0008, 0.001],
    subsample=[0.05, 0.1],
)
tscv = TimeSeriesSplit(n_splits=3)
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    cv=tscv,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X=train[features], y=train[target])


Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Keep the winner of the second search and predict labels for the test frame.
best_2model = grid_search.best_estimator_
best_2params = grid_search.best_params_
test_2predictions = best_2model.predict(test[features])

# zero_division=0 is passed to all three metrics (originally only precision
# had it) so a degenerate prediction yields 0 without an
# UndefinedMetricWarning; the returned values are unchanged.
precision = precision_score(test[target], test_2predictions, zero_division=0)
recall = recall_score(test[target], test_2predictions, zero_division=0)
f1 = f1_score(test[target], test_2predictions, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 0.7152134032501583
Recall: 0.6924382691793848
F1 Score: 0.7036415908001915


In [None]:
# Display the hyper-parameter combination selected by the second search.
best_2params

{'learning_rate': 0.001, 'n_estimators': 50, 'subsample': 0.1}

In [None]:
# Third search: learning_rate=0.001, n_estimators=50 and subsample=0.1 (the
# values displayed for best_2params) plus gamma=1 are fixed on the estimator;
# only max_depth is varied (6 candidates) under a 3-split TimeSeriesSplit,
# scored by F1.
# Note: this rebinds xgb_clf, param_grid, tscv and grid_search.
xgb_clf = xgb.XGBClassifier(
    gamma=1,
    subsample=0.1,
    n_estimators=50,
    learning_rate=0.001,
    tree_method='gpu_hist',
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)
param_grid = dict(max_depth=[5, 8, 10, 12, 15, 18])
tscv = TimeSeriesSplit(n_splits=3)
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    cv=tscv,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X=train[features], y=train[target])


Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [None]:
# Keep the winner of the third search and predict labels for the test frame.
best_3model = grid_search.best_estimator_
best_3params = grid_search.best_params_
test_3predictions = best_3model.predict(test[features])

# zero_division=0 is passed to all three metrics (originally only precision
# had it) so a degenerate prediction yields 0 without an
# UndefinedMetricWarning; the returned values are unchanged.
precision = precision_score(test[target], test_3predictions, zero_division=0)
recall = recall_score(test[target], test_3predictions, zero_division=0)
f1 = f1_score(test[target], test_3predictions, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 0.7158327816135439
Recall: 0.7137513163478616
F1 Score: 0.7147905336806731
