In [2]:
! pip install xgboost



In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from collections import Counter
%matplotlib inline

In [14]:
combined_df = pd.read_csv('combined_df.csv', index_col='date', infer_datetime_format=True, parse_dates=True)

In [15]:
combined_df.head()

Unnamed: 0_level_0,class,volume,google trend,Reddit positive,Reddit negative,Google positive,Google negative,reddit buzzword score,google buzzword score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-08-18,0,1199.888264,1967,0.132,0.082,0.15,0.0,146.333495,9.736044
2017-08-19,0,381.309763,1844,0.122,0.09,0.059,0.027,187.989746,6.715108
2017-08-20,0,467.083022,1704,0.128,0.109,0.076,0.017,200.210838,7.921181
2017-08-21,0,691.74306,1702,0.129,0.104,0.111,0.076,162.376347,7.465451
2017-08-22,0,966.684858,1893,0.131,0.108,0.065,0.036,164.7605,8.167459


In [16]:
X = combined_df.drop(columns = 'class')
y = combined_df['class']

In [17]:
# check class balance
y.value_counts()

 0    1391
 1      36
-1      23
Name: class, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [19]:
# standardize X values
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
X_resampled, y_resampled = SMOTEENN(random_state=1).fit_resample(X_train, y_train)
# view the count of target classes with Counter
Counter(y_resampled)

Counter({-1: 1110, 0: 847, 1: 1100})

In [21]:
# define model
model = xgb.XGBClassifier()

In [28]:
# set parameters to compare
search_params = {'eta': [0.05, 0.1],
             'min_child_weight': [1, 2, 4, 6],
             'max_depth': [5, 6, 7, 8],
             'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'objective':['multi:sofprob']   
             }

In [29]:
gsx = GridSearchCV(
    model,
    search_params,
    cv = 3,
    n_jobs = -1,
    verbose=True)

In [30]:
gsx.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits






GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, v

In [31]:
gsx.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9,
              enable_categorical=False, eta=0.1, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.100000001, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
score = gsx.best_estimator_.score(X_test, y_test)
print("%0.2f accuracy" % (score))

0.85 accuracy


In [33]:
predicted = gsx.best_estimator_.predict(X_test)

In [34]:
classification_report(y_test, predicted)

'              precision    recall  f1-score   support\n\n          -1       0.11      0.40      0.17         5\n           0       0.97      0.88      0.92       277\n           1       0.00      0.00      0.00         8\n\n    accuracy                           0.85       290\n   macro avg       0.36      0.43      0.37       290\nweighted avg       0.93      0.85      0.89       290\n'