In [23]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [2]:
df = pd.DataFrame(np.random.normal(0, 1, (1010, 5)))
df[5] = df.sum(axis = 1)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,-0.936439,-0.728845,0.456236,-0.612133,2.23483,0.413649
1,-0.172024,-0.424435,2.054798,-1.547247,-0.227933,-0.316841
2,0.731107,-0.74969,0.041676,-2.760639,-0.193917,-2.931463
3,0.09776,1.407724,0.911,-0.023172,0.663986,3.057298
4,-0.200179,-0.127733,0.108732,0.496057,0.604089,0.880965


In [3]:
df[6] = [1 if i>4 else 0 for i in df[5]]
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.936439,-0.728845,0.456236,-0.612133,2.23483,0.413649,0
1,-0.172024,-0.424435,2.054798,-1.547247,-0.227933,-0.316841,0
2,0.731107,-0.74969,0.041676,-2.760639,-0.193917,-2.931463,0
3,0.09776,1.407724,0.911,-0.023172,0.663986,3.057298,0
4,-0.200179,-0.127733,0.108732,0.496057,0.604089,0.880965,0


In [4]:
df.drop(columns=[5], inplace = True)
df.head()

Unnamed: 0,0,1,2,3,4,6
0,-0.936439,-0.728845,0.456236,-0.612133,2.23483,0
1,-0.172024,-0.424435,2.054798,-1.547247,-0.227933,0
2,0.731107,-0.74969,0.041676,-2.760639,-0.193917,0
3,0.09776,1.407724,0.911,-0.023172,0.663986,0
4,-0.200179,-0.127733,0.108732,0.496057,0.604089,0


In [5]:
df.rename(columns = {6:'label'}, inplace = True)
df.head()

Unnamed: 0,0,1,2,3,4,label
0,-0.936439,-0.728845,0.456236,-0.612133,2.23483,0
1,-0.172024,-0.424435,2.054798,-1.547247,-0.227933,0
2,0.731107,-0.74969,0.041676,-2.760639,-0.193917,0
3,0.09776,1.407724,0.911,-0.023172,0.663986,0
4,-0.200179,-0.127733,0.108732,0.496057,0.604089,0


In [6]:
df.label.value_counts()

0    972
1     38
Name: label, dtype: int64

In [7]:
X = df.drop(columns = ['label'])
y = df['label']
print(X.head())

          0         1         2         3         4
0 -0.936439 -0.728845  0.456236 -0.612133  2.234830
1 -0.172024 -0.424435  2.054798 -1.547247 -0.227933
2  0.731107 -0.749690  0.041676 -2.760639 -0.193917
3  0.097760  1.407724  0.911000 -0.023172  0.663986
4 -0.200179 -0.127733  0.108732  0.496057  0.604089


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42, stratify=y)

In [9]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
print(f1_score(y_test, xgb.predict(X_test)))
print(accuracy_score(y_test, xgb.predict(X_test)))

0.7000000000000001
0.9820359281437125


In [20]:
class_weight = int(y_train.value_counts()[0]/y_train.value_counts()[1])
xgb = XGBClassifier(scale_pos_weight = class_weight, seed = 42)

In [21]:
xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=25, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [22]:
print(f1_score(y_test, xgb.predict(X_test)))
print(accuracy_score(y_test, xgb.predict(X_test)))

0.761904761904762
0.9850299401197605


In [26]:
params = {'colsample_bytree' : .4, 'learning_rate' : 0.001}