In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle
import statsmodels.api as sm
import fracdiff
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.DataFrame()
for i in range(1, 8):
    temp = pd.read_csv(f'DOGEUSDT-trades-2023-08-0{i}.zip', header=None)
    df = pd.concat([df, temp])

del temp
gc.collect()

df.columns = ['id', 'price', 'qty', 'quote_qty', 'time', 'is_buyer_maker', 'unknown']
df.drop(columns=['id', 'unknown'], inplace=True)
len(df)

832244

In [3]:
assert df.time.is_monotonic_increasing

In [4]:
df['buy_or_sell'] = df.is_buyer_maker.map({True: -1, False: 1})
df.drop(columns=['is_buyer_maker'], inplace=True)

In [5]:
df['cumvol'] = df.qty.cumsum()

In [6]:
df.head()

Unnamed: 0,price,qty,quote_qty,time,buy_or_sell,cumvol
0,0.07786,5056.0,393.66016,1690848000650,1,5056.0
1,0.07786,21744.0,1692.98784,1690848000650,1,26800.0
2,0.07785,4327.0,336.85695,1690848003405,-1,31127.0
3,0.07785,1248.0,97.1568,1690848003405,-1,32375.0
4,0.07785,57843.0,4503.07755,1690848003405,-1,90218.0


In [7]:
multiples = 5e5

price_df = df.price.groupby(df.cumvol//multiples).agg(['ohlc', 'mean', 'std', 'count'])
price_df.columns = ['open', 'high', 'low', 'close', 'price_mean', 'price_std', 'count']
price_df

Unnamed: 0_level_0,open,high,low,close,price_mean,price_std,count
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0.07786,0.07786,0.07751,0.07776,0.077682,0.000089,916
1.0,0.07776,0.07800,0.07776,0.07800,0.077852,0.000067,669
2.0,0.07799,0.07799,0.07771,0.07795,0.077860,0.000064,705
3.0,0.07795,0.07812,0.07789,0.07807,0.078003,0.000056,990
4.0,0.07807,0.07819,0.07797,0.07812,0.078086,0.000052,941
...,...,...,...,...,...,...,...
1075.0,0.07360,0.07366,0.07335,0.07335,0.073545,0.000077,614
1076.0,0.07335,0.07355,0.07329,0.07340,0.073413,0.000063,607
1077.0,0.07340,0.07354,0.07332,0.07332,0.073453,0.000063,895
1078.0,0.07332,0.07344,0.07320,0.07344,0.073299,0.000058,804


In [8]:
signed_qty_df = (df.buy_or_sell * df.qty).groupby(df.cumvol//multiples).agg([('sell_vol' , lambda x : x[x < 0].sum()) , ('buy_vol' , lambda x : x[x > 0].sum())])
signed_qty_df.head()

Unnamed: 0_level_0,sell_vol,buy_vol
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,-400461.0,80412.0
1.0,-197895.0,300288.0
2.0,-303069.0,215336.0
3.0,-390421.0,110778.0
4.0,-275204.0,223533.0


In [9]:
signed_dollar_df = (df.buy_or_sell * df.quote_qty).groupby(df.cumvol//multiples).agg([('sell_dollar' , lambda x : x[x < 0].sum()) , ('buy_dollar' , lambda x : x[x > 0].sum())])
signed_dollar_df.head()

Unnamed: 0_level_0,sell_dollar,buy_dollar
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,-31166.73932,6257.18176
1.0,-15392.64322,23358.15637
2.0,-23571.41901,16747.60677
3.0,-30357.26267,8614.53582
4.0,-21379.99699,17368.8054


In [10]:
qty_df = df.qty.groupby(df.cumvol//multiples).agg(['sum', 'mean', 'std'])
qty_df.columns = ['qty_sum', 'qty_mean', 'qty_std']
qty_df.head()

Unnamed: 0_level_0,qty_sum,qty_mean,qty_std
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,480873.0,6969.173913,8296.672566
1.0,498183.0,7548.227273,8044.121624
2.0,518405.0,6027.965116,9794.749725
3.0,501199.0,5389.236559,7424.55626
4.0,498737.0,5037.747475,7815.394582


In [11]:
dollar_df = df.quote_qty.groupby(df.cumvol//multiples).agg(['sum', 'mean', 'std'])
dollar_df.columns = ['dollar_sum', 'dollar_mean', 'dollar_std']
dollar_df.head()

Unnamed: 0_level_0,dollar_sum,dollar_mean,dollar_std
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,37423.92108,542.375668,645.834526
1.0,38750.79959,587.133327,625.730106
2.0,40319.02578,468.825881,761.770294
3.0,38971.79849,419.051597,577.276239
4.0,38748.80239,391.402044,607.286561


In [12]:
flow_df = pd.get_dummies(df.buy_or_sell).groupby(df.cumvol//multiples).agg('sum')
flow_df.columns = ['num_sell', 'num_buy']
flow_df.head()

Unnamed: 0_level_0,num_sell,num_buy
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,60.0,9.0
1.0,29.0,37.0
2.0,54.0,32.0
3.0,76.0,17.0
4.0,63.0,36.0


In [13]:
time_df = df.time.groupby(df.cumvol//multiples).agg(['first', 'last'])
time_df.columns = ['time_first', 'time_last']
time_df.head()

Unnamed: 0_level_0,time_first,time_last
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1690848000650,1690848015647
1.0,1690848016644,1690848072541
2.0,1690848072541,1690848204938
3.0,1690848204938,1690848278570
4.0,1690848278570,1690848323619


In [14]:
vwap_df = pd.DataFrame(dollar_df.dollar_sum / qty_df.qty_sum).rename(columns={0: 'vwap'})
vwap_df.head()

Unnamed: 0_level_0,vwap
cumvol,Unnamed: 1_level_1
0.0,0.077825
1.0,0.077784
2.0,0.077775
3.0,0.077757
4.0,0.077694


In [15]:
def round_lot_proportion(x, n):
    return np.isclose(x.qty.values/n, x.qty.values//n).sum()/len(x)

round_lot_df = df.groupby(df.cumvol//multiples).apply(round_lot_proportion, 50).rename('order_size_x50_proportion')

In [16]:
new_df = pd.concat([price_df, qty_df, signed_qty_df, dollar_df, signed_dollar_df, flow_df, time_df, vwap_df, round_lot_df], axis=1)

In [17]:
new_df

Unnamed: 0_level_0,open,high,low,close,price_mean,price_std,count,qty_sum,qty_mean,qty_std,...,dollar_mean,dollar_std,sell_dollar,buy_dollar,num_sell,num_buy,time_first,time_last,vwap,order_size_x50_proportion
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.07786,0.07786,0.07778,0.07780,0.077818,0.000022,69,480873.0,6969.173913,8296.672566,...,542.375668,645.834526,-31166.73932,6257.18176,60.0,9.0,1690848000650,1690848015647,0.077825,0.072464
1.0,0.07779,0.07779,0.07776,0.07778,0.077782,0.000009,66,498183.0,7548.227273,8044.121624,...,587.133327,625.730106,-15392.64322,23358.15637,29.0,37.0,1690848016644,1690848072541,0.077784,0.015152
2.0,0.07778,0.07779,0.07776,0.07777,0.077776,0.000009,86,518405.0,6027.965116,9794.749725,...,468.825881,761.770294,-23571.41901,16747.60677,54.0,32.0,1690848072541,1690848204938,0.077775,0.046512
3.0,0.07777,0.07780,0.07771,0.07772,0.077753,0.000025,93,501199.0,5389.236559,7424.556260,...,419.051597,577.276239,-30357.26267,8614.53582,76.0,17.0,1690848204938,1690848278570,0.077757,0.225806
4.0,0.07772,0.07773,0.07763,0.07763,0.077685,0.000023,99,498737.0,5037.747475,7815.394582,...,391.402044,607.286561,-21379.99699,17368.80540,63.0,36.0,1690848278570,1690848323619,0.077694,0.202020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10792.0,0.07350,0.07350,0.07345,0.07346,0.073485,0.000014,44,531825.0,12086.931818,18098.182022,...,888.223805,1330.011553,-32684.80547,6397.04193,33.0,11.0,1691451891858,1691451924631,0.073486,0.090909
10793.0,0.07346,0.07347,0.07340,0.07341,0.073449,0.000018,90,499438.0,5549.311111,9106.283361,...,407.601154,668.872650,-20012.03220,16672.07165,46.0,44.0,1691451924631,1691452317896,0.073451,0.088889
10794.0,0.07341,0.07352,0.07340,0.07350,0.073476,0.000037,89,500442.0,5622.943820,9354.668344,...,413.107222,687.331329,-20574.31140,16192.23139,52.0,37.0,1691452317896,1691452467725,0.073468,0.123596
10795.0,0.07349,0.07355,0.07348,0.07355,0.073508,0.000022,94,493093.0,5245.670213,9869.879492,...,385.562189,725.403926,-21642.07818,14600.76763,50.0,44.0,1691452467870,1691452656221,0.073501,0.117021


### extract new features

In [18]:
# extract new features
df = new_df
window = 100

abs_log_diff_close = np.abs(np.log(df.close).diff())
def amihud(X):
    idx = X.index
    y = abs_log_diff_close[idx].to_numpy()
    X = df.dollar_sum[idx].to_numpy()
    X, y = X[~np.isnan(y)], y[~np.isnan(y)]
    res = sm.OLS(y, X).fit()
    #print(res.summary())
    return res.tvalues[0]

In [19]:
df.loc[df.isna().any(1)]
df.fillna(0, inplace=True)

In [20]:
amihud_df = df[['dollar_sum']].rolling(window).apply(amihud, raw=False).rename(columns={'dollar_sum': 'amihud'})

In [21]:
def vpin(X):
    idx = X.index
    buy = df.buy_vol[idx].to_numpy()
    sell = df.sell_vol[idx].to_numpy()
    qty = df.qty_sum[idx].to_numpy()
    return np.sum(np.abs(buy+sell))/window/(np.sum(qty))

In [22]:
vpin_df = df[['buy_vol']].rolling(window).apply(vpin, raw=False).rename(columns={'buy_vol': 'vpin'})
vpin_df

Unnamed: 0_level_0,vpin
cumvol,Unnamed: 1_level_1
0.0,
1.0,
2.0,
3.0,
4.0,
...,...
10792.0,0.005488
10793.0,0.005439
10794.0,0.005351
10795.0,0.005270


In [23]:
def pakinson(X):
    idx = X.index
    high = df.high[idx].to_numpy()
    low = df.low[idx].to_numpy()
    return np.sum(np.log(high/low)**2)/window/4/np.log(2)

In [24]:
pakinson_df = df[['high']].rolling(window).apply(pakinson, raw=False).rename(columns={'high': 'pakinson'})
pakinson_df

Unnamed: 0_level_0,pakinson
cumvol,Unnamed: 1_level_1
0.0,
1.0,
2.0,
3.0,
4.0,
...,...
10792.0,3.108596e-07
10793.0,3.087312e-07
10794.0,3.159574e-07
10795.0,3.181630e-07


In [25]:
df = pd.concat([df, amihud_df, vpin_df, pakinson_df], axis=1)

In [26]:
df['buy_ratio'] = df.buy_vol / df.qty_sum

In [27]:
feat1 = ['open', 'high', 'low', 'close', 'price_mean', 'price_std',
       'qty_sum', 'qty_mean', 'qty_std', 'sell_vol', 'buy_vol', 'dollar_sum',
       'dollar_mean', 'dollar_std', 'sell_dollar', 'buy_dollar', 'vwap',
       ]
feat2  = ['order_size_x50_proportion', 'amihud', 'vpin', 'pakinson',
       'buy_ratio']

In [28]:
fracdiffed_vwap = fracdiff.fdiff(df[['vwap']], n = 0.5)
fracdiffed_vwap = fracdiffed_vwap.reshape(1, -1)[0]

In [29]:
diffed_feat_df = df[feat1].diff().rename(columns={x: x+'_diff' for x in feat1})

In [30]:
signal = np.sign(np.log(df.vwap).diff()).shift(-1).rename('signal').to_frame()
print(signal.value_counts())
signal.head()

signal
-1.0      5418
 1.0      5201
 0.0       102
dtype: int64


Unnamed: 0_level_0,signal
cumvol,Unnamed: 1_level_1
0.0,-1.0
1.0,-1.0
2.0,-1.0
3.0,-1.0
4.0,-1.0


In [31]:
complete_df = pd.concat([diffed_feat_df, df[feat2], signal], axis=1)

In [32]:
complete_df['fracdiffed_vwap'] = fracdiffed_vwap

In [33]:
complete_df.dropna(inplace=True)
complete_df

Unnamed: 0_level_0,open_diff,high_diff,low_diff,close_diff,price_mean_diff,price_std_diff,qty_sum_diff,qty_mean_diff,qty_std_diff,sell_vol_diff,...,sell_dollar_diff,buy_dollar_diff,vwap_diff,order_size_x50_proportion,amihud,vpin,pakinson,buy_ratio,signal,fracdiffed_vwap
cumvol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
99.0,-0.00008,-0.00008,-0.00004,-0.00007,-0.000037,0.000004,1436.0,1537.453061,4055.074832,11476.0,...,897.81645,998.77287,-0.000020,0.112245,14.104725,0.004583,3.239762e-07,0.149909,-1.0,0.077393
100.0,-0.00007,-0.00005,-0.00006,-0.00006,-0.000054,-0.000003,-3614.0,-1064.067695,-3073.269320,1960.0,...,178.92558,-132.62134,-0.000064,0.065041,14.273816,0.004588,3.240247e-07,0.147663,-1.0,0.077328
101.0,-0.00005,-0.00007,-0.00011,-0.00011,-0.000099,0.000011,13049.0,-809.376505,-2509.913863,-23244.0,...,-1758.33176,-795.30758,-0.000091,0.126582,14.346049,0.004643,3.321935e-07,0.123816,-1.0,0.077238
102.0,-0.00012,-0.00004,-0.00001,0.00008,-0.000013,-0.000005,-10160.0,2443.688867,474.069092,299985.0,...,23172.03889,22377.55925,-0.000020,0.102273,14.596748,0.004669,3.365569e-07,0.708133,-1.0,0.077218
103.0,0.00008,0.00001,-0.00009,-0.00016,-0.000066,0.000029,-7238.0,-2839.512670,-771.979070,-147531.0,...,-11374.93743,-11969.03640,-0.000072,0.063218,14.140644,0.004633,3.535857e-07,0.403311,-1.0,0.077147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10791.0,0.00008,0.00001,0.00006,-0.00002,0.000039,-0.000011,36277.0,11469.854953,11333.144051,-200069.0,...,-14708.75867,-12022.41864,0.000041,0.156250,13.135709,0.005434,3.115955e-07,0.352394,-1.0,0.073513
10792.0,-0.00002,-0.00003,-0.00005,-0.00004,-0.000028,0.000003,21759.0,-3852.630682,-84.091868,-114438.0,...,-8403.90617,-6818.56718,-0.000027,0.090909,13.294043,0.005488,3.108596e-07,0.163710,-1.0,0.073486
10793.0,-0.00004,-0.00003,-0.00005,-0.00005,-0.000036,0.000004,-32387.0,-6537.620707,-8991.898661,172322.0,...,12672.77327,10275.02972,-0.000036,0.088889,13.289583,0.005439,3.087312e-07,0.454511,1.0,0.073451
10794.0,-0.00005,0.00005,0.00000,0.00009,0.000026,0.000019,1004.0,73.632709,248.384983,-7531.0,...,-562.27920,-479.84026,0.000017,0.123596,13.276845,0.005351,3.159574e-07,0.440557,1.0,0.073468


In [34]:
complete_df.signal = complete_df.signal.map({0:-1, 1:1, -1:-1})

In [35]:
returns = df.vwap.pct_change().shift(-1)
returns = returns.loc[complete_df.index]

returns = returns.reset_index(drop=True)


### model training

In [36]:
complete_df.reset_index(inplace=True)

In [37]:
X_raw = complete_df.drop(columns=['signal'])
y = complete_df['signal']

In [38]:
# examine multicollinearity issue

X1 = add_constant(X_raw)

vif = pd.DataFrame()
vif["Feature"] = X1.columns
vif["VIF"] = [variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])]
vif["VIF"] = vif["VIF"].round(4)
vif.round(4)

Unnamed: 0,Feature,VIF
0,const,6094.6512
1,cumvol,1.5074
2,open_diff,4.0627
3,high_diff,20.7422
4,low_diff,20.2726
5,close_diff,3.694
6,price_mean_diff,45.3024
7,price_std_diff,9.9258
8,qty_sum_diff,inf
9,qty_mean_diff,6894.3371


In [39]:
# only include features that have VIF < 15
vif_threshold = 15.
selected_cols = vif[vif['VIF'] <= vif_threshold]['Feature'].tolist()
X = X_raw[selected_cols]
X.head()

Unnamed: 0,cumvol,open_diff,close_diff,price_std_diff,order_size_x50_proportion,amihud,vpin,pakinson,buy_ratio,fracdiffed_vwap
0,99.0,-8e-05,-7e-05,4e-06,0.112245,14.104725,0.004583,3.239762e-07,0.149909,0.077393
1,100.0,-7e-05,-6e-05,-3e-06,0.065041,14.273816,0.004588,3.240247e-07,0.147663,0.077328
2,101.0,-5e-05,-0.00011,1.1e-05,0.126582,14.346049,0.004643,3.321935e-07,0.123816,0.077238
3,102.0,-0.00012,8e-05,-5e-06,0.102273,14.596748,0.004669,3.365569e-07,0.708133,0.077218
4,103.0,8e-05,-0.00016,2.9e-05,0.063218,14.140644,0.004633,3.535857e-07,0.403311,0.077147


In [40]:
ratio = 0.8
X_train = X.iloc[:int(len(X)*ratio)]
X_test = X.iloc[int(len(X)*ratio):]
y_train = y.iloc[:int(len(y)*ratio)]
y_test = y.iloc[int(len(y)*ratio):]

In [41]:
len(X_train), len(X_test)

(8497, 2125)

In [42]:
def getTEvents(feature, h=.0005):
    tEvents, sPos, sNeg=[],0,0
    diff=feature.diff()
    for i in diff.index[1:]:
        sPos,sNeg=max(0, sPos+diff.loc[i]), min(0, sNeg+diff.loc[i])
        if sNeg<-h:
            sNeg=0; tEvents.append(i)
        elif sPos>h:
            sPos=0; tEvents.append(i)
    return pd.Index(tEvents)

In [43]:
idx = getTEvents(X_train.vpin) # get tail events

In [44]:
w = np.abs(returns[idx])/np.sum(np.abs(returns[idx])) * len(idx)

### RFE

In [45]:
# estimator = RandomForestClassifier(n_estimators=500, random_state=42)
# selector = RFE(estimator, n_features_to_select=10, step=1)
# selector = selector.fit(X_train, y_train)

# selected_features = np.where(selector.support_)[0]
# print("Selected Features:", selected_features)

In [46]:
# X.columns[selected_features]

In [47]:
# X = X_raw[X_raw.columns[selected_features]]
# X_train = X.iloc[:int(len(X)*ratio)]
# X_test = X.iloc[int(len(X)*ratio):]
# y_train = y.iloc[:int(len(y)*ratio)]
# y_test = y.iloc[int(len(y)*ratio):]

### XGBoost Classifier

In [48]:
model = xgb.XGBClassifier(n_estimators=500, tree_method='gpu_hist')

In [None]:
param_grid = {
    'n_estimators': [100, 300, 500, 800],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.05, 0.1, 0.20],
    'min_child_weight': [1, 10, 100]
}

random_cv = RandomizedSearchCV(estimator=model,
                              param_distributions=param_grid,
                              cv=5, n_iter=100,
                              scoring='f1_macro',
                              verbose=1,
                              return_train_score=True,
                              random_state=42)

random_cv.fit(X_train.loc[idx], y_train.map({-1: 0, 1: 1}).loc[idx], sample_weight=w)
best_model = random_cv.best_estimator_

In [None]:
best_model = random_cv.best_estimator_
y_pred_proba = random_cv.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
y_pred[y_pred == 0] = -1
print(classification_report(y_test, y_pred))

In [None]:
y_pred_confident = y_pred.copy()
y_pred_confident[np.max(y_pred_proba, axis=1) < 0.75] = 0
y_pred_confident

In [None]:
print(classification_report(y_test[y_pred_confident != 0], y_pred[y_pred_confident != 0])) 

In [None]:
np.abs(returns.loc[y_test.index].loc[y_pred_confident != 0]).describe().apply(lambda x: format(x, 'f'))

In [None]:
np.abs(returns.loc[y_test.index].loc[y_pred_confident == 0]).describe().apply(lambda x: format(x, 'f'))

In [None]:
np.abs(returns.loc[y_test.index]).describe()

In [None]:
X_train.vpin.describe()

### GradientBoostingClassifier

In [None]:
model = GradientBoostingClassifier(n_estimators=500)

param_grid = {
    'n_estimators': [100, 300, 500, 800],
    'min_samples_split': [2, 4, 6],
    'learning_rate': [0.05, 0.1, 0.20],
    'max_depth': [3, 6, 9]
}

random_cv = RandomizedSearchCV(estimator=model,
                              param_distributions=param_grid,
                              cv=5, n_iter=100,
                              scoring='f1_macro',
                              verbose=1,
                              return_train_score=True,
                              random_state=42)

random_cv.fit(X_train.loc[idx], y_train.map({-1: 0, 1: 1}).loc[idx], sample_weight=w)
random_cv.best_estimator_

In [None]:
best_model = random_cv.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
y_pred[y_pred == 0] = -1
print(classification_report(y_test, y_pred))

In [None]:
y_pred_confident = y_pred.copy()
y_pred_confident[np.max(y_pred_proba, axis=1) < .85] = 0
y_pred_confident

In [None]:
print(classification_report(y_test[y_pred_confident != 0], y_pred[y_pred_confident != 0])) 

### AdaBoost

In [None]:
base_estimator = DecisionTreeClassifier(random_state=42)
model = AdaBoostClassifier(n_estimators=100)

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.20]
}

gs_cv = GridSearchCV(model,
                     param_grid,
                     cv=5,
                     scoring='f1_macro',
                     verbose=1,
                     return_train_score=True)

gs_cv.fit(X_train.loc[idx], y_train.map({-1: 0, 1: 1}).loc[idx], sample_weight=w)
gs_cv.best_estimator_

In [None]:
y_pred_proba = gs_cv.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
y_pred[y_pred == 0] = -1
print(classification_report(y_test, y_pred))

In [None]:
y_pred_confident = y_pred.copy()
y_pred_confident[np.max(y_pred_proba, axis=1) < .60] = 0

print(classification_report(y_test[y_pred_confident != 0], y_pred[y_pred_confident != 0]))

### Random Forests

In [None]:
model = RandomForestClassifier(n_estimators=500)

In [None]:

param_grid = {
    'n_estimators': [100, 300, 500, 800],
    'min_samples_split': [2, 4, 6, 8],
    'max_depth': [3, 6, 9, 12]
}

random_cv = RandomizedSearchCV(estimator=model,
                              param_distributions=param_grid,
                              cv=5, n_iter=100,
                              scoring='f1_macro',
                              verbose=1,
                              return_train_score=True,
                              random_state=42)

random_cv.fit(X_train.loc[idx], y_train.map({-1: 0, 1: 1}).loc[idx], sample_weight=w)
random_cv.best_estimator_

In [None]:
y_pred_proba = random_cv.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
y_pred[y_pred == 0] = -1
print(classification_report(y_test, y_pred))

In [None]:
y_pred_confident = y_pred.copy()
y_pred_confident[np.max(y_pred_proba, axis=1) < 0.6] = 0

print(classification_report(y_test[y_pred_confident != 0], y_pred[y_pred_confident != 0])) 