In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[K     |████████████████████████████████| 200.3 MB 5.4 MB/s eta 0:00:01    |███████████                     | 69.4 MB 6.2 MB/s eta 0:00:21
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
import xgboost as xgb

In [41]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
train = pd.read_csv('./training.csv')
test = pd.read_csv('./test.csv')
check_agreement = pd.read_csv('./check_agreement.csv')
check_correlation = pd.read_csv('./check_correlation.csv')

In [22]:
def add_features(df):
    df['flight_dist_sig'] = df['FlightDistance']/df['FlightDistanceError']
    #df['flight_dist_sig2'] = (df['FlightDistance']/df['FlightDistanceError'])**2
    df['NEW_IP_dira'] = df['IP']*df['dira']
    df['NEW_FD_SUMP'] = df['FlightDistance']/(df['p0_p']+df['p1_p']+df['p2_p'])
    df['NEW5_lt'] = df['LifeTime']*(df['p0_IP']+df['p1_IP']+df['p2_IP'])/3
    df['p_track_Chi2Dof_MAX'] = df.loc[:,['p0_track_Chi2Dof','p1_track_Chi2Dof','p2_track_Chi2Dof']].max(axis=1)
    #df['p0p2_ip_ratio'] = df['IP']/df['IP_p0p2']
    #df['p1p2_ip_ratio'] = df['IP']/df['IP_p1p2']
    #df['DCA_MAX'] = df.loc[:, ['DOCAone','DOCAtwo','DOCAthree']].max(axis=1)
    #df['iso_bdt_min'] = df.loc[:,['p0_IsoBDT','p1_IsoBDT','p2_IsoBDT']].min(axis=1)
    #df['iso_min'] = df.loc[:,['isolationa','isolationb','isolationc','isolationd','isolatione','isolationf']].min(axis=1)
    #df['NEW_FD_LT'] = df['FlightDistance']/df['LifeTime']
    
    return df

In [23]:
train = add_features(train)
test = add_features(test)

In [24]:
check_agreement = add_features(check_agreement)
check_correlation = add_features(check_correlation)

In [25]:
filter_out =['id', 'min_ANNmuon', 'production', 
             'mass', 'signal','SPDhits','p0_track_Chi2Dof','CDF1', 
             'CDF2', 'CDF3','isolationb', 'isolationc','p0_pt', 
             'p1_pt', 'p2_pt', 'p0_p', 'p1_p', 'p2_p', 'p0_eta',
             'p1_eta', 'p2_eta','DOCAone', 'DOCAtwo', 'DOCAthree']

features = list(f for f in train.columns if f not in filter_out)

In [26]:
train_eval = train[train['min_ANNmuon']>0.4]

In [44]:
rf = GradientBoostingClassifier(n_estimators=575, max_depth=6,
                                learning_rate=0.15, subsample=0.7, random_state=369)

In [45]:
rf.fit(train[features], train['signal'])

GradientBoostingClassifier(learning_rate=0.15, max_depth=6, n_estimators=575,
                           random_state=369, subsample=0.7)

In [40]:
params = {"objective": "binary:logistic",
          "learning_rate": 0.2,
          "max_depth": 8,
          'gamma': 0.01,
          "min_child_weight": 3,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          'nthread': 4,
          "seed": 1}
model = xgb.XGBClassifier(**params)
model.fit(train[features], train['signal'])

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.01, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, nthread=4, num_parallel_tree=None,
              predictor=None, ...)

In [47]:
pred_rf = rf.predict_proba(test[features])
pred_xgb = model.predict_proba(test[features])

sub = pd.read_csv('./sample_submission.csv')

In [59]:
pred_rf

array([[0.99254109, 0.00745891],
       [0.98042736, 0.01957264],
       [0.9606825 , 0.0393175 ],
       ...,
       [0.99682636, 0.00317364],
       [0.99856819, 0.00143181],
       [0.58447035, 0.41552965]])

In [56]:
pred_xgb[:,0]

array([0.9394928 , 0.966336  , 0.03287816, ..., 0.95944935, 0.97166973,
       0.33059698], dtype=float32)

In [58]:
train['signal']

0        0
1        0
2        0
3        0
4        0
        ..
67548    1
67549    1
67550    1
67551    1
67552    1
Name: signal, Length: 67553, dtype: int64

In [60]:
test_prob = 0.5*pred_xgb + 0.5*pred_rf

In [61]:
test_prob

array([[0.96601696, 0.03398304],
       [0.97338168, 0.0266183 ],
       [0.49678033, 0.50321967],
       ...,
       [0.97813786, 0.02186214],
       [0.98511896, 0.01488103],
       [0.45753367, 0.54246633]])

In [64]:
sub['prediction'] = test_prob[:,1]
sub

Unnamed: 0,id,prediction
0,14711831,0.033983
1,16316387,0.026618
2,6771382,0.503220
3,686045,0.023452
4,8755882,0.055916
...,...,...
855814,6977440,0.372549
855815,5731566,0.023732
855816,559204,0.021862
855817,10478880,0.014881


In [66]:
sub.to_csv('sub_Ensemble1.csv',index=False)