In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch, os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import utils, scoring
from torch.nn import BCEWithLogitsLoss
from torch.optim import lr_scheduler
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
from copy import deepcopy
from torch.nn.functional import relu
os.environ["CUDA_VISIBLE_DEVICES"]= "3" 

# reading data

In [4]:
DATA_PATH = "../data"
train_df, test_df = utils.load_data_csv(DATA_PATH, utils.SIMPLE_FEATURE_COLUMNS)

train_hit = pd.read_csv('../data/train_closest_hits_features.csv')
test_hit = pd.read_csv('../data/test_closest_hits_features.csv')

print ('train shape {} test shape {}'.format(train_df.shape, test_df.shape))
print ('train shape {} test shape {}'.format(train_hit.shape, test_hit.shape))


train_df = pd.concat([train_df, train_hit], axis=1)
test_df = pd.concat([test_df, test_hit], axis=1)
del train_hit, test_hit
print ('After concating train shape {} test shape {}'.format(train_df.shape, test_df.shape))

train_df.drop('Unnamed: 0', axis = 1, inplace=True)
test_df.drop('Unnamed: 0', axis = 1, inplace=True)

train shape (5445705, 67) test shape (726095, 65)
train shape (5445705, 25) test shape (726095, 25)
After concating train shape (5445705, 92) test shape (726095, 90)


In [28]:
train_cols = [x.replace('[','').replace(']','') for x in train_df.columns.tolist() ]
test_cols = [x.replace('[','').replace(']','') for x in test_df.columns.tolist() ]
train_df.columns = train_cols
test_df.columns = test_cols

## rank gauss transformation

In [31]:
def rank_gauss(x):
    from scipy.special import erfinv
    N = x.shape[0]
    temp = x.argsort()
    rank_x = temp.argsort() / N
    rank_x -= rank_x.mean()
    rank_x *= 2
    efi_x = erfinv(rank_x)
    efi_x -= efi_x.mean()
    return efi_x

In [32]:
train_df.head(1)

Unnamed: 0,ncl0,ncl1,ncl2,ncl3,avg_cs0,avg_cs1,avg_cs2,avg_cs3,ndof,MatchedHit_TYPE0,...,14,15,16,17,18,19,20,21,22,23
0,-0.242089,0.690287,0.202455,0.34211,-0.497902,-0.466256,-0.564253,-0.464437,-0.766638,2,...,-0.631916,-0.644717,0.604388,0.604201,0.587477,0.592659,0.592075,0.591224,0.589062,0.587932


In [33]:
type_cols = [x for x in train_df.columns if 'TYPE' in x]

In [34]:
for i, col in enumerate(train_df.drop( type_cols + ['label', 'weight'], axis = 1)):
    train_df[col] = rank_gauss(train_df[col].values)

In [36]:
test_df.columns

Index(['ncl0', 'ncl1', 'ncl2', 'ncl3', 'avg_cs0', 'avg_cs1', 'avg_cs2',
       'avg_cs3', 'ndof', 'MatchedHit_TYPE0', 'MatchedHit_TYPE1',
       'MatchedHit_TYPE2', 'MatchedHit_TYPE3', 'MatchedHit_X0',
       'MatchedHit_X1', 'MatchedHit_X2', 'MatchedHit_X3', 'MatchedHit_Y0',
       'MatchedHit_Y1', 'MatchedHit_Y2', 'MatchedHit_Y3', 'MatchedHit_Z0',
       'MatchedHit_Z1', 'MatchedHit_Z2', 'MatchedHit_Z3', 'MatchedHit_DX0',
       'MatchedHit_DX1', 'MatchedHit_DX2', 'MatchedHit_DX3', 'MatchedHit_DY0',
       'MatchedHit_DY1', 'MatchedHit_DY2', 'MatchedHit_DY3', 'MatchedHit_DZ0',
       'MatchedHit_DZ1', 'MatchedHit_DZ2', 'MatchedHit_DZ3', 'MatchedHit_T0',
       'MatchedHit_T1', 'MatchedHit_T2', 'MatchedHit_T3', 'MatchedHit_DT0',
       'MatchedHit_DT1', 'MatchedHit_DT2', 'MatchedHit_DT3', 'Lextra_X0',
       'Lextra_X1', 'Lextra_X2', 'Lextra_X3', 'Lextra_Y0', 'Lextra_Y1',
       'Lextra_Y2', 'Lextra_Y3', 'NShared', 'Mextra_DX20', 'Mextra_DX21',
       'Mextra_DX22', 'Mextra_DX23', 'Me

In [37]:
for i, col in enumerate(test_df.drop(type_cols, axis = 1)):
    test_df[col] = rank_gauss(test_df[col].values)

In [39]:
test_df.head(1)

Unnamed: 0_level_0,ncl0,ncl1,ncl2,ncl3,avg_cs0,avg_cs1,avg_cs2,avg_cs3,ndof,MatchedHit_TYPE0,...,14,15,16,17,18,19,20,21,22,23
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.055707,-0.344936,-0.076376,0.35326,-0.325523,-1.153343,0.213742,0.711829,-1.001479,2,...,-0.329172,-0.325262,0.190187,0.198749,0.206819,0.208897,0.403182,0.202787,0.208456,0.211086


In [40]:
# train_df = all_df[all_df.istrain == 1].drop('istrain', axis = 1)
# test_df = all_df[all_df.istrain == 0].drop(['istrain','label', 'weight'] , axis = 1)
train_part, valid_df = train_test_split(train_df, test_size = 0.10, random_state = 0, shuffle=True)
train_features = [x for x in train_df.columns.tolist() if x not in ['label', 'weight'] ]
X_train = train_part[train_features]
y_train = train_part[['label']]
w_train = train_part[['weight']]

X_valid = valid_df[train_features]
y_valid = valid_df['label']
w_valid = valid_df['weight']

In [41]:
test_df.shape, train_df.shape, X_valid.shape, X_train.shape

((726095, 89), (5445705, 91), (544571, 89), (4901134, 89))

In [42]:
X_train.head(2)

Unnamed: 0,ncl0,ncl1,ncl2,ncl3,avg_cs0,avg_cs1,avg_cs2,avg_cs3,ndof,MatchedHit_TYPE0,...,14,15,16,17,18,19,20,21,22,23
5342123,-0.981942,0.651043,-0.760991,-0.77534,-0.97517,0.766722,-0.967815,-0.418826,-0.312014,2,...,-0.529434,-0.525633,-0.215047,-0.164716,-0.210305,-0.15064,-0.178574,-0.250939,-0.209326,-0.311239
4394750,0.152488,0.234626,0.381504,0.466911,-0.329981,-0.537258,0.16002,0.234259,-0.164703,2,...,-0.673702,-0.673254,-0.739832,-0.822045,-0.824762,-0.68923,-0.744055,-0.819817,-0.856018,-0.786825


## Model

In [60]:
class MyModel(torch.nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, 32)
        self.fc2 = torch.nn.Linear(32, 16)
        self.fc3 = torch.nn.Linear(16, 4) 
        self.fc4 = torch.nn.Linear(4, 1) 
            
    def forward(self, x):
        fc1_op = relu(self.fc1(x))
        fc2_op = relu(self.fc2(fc1_op))
        fc3_op = relu(self.fc3(fc2_op))
        return self.fc4(fc3_op)

In [2]:
## 
# 4901134/2048

In [62]:
def train(x, y, model, epochs, bs, lr):
    iters = len(x)//bs
    x , y = torch.from_numpy(x.values).float().cuda(), torch.from_numpy(y.values).float().cuda()
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    floss = BCEWithLogitsLoss()
    losses = []
    best_score = 0
    
    for e in tqdm(range(epochs)):
        cosine_scheduler = lr_scheduler.CosineAnnealingLR(optim, 2527, eta_min=0)
        ind = np.random.permutation(range(len(x)))
        x, y = x[ind], y[ind]
        for i in range(iters):
            cosine_scheduler.step()
            batch_x, batch_y = x[i*bs:(i+1)*bs], y[i*bs:(i+1)*bs]
            optim.zero_grad()
            loss = floss(model(batch_x), batch_y)
            losses.append(loss.item())
            loss.backward()
            optim.step()
            
        temp_model = deepcopy(model)
        temp_model.eval()
        train_pred = predict(X_train, temp_model)
        valid_pred = predict(X_valid, temp_model)
        auc_valid = roc_auc_score(y_valid.values, valid_pred, sample_weight = w_valid.values)
        auc_train = roc_auc_score(y_train.label.values, train_pred, sample_weight = w_train.values)

        valid_score = scoring.rejection90(y_valid.values, valid_pred, sample_weight = w_valid.values)
        train_score = scoring.rejection90(y_train.label.values, train_pred, sample_weight = w_train.values)
        if valid_score > best_score:
            best_score = valid_score
            best_model = temp_model
            best_model_wts = temp_model.state_dict()
            save_checkpoint(e,{
                    'epoch': e + 1,
                    'state_dict': temp_model.state_dict(),
                    'best_score': best_score,
                    'optimizer' : optim.state_dict(),
                })
            print(f'Epoch {e}: valid score {best_score}, train_score {train_score}, train_auc {auc_train}, valid_auc {auc_valid}')
            
    return best_model

In [63]:
path_to_checkpoint = '../models/NN_models_3_32init'
def save_checkpoint(epoch, state, path_to_checkpoint = path_to_checkpoint, filename = '_ckpt.pth.tar'):
    filepath = os.path.join(path_to_checkpoint, "epoch_" + str(epoch) + filename)
    torch.save(state, filepath)

In [64]:
def predict(x, model):
    x = torch.from_numpy(x.values).float().cuda()
    return torch.nn.functional.sigmoid(model(x)).cpu().detach().numpy()[:, 0]

In [66]:
X_train.shape

(4901134, 89)

In [67]:
np.random.seed(0)
torch.manual_seed(0)
model = MyModel(X_train.shape[1]).cuda()
my_best_model = train(X_train, y_train, model, epochs= 250, bs = 2048, lr = 0.01 )

  0%|          | 1/250 [00:22<1:34:56, 22.88s/it]

Epoch 0: valid score 0.6747979020046352, train_score 0.6703029689198009, train_auc 0.8444870170745933, valid_auc 0.8481092741159438


  1%|          | 2/250 [00:44<1:32:21, 22.34s/it]

Epoch 1: valid score 0.6892409605665334, train_score 0.6913952185497753, train_auc 0.8553819854026998, valid_auc 0.8561496435297118


  1%|          | 3/250 [01:06<1:31:55, 22.33s/it]

Epoch 2: valid score 0.704508150902683, train_score 0.7062492399022919, train_auc 0.8611747080550567, valid_auc 0.8623623763234125


  2%|▏         | 4/250 [01:29<1:32:01, 22.45s/it]

Epoch 3: valid score 0.7231799174200926, train_score 0.7188314556126008, train_auc 0.866709938387376, valid_auc 0.8696666756432866


  2%|▏         | 5/250 [01:52<1:31:44, 22.47s/it]

Epoch 4: valid score 0.7344688043624708, train_score 0.725435874526409, train_auc 0.8706665546315574, valid_auc 0.8744852676587198


  2%|▏         | 6/250 [02:14<1:31:12, 22.43s/it]

Epoch 5: valid score 0.7377304846198529, train_score 0.7303722315346607, train_auc 0.8721480030118717, valid_auc 0.8750312119190619


  3%|▎         | 8/250 [03:03<1:32:36, 22.96s/it]

Epoch 7: valid score 0.7390204845881514, train_score 0.7374604591066385, train_auc 0.8735483607871942, valid_auc 0.8778261606310922


  4%|▎         | 9/250 [04:02<1:48:21, 26.98s/it]

Epoch 8: valid score 0.7435984004052196, train_score 0.7399824668895671, train_auc 0.8756390744090293, valid_auc 0.8801622688638795


  4%|▍         | 10/250 [05:00<2:00:03, 30.01s/it]

Epoch 9: valid score 0.7479433830490428, train_score 0.7387686485596948, train_auc 0.8756517262941075, valid_auc 0.8806555142580179


  7%|▋         | 18/250 [13:02<2:48:02, 43.46s/it]

Epoch 17: valid score 0.7486857409678113, train_score 0.7416555034267215, train_auc 0.8763367082647321, valid_auc 0.8810230429611378


  8%|▊         | 20/250 [13:53<2:39:48, 41.69s/it]

Epoch 19: valid score 0.7490085803217209, train_score 0.7409321025040049, train_auc 0.8765706990553934, valid_auc 0.881487460133393


 10%|▉         | 24/250 [15:35<2:26:51, 38.99s/it]

Epoch 23: valid score 0.7495640373920736, train_score 0.7394743127238018, train_auc 0.8764240849732953, valid_auc 0.8805086486347348


 11%|█         | 28/250 [17:18<2:17:14, 37.09s/it]

Epoch 27: valid score 0.7499921302299118, train_score 0.7423946719805112, train_auc 0.8773388619194787, valid_auc 0.8816102343439828


 19%|█▉        | 48/250 [25:27<1:47:10, 31.83s/it]

Epoch 47: valid score 0.7516971396416262, train_score 0.7427171616396189, train_auc 0.8775952336831055, valid_auc 0.8811686204440564


 21%|██        | 52/250 [26:43<1:41:45, 30.83s/it]

Epoch 51: valid score 0.7553783676907857, train_score 0.7418217524413059, train_auc 0.8773859324223199, valid_auc 0.8825284331712115


100%|██████████| 250/250 [1:48:22<00:00, 26.01s/it]


In [None]:
X_train.shape, test_df.shape

In [69]:
my_best_model

MyModel(
  (fc1): Linear(in_features=89, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=4, bias=True)
  (fc4): Linear(in_features=4, out_features=1, bias=True)
)

In [68]:
my_best_model.eval()
train_pred = predict(X_train, my_best_model)
valid_pred = predict(X_valid, my_best_model)
print ('auc', roc_auc_score(y_valid.values, valid_pred, sample_weight = w_valid.values))
print ('auc', roc_auc_score(y_train.label.values, train_pred, sample_weight = w_train.values))

print( "validation score = ", scoring.rejection90(y_valid.values, valid_pred, sample_weight = w_valid.values))
print( "train score = ", scoring.rejection90(y_train.label.values, train_pred, sample_weight = w_train.values))
test_pred = predict(test_df, my_best_model)

auc 0.8825284331712115
auc 0.8773859324223199
validation score =  0.7553783676907857
train score =  0.7418217524413059


In [70]:
# test_pred = predict(test_df, model)
test_pred[:20]

array([0.9852594 , 0.9842867 , 0.9930386 , 0.97217464, 0.9774964 ,
       0.92686456, 0.9637913 , 0.98121256, 0.9723598 , 0.97658247,
       0.9820066 , 0.95516145, 0.988545  , 0.961777  , 0.99406505,
       0.9758035 , 0.9793133 , 0.98250836, 0.9863214 , 0.9896046 ],
      dtype=float32)

In [71]:
pd.DataFrame(data={"prediction":  test_pred}, index=test_df.index).to_csv(
    "../submissions/NN_90perc_32_16_4_7553.csv", index_label=utils.ID_COLUMN)