Credit to https://www.kaggle.com/shahules/stocks-rapids-nn-starter.
I made a few changes:
1. Removed cudf.
2. Scaled the NN's input features.
3. Filled the NaN values in the test data.
4. Sped up the inference.

In [None]:
import sys
# !cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/
import warnings
# Silence every warning category for the rest of the run.
warnings.filterwarnings("ignore")

## <font size='4' color='blue'><a> Imports </a></font>

In [None]:
# import cudf
import torch
import joblib
import janestreet
import numpy as np
# import cupy as cp
from time import time
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from contextlib import contextmanager
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
# from cupyx.scipy.special import erfinv as cupy_erfinv
from tqdm import tqdm
import pandas as pd
import os
import random

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator the notebook touches.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    auto-tuning disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(34)

In [None]:
# Training hyper-parameters.
EPOCHS = 10
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.00001
# Early-stopping setting read by the training loop (a falsy value disables it).
EARLY = 4
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

## <font size='4' color='blue'><a> Read Data </a></font>

In [None]:
@contextmanager
def timer(name):
    """Context manager that prints how long its ``with`` body took.

    Args:
        name: Label shown in square brackets in the printed message.

    The message is printed only when the body finishes without raising.
    """
    started_at = time()
    yield
    elapsed = time() - started_at
    print(f'[{name}] done in {elapsed:.2f} s')


In [None]:
# Load the first million training rows and the example test file, timing both reads.
with timer('load_data'):
    train = pd.read_csv(
        '../input/jane-street-market-prediction/train.csv',
        nrows=int(1e6),
    )
    test = pd.read_csv(
        "../input/jane-street-market-prediction/example_test.csv",
    )


In [None]:
# Columns excluded from the model input: everything present in train but
# absent from test, plus the id, date and weight columns.
drop_cols = [
    *np.setdiff1d(train.columns, test.columns),
    'ts_id',
    'date',
    'weight',
]
train.head(3)

## <font size='4' color='blue'><a> Dataset </a></font>

In [None]:
class janeDataset(Dataset):
    """Torch dataset serving rows of a feature table as float tensors.

    Args:
        df: Table exposing ``.values`` as a 2-D array (one row per sample).
        target: Labels exposing ``.values``; only read when ``mode`` is
            ``'train'``.
        mode: ``'train'`` to return features and label, anything else to
            return features only.
    """

    def __init__(self, df, target, mode="train"):
        self.mode = mode
        self.df = df.values
        # Labels are kept only in training mode; other modes never read them.
        if mode == 'train':
            self.target = target.values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'x': features}`` plus ``'y'`` (a 1-element tensor) in train mode."""
        sample = {'x': torch.FloatTensor(self.df[idx, :])}
        if self.mode == "train":
            sample['y'] = torch.FloatTensor([self.target[idx]])
        return sample
            
    
    

## <font size='4' color='blue'><a> Model </a></font>

In [None]:
class JaneModel(nn.Module):
    """Small feed-forward network producing one logit per input row.

    Layout: BatchNorm -> Dropout -> weight-normalised Linear -> LeakyReLU,
    repeated twice (130 -> 64 -> 16), followed by BatchNorm and a final
    weight-normalised Linear down to a single output. No sigmoid is applied
    here; ``forward`` returns raw logits.
    """

    def __init__(self):
        super().__init__()

        # Layer widths: input, first hidden, second hidden.
        self.hidden = [130, 64, 16]
        in_dim, mid_dim, last_dim = self.hidden

        # Attribute order is kept as-is: it fixes both the state-dict key
        # order and the order in which random initialisation is drawn.
        self.batch1 = nn.BatchNorm1d(in_dim)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(in_dim, mid_dim))

        self.batch2 = nn.BatchNorm1d(mid_dim)
        self.dropout2 = nn.Dropout(0.15)
        self.dense2 = nn.utils.weight_norm(nn.Linear(mid_dim, last_dim))

        self.batch3 = nn.BatchNorm1d(last_dim)
        self.dense3 = nn.utils.weight_norm(nn.Linear(last_dim, 1))

    def forward(self, x):
        """Map a ``(batch, 130)`` float tensor to ``(batch, 1)`` logits."""
        h = self.dropout1(self.batch1(x))
        h = F.leaky_relu(self.dense1(h))

        h = self.dropout2(self.batch2(h))
        h = F.leaky_relu(self.dense2(h))

        return self.dense3(self.batch3(h))
        
        
        
        

### <font size='4' ><a> Preprocess </a></font>

In [None]:
# Keep only the rows whose weight is non-zero.
train = train[train['weight'].ne(0)]
# Binary label: 1 where resp is positive, 0 otherwise.
target = train['resp'].gt(0) * 1
print(len(train))

In [None]:
import math

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised: feature_1 .. feature_129 (feature_0 is not scaled).
features = [f'feature_{i}' for i in range(1, 130)]

# An earlier, commented-out do_preprocess() helper (mean imputation plus a
# joblib-persisted scaler) used to sit here; a single StandardScaler fitted
# on the training features replaces it.
matrix = train[features]
scaler = StandardScaler()
scaler.fit(matrix)
train[features] = scaler.transform(matrix)



In [None]:
import pickle
# Persist the fitted scaler to sl.pkl.
with open('sl.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training pass over ``dataloader``.

    Args:
        model: Network to optimise; put into training mode here.
        optimizer: Optimiser stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        loss_fn: Callable taking ``(logits, targets)``.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean batch loss, mean batch ROC AUC)``.
    """
    model.train()
    loss_total = 0
    auc_total = 0

    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        x_batch = batch['x'].to(device)
        y_batch = batch['y'].to(device).squeeze()

        logits = model(x_batch).squeeze()
        loss = loss_fn(logits, y_batch)

        # AUC is measured on probabilities, detached from the autograd graph.
        probs = torch.sigmoid(logits)
        batch_auc = roc_auc_score(
            y_batch.detach().cpu().numpy(),
            probs.detach().cpu().numpy(),
        )

        loss.backward()
        optimizer.step()
        scheduler.step()

        loss_total += loss.item()
        auc_total += batch_auc

    n_batches = len(dataloader)
    return loss_total / n_batches, auc_total / n_batches

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Network to evaluate; put into eval mode here.
        loss_fn: Callable taking ``(logits, targets)``.
        dataloader: Iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean batch loss, mean batch ROC AUC, predictions)`` where
        ``predictions`` is a NumPy array of sigmoid probabilities for every
        sample, in dataloader order.
    """
    model.eval()
    final_loss = 0
    final_auc = 0
    valid_preds = []

    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs).squeeze()
            targets = targets.squeeze()
            loss = loss_fn(outputs, targets)

            # Apply the sigmoid exactly once; the same probabilities feed both
            # the AUC and the returned predictions.
            probs = torch.sigmoid(outputs).cpu().numpy()
            auc = roc_auc_score(targets.cpu().numpy(), probs)

            final_loss += loss.item()
            final_auc += auc
            valid_preds.append(probs)

    final_loss /= len(dataloader)
    final_auc /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, final_auc, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid probabilities for every batch in ``dataloader``.

    Args:
        model: Network to run; put into eval mode here.
        dataloader: Iterable of dicts with an ``'x'`` tensor.
        device: Device the batches are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    batch_probs = []

    for batch in dataloader:
        x_batch = batch['x'].to(device)

        with torch.no_grad():
            probs = torch.sigmoid(model(x_batch))

        batch_probs.append(probs.detach().cpu().numpy())

    return np.concatenate(batch_probs)

## <font size='4' color='blue'><a> Training </a></font>

In [None]:
def train_model(train,target):
    """Train a ``JaneModel`` and checkpoint the best epoch to ``jane_model.pth``.

    Args:
        train: Feature frame. NaNs are filled with -1 **in place**, so the
            caller's frame is modified.
        target: Labels aligned with ``train``.

    A random 15% of the rows is held out for validation. After every epoch
    the model is saved when the validation loss improves. When ``EARLY`` is
    truthy, training stops once ``EARLY`` consecutive epochs have failed to
    improve the validation loss.
    """
    train.fillna(-1,inplace=True)
    X_train,X_valid,y_train,y_valid  = train_test_split(train.drop(drop_cols,axis=1),target,test_size=0.15)

    train_data = janeDataset(X_train,y_train)
    valid_data = janeDataset(X_valid,y_valid)

    train_data = DataLoader(train_data,batch_size=2**12,shuffle=True)
    valid_data = DataLoader(valid_data,batch_size=2**12,shuffle=False)

    model = JaneModel()
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    # OneCycleLR is stepped once per batch inside train_fn, hence steps_per_epoch.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(train_data))
    loss_fn = nn.BCEWithLogitsLoss()
    best_loss = np.inf
    # Initialised up front so a first epoch that does not beat best_loss
    # (e.g. a NaN loss) cannot hit an unbound counter.
    stale_epochs = 0

    for epoch in range(EPOCHS):

        train_loss,train_auc = train_fn(model, optimizer, scheduler, loss_fn, train_data, DEVICE)
        valid_loss,valid_auc,_ = valid_fn(model, loss_fn, valid_data, DEVICE)
        print(f" Epoch {epoch} train loss {train_loss : .5f} valid loss {valid_loss : .5f} train_auc {train_auc: .4f} valid_auc {valid_auc : .4f}")

        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), 'jane_model.pth')
            stale_epochs = 0
        else:
            stale_epochs += 1
            if EARLY and stale_epochs >= EARLY:
                break


train_model(train,target)
    
    

## <font size='4' color='blue'><a> Inference </a></font>

In [None]:
# Rebuild the architecture and restore the weights saved during training.
model = JaneModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written from CUDA tensors still loads on a CPU-only runtime.
model.load_state_dict(torch.load("jane_model.pth", map_location=DEVICE))
model.to(DEVICE)


In [None]:
# Create the competition environment and the iterator whose items are
# unpacked as (test, sample_pred) pairs by the prediction loop.
env = janestreet.make_env() 
iter_test = env.iter_test()

In [None]:
# Model input columns, in training order: every train column except drop_cols.
cols = train.columns.drop(drop_cols)

In [None]:
model.eval()
# Probability above which the action is set to 1.
opt_th = 0.5
for (test, sample_pred) in tqdm(iter_test):

    if test['weight'].item() > 0:
        # Work on an explicit copy so the edits below never write into a
        # slice of the frame handed out by the environment.
        row = test[cols].copy()

        # Mirror the training pipeline exactly: training standardises first
        # (StandardScaler passes NaN through) and only then fills the
        # remaining NaNs with -1. Filling with 0 before scaling, as was done
        # previously, encodes missing values differently from what the
        # network saw during training.
        row[features] = scaler.transform(row[features])
        row.fillna(-1, inplace=True)

        with torch.no_grad():
            X = torch.tensor(row.values).float().to(DEVICE)
            prob = torch.sigmoid(model(X)).cpu().numpy()[0, 0]

        sample_pred.action = int(prob > opt_th)
    else:
        # Zero-weight rows are never acted on.
        sample_pred.action = 0
    env.predict(sample_pred)


## <font size='4' color='green'><a> WORK IN PROGRESS! PLEASE UPVOTE IF YOU LIKED IT </a></font>

In [None]:

# Marks the end of the run.
print('done')