This notebook is inspired by [Yirun Zhang's great rapids svm kernel](https://www.kaggle.com/gogo827jz/rapids-svm-on-gpu-6000-models-in-1-hour).

We go a step further and use RAPIDS libraries exclusively for the entire pipeline, including data preprocessing, training and scoring. To make the point, we don't even import `numpy`, `pandas` or `sklearn`. We use simple `LogisticRegression` models, and **in less than 10 minutes** the notebook gets a better score than the previous best RAPIDS demo.

In [None]:
import warnings, sys
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Thanks to Chris's RAPIDS dataset, it only takes around 1 min to install offline
# Copy the packed environment next to the other conda envs and unpack it there
# (tar's verbose listing is discarded).
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so they are searched
# before the kernel's own packages. Each line prepends, so the final search order
# is lib, lib/python3.7, lib/python3.7/site-packages.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the base conda lib directory
# (presumably so the shared library can be found at load time - TODO confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import cudf as gd
import cupy as cp
from cuml.svm import SVC
from cuml.preprocessing import LabelEncoder
from cuml.linear_model import LogisticRegression
from numba import cuda
from cuml.metrics import log_loss, roc_auc_score
from tqdm import tqdm

In [None]:
# Directory that holds the CSV files read by the following cells.
path = '../input/lish-moa'

In [None]:
%%time
# Read the train and test feature tables with cudf.
train = gd.read_csv(f'{path}/train_features.csv')
test = gd.read_csv(f'{path}/test_features.csv')
# Every column except the first one is kept as a model feature
# (the first column is presumably the `sig_id` row identifier - see the merge on `sig_id` later).
fea_cols = train.columns.values[1:]
print(train.shape, test.shape)
train.head()

In [None]:
%%time

# Integer-encode `cp_type`. The encoder is fitted on train only and reused for
# test so that both frames share the same category -> code mapping.
lbl = LabelEncoder()
train['cp_type'] = lbl.fit_transform(train['cp_type'])
test['cp_type'] = lbl.transform(test['cp_type'])
# A bare expression in the middle of a cell is evaluated and thrown away by the
# notebook, so the counts have to be printed explicitly to be visible.
print(train['cp_type'].value_counts())
print('0 means control group')

In [None]:
%%time

# Show the distribution of cp_time in train, then in test.
for frame in (train, test):
    print(frame['cp_time'].value_counts())
# Rescale cp_time in place as value/24 - 2 (so a value of 48 becomes 0).
for frame in (train, test):
    frame['cp_time'] = frame['cp_time']/24 - 2

In [None]:
%%time

# Integer-encode `cp_dose` with an encoder fitted on train and applied
# unchanged to test; the last line displays the resulting code counts.
dose_encoder = LabelEncoder()
train['cp_dose'] = dose_encoder.fit_transform(train['cp_dose'])
test['cp_dose'] = dose_encoder.transform(test['cp_dose'])
train['cp_dose'].value_counts()

In [None]:
%%time

# Sanity check: report every column that has missing values in train or test.
# Nothing is printed when the data is complete.

for col in train.columns:
    n_missing = train[col].isnull().sum() + test[col].isnull().sum()
    if n_missing:
        print(col, n_missing)

In [None]:
%%time

# Standardize every column from position 4 onwards. The mean and standard
# deviation come from train and are applied to both train and test.

for col in train.columns[4:]:
    column = train[col]
    mu, sigma = column.mean(), column.std()
    train[col] = (column - mu)/sigma
    test[col] = (test[col] - mu)/sigma
train.head()

In [None]:
%%time

# Read the scored targets. Later cells join this table to `train` on `sig_id`
# and treat every column after the first one as a target to predict.
train_targets = gd.read_csv(f'{path}/train_targets_scored.csv')
print(train_targets.shape)
train_targets.head()

In [None]:
%%time

# Attach the target columns to the feature rows by `sig_id`; the left join keeps
# every row of `train`.
# NOTE(review): a cudf merge may not preserve the original row order - confirm.
# The feature matrix and the targets are both taken from this merged frame
# afterwards, so they stay aligned with each other either way.
train = train.merge(train_targets, on='sig_id', how='left')
print(train.shape)
train.head()

In [None]:
%%time

# confirm control group has all 0 targets

mask = train.cp_type == 0  # rows whose encoded cp_type is 0 (the control group)
tcols = train_targets.columns[1:].values  # every target column, skipping the first column
ycontrol = train.loc[mask, tcols].values
# Displayed by the notebook: a maximum of 0 means no control row has a positive target.
ycontrol.max()

In [None]:
%%time
# Shape before filtering, for comparison with the shape displayed at the end.
print(train.shape)
# Keep only the rows whose encoded cp_type is greater than 0, i.e. drop the control group.
train = train.loc[train.cp_type>0]
train.shape

In [None]:
%%time

# Extract the feature columns of train and test as plain arrays; `fea_cols`
# keeps the same column order for both, so the two matrices are column-aligned.
X = train[fea_cols].values
Xt = test[fea_cols].values

In [None]:
class StratifiedKFold_gpu:
    """Stratified K-fold splitter built on cudf, cupy and numba.cuda.

    Rows are grouped by label and numbered inside their group; a row whose
    in-group number is ``k`` lands in the test part of fold ``k % n_splits``.
    Every label value is therefore spread across the folds as evenly as its
    group size allows.

    Args:
        n_splits: number of folds.
        shuffle: shuffle the row order before folds are assigned.
        random_state: seed passed to ``cp.random.seed`` when ``shuffle`` is on.
        tpb: threads per thread block used by the grouped kernel.
        mode: ``'sklearn'`` asserts that every class has at least ``n_splits``
            members; any other value (default ``'relax'``) skips that check.
    """

    def __init__(self,n_splits=3,shuffle=True,random_state=42,tpb=32,mode='relax'):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.seed = random_state
        self.tpb = tpb # threads per thread block
        self.mode = mode

    def get_n_splits(self, X=None, y=None):
        """Return the configured number of folds; ``X`` and ``y`` are ignored."""
        return self.n_splits

    def split(self,x,y):
        """Yield ``(train_ids, test_ids)`` index arrays, one pair per fold.

        Args:
            x: feature array; only ``x.shape[0]`` is read.
            y: label array with one entry per row of ``x``.

        Yields:
            Two arrays of row positions into the original ``x`` / ``y``.
            Iteration stops early at the first fold whose test part would be
            empty, so fewer than ``n_splits`` pairs may be produced.

        Raises:
            AssertionError: if ``x`` and ``y`` differ in length, or if
                ``mode == 'sklearn'`` and a class has fewer than ``n_splits``
                members.
        """
        assert x.shape[0] == y.shape[0]
        ids = cp.arange(x.shape[0])

        if self.shuffle:
            # NOTE: this reseeds cupy's global random generator.
            cp.random.seed(self.seed)
            cp.random.shuffle(ids)
            # Only the labels need to follow the permutation: the folds are
            # reported as row ids, so the feature matrix is never reordered.
            y = y[ids]

        df = gd.DataFrame()
        df['y'] = y
        df['ids'] = ids

        grpby = df.groupby(['y'])
        if self.mode == 'sklearn':
            dg = grpby.agg({'y':'count'})
            col = dg.columns[0]
            msg = 'n_splits=%d cannot be greater than the number of members in each class.'%self.n_splits
            assert dg[col].min()>=self.n_splits,msg

        def get_order_in_group(y,ids,order):
            # Run once per group by apply_grouped: the threads of the block
            # stride over the group's rows and store each row's position
            # inside the group.
            for i in range(cuda.threadIdx.x, len(y), cuda.blockDim.x):
                order[i] = i

        got = grpby.apply_grouped(get_order_in_group,incols=['y','ids'],
                                  outcols={'order': 'int32'},
                                  tpb=self.tpb)

        # Sort by row id so the index arrays come out in ascending order.
        got = got.sort_values('ids')

        for i in range(self.n_splits):
            mask = got['order']%self.n_splits==i
            train_ids = got.loc[~mask,'ids'].values
            test_ids = got.loc[mask,'ids'].values
            if len(test_ids)==0:
                break
            yield train_ids,test_ids

In [None]:
%%time

def calibrate(y, mean, eps = 1e-4):
    """Shift predictions so they average ``mean``, then clip them away from 0 and 1.

    Args:
        y: array of predicted probabilities.
        mean: value the shifted predictions should average to (before clipping).
        eps: clipping margin; replaced by the current average of ``y`` when
            that average is smaller.

    Returns:
        The shifted array clipped to ``[eps, 1 - eps]``.
    """
    current_mean = y.mean()
    margin = min(eps, current_mean)
    shifted = y - current_mean + mean
    return cp.clip(shifted, margin, 1-margin)
    
def cv(X, y, Xt, folds = 4):
    """Cross-validate a logistic regression for one target and predict the test set.

    For each fold a ``LogisticRegression(C=0.01)`` is fitted on the training
    part. Its positive-class probabilities for the held-out part and for
    ``Xt`` are passed through ``calibrate`` with the training-part label mean.

    Args:
        X: training feature array.
        y: training labels, one per row of ``X``.
        Xt: test feature array to predict.
        folds: number of stratified folds requested.

    Returns:
        ``(score, ysub)``: the log loss averaged over the folds that were run,
        and the test predictions averaged over the same folds.

    Raises:
        ValueError: if the splitter produced no fold at all.
    """
    skf = StratifiedKFold_gpu(n_splits=folds)

    scores = 0
    ysub = 0
    n_done = 0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Label mean of the training part; both prediction sets are shifted to it.
        prior = y_train.mean()

        model = LogisticRegression(C=0.01)
        model.fit(X_train, y_train)

        yp = model.predict_proba(X_test)[:,1]
        yp = cp.asarray(yp, order='C')
        yp = calibrate(yp, prior)

        yps = model.predict_proba(Xt)[:,1]
        yps = cp.asarray(yps, order='C')
        ysub += calibrate(yps, prior)

        scores += log_loss(y_test, yp)
        n_done += 1

    if n_done == 0:
        raise ValueError('cross-validation produced no folds')
    # The splitter stops at the first empty test fold, so it can yield fewer
    # folds than requested; average over the folds that actually ran.
    return scores/n_done, ysub/n_done

In [None]:
%%time

scores = []
mean_scores = []
targets = train_targets.columns.values[1:]

# The submission starts with the id column plus cp_type, which a later cell
# uses to locate control rows; one prediction column is added per target.
sub = test[['sig_id', 'cp_type']]

for ycol in tqdm(targets, total=len(targets)):
    y = train[ycol].values
    # Baseline: log loss of predicting the target's mean for every row.
    prior = y.mean()
    mean_score = log_loss(y, cp.ones_like(y)*prior)
    try:
        score, ysub = cv(X, y, Xt, folds=8)
    except Exception as err:
        # Best effort: if this target cannot be modelled, predict its mean.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the loop, and the message names the failed target.
        tqdm.write(f'{ycol}: cv failed ({err!r}); falling back to the target mean')
        ysub = cp.ones(Xt.shape[0])*prior
        score = mean_score
    # NOTE: the model prediction is used even when it scores worse than the baseline.
    sub[ycol] = ysub

    scores.append(score)
    mean_scores.append(mean_score)

scores = gd.DataFrame({'model_score': scores, 'target': targets, 'mean_score':mean_scores})
# Per-target minimum of the model loss and the baseline loss.
scores['best'] = cp.minimum(scores['model_score'].values, scores['mean_score'].values)

scores = scores.sort_values(by='model_score', ascending=False)
print(f"best:{scores['best'].mean():.5f}, mean:{scores['mean_score'].mean():.5f}, model:{scores['model_score'].mean():.5f}")
scores.head()

In [None]:
%%time

# Rows with cp_type == 0 get every target prediction overwritten with 0.
mask = sub['cp_type'] == 0
control_sum = mask.sum()
print(sub.shape[0], control_sum)

for col in targets:
    before = sub[col].values.copy()
    sub.loc[mask, col] = 0
    n_changed = (before != sub[col].values).sum()
    # Exactly the control rows must have changed value.
    assert n_changed == control_sum

In [None]:
# cp_type is dropped here, leaving the remaining columns for the output file.
sub = sub.drop('cp_type', axis=1)
sub.head()

In [None]:
# Write the predictions to disk without the index column.
sub.to_csv('submission.csv', index=False)