In [None]:
import sys

!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import cudf, cuml
from sklearn.metrics import log_loss
import numpy as np
import cupy
from tqdm import tqdm
import pandas as pd, xgboost as xgb
from sklearn.metrics import log_loss
from tqdm import tqdm

In [None]:
def get_svm_sub(data_dir="/kaggle/input/lish-moa"):
    """Fit one GPU SVM per scored target and predict the test set.

    Reads the train/test CSVs from ``data_dir`` with cudf, keeps only the
    ``trt_cp`` training rows, and adds 12 synthetic "smoothing" rows (all
    gene/cell features 0.0, every dose/time combination, once with every
    target 0 and once with every target 1) so each target column contains
    both classes. A ``cuml.SVC`` with probability outputs is then fitted
    per target.

    Parameters
    ----------
    data_dir : str
        Directory holding ``train_features.csv``, ``train_targets_scored.csv``,
        ``train_targets_nonscored.csv`` and ``test_features.csv``.

    Returns
    -------
    cudf.DataFrame
        The test frame (with ``cp_time`` scaled and ``cp_dose`` encoded) plus
        one column per scored target holding the predicted probability of the
        positive class; rows whose ``cp_type`` is not ``trt_cp`` get 0.
    """
    df = cudf.read_csv(f"{data_dir}/train_features.csv")

    target_df = cudf.read_csv(f"{data_dir}/train_targets_scored.csv")
    targets = [col for col in target_df.columns if col != "sig_id"]

    tns_df = cudf.read_csv(f"{data_dir}/train_targets_nonscored.csv")

    # NOTE(review): the non-scored columns are never read below; the merge is
    # kept only so the training rows stay exactly those of the original inner
    # join. Drop it if every sig_id is known to be present in both files.
    df = df.merge(target_df, on="sig_id").merge(tns_df, on="sig_id")

    test_df = cudf.read_csv(f"{data_dir}/test_features.csv")

    gene_features = [col for col in df.columns if col.startswith("g-")]
    cell_features = [col for col in df.columns if col.startswith("c-")]

    for data in (df, test_df):
        # Scale cp_time by 72 and encode the dose as 1.0 for "D1", else 0.0.
        data["cp_time"] = data["cp_time"]/72
        data["cp_dose"] = 1.0*(data["cp_dose"] == "D1")

    # Six dose/time combinations with every target set to 0 ...
    smooth_negative = cudf.DataFrame()
    smooth_negative["cp_dose"] = [0.0, 1.0]*3
    smooth_negative["cp_time"] = [0.33, 0.66, 1.0]*2
    for t in targets:
        smooth_negative[t] = 0

    # ... and the same six combinations with every target set to 1.
    smooth_positive = smooth_negative.copy()
    for t in targets:
        smooth_positive[t] = 1

    # cudf.concat replaces DataFrame.append, which later cudf releases
    # deprecate; columns missing from one side are filled with nulls.
    smooth_df = cudf.concat([smooth_negative, smooth_positive])
    for f in gene_features + cell_features:
        smooth_df[f] = 0.0

    df = df[df["cp_type"] == "trt_cp"]
    df = cudf.concat([df, smooth_df])

    features = ["cp_time", "cp_dose"] + gene_features + cell_features

    # The feature selections do not change between targets, so build them once.
    X_train = df[features]
    X_test = test_df[features]

    y_test = np.zeros((test_df.shape[0], len(targets)))
    # Host-side 0/1 mask: predictions for non-"trt_cp" test rows are zeroed.
    test_filter = cupy.asnumpy((test_df["cp_type"] == "trt_cp").values)

    for i, target in enumerate(tqdm(targets)):
        svc_model = cuml.SVC(C=100.0, cache_size=3000.0, probability=True)
        svc_model.fit(X_train, df[target])

        # Column 1 of predict_proba is the probability of the positive class.
        positive_proba = cupy.asnumpy(svc_model.predict_proba(X_test).values)[:, 1]
        y_test[:, i] = test_filter*positive_proba

        test_df[target] = y_test[:, i]

    return test_df
        
# Train the per-target SVMs; the result is the cudf test frame with one
# prediction column per scored target.
svm_sub_df = get_svm_sub()

In [None]:
def get_xgb_sub(data_dir="/kaggle/input/lish-moa",
                best_iter_path="/kaggle/input/moa-xgb-params/xgb_res.csv"):
    """Fit one GPU XGBoost classifier per scored target and predict the test set.

    Gene and cell features are replaced by their within-row ranks (computed
    separately for the two groups). Every ``trt_cp`` training row is used
    twice: once with its real labels (weight 0.999) and once with all labels
    flipped (weight 0.001). The number of trees per target comes from the
    ``best_iter`` column of ``best_iter_path`` (+5); zero entries are replaced
    by the smallest non-zero value.

    Parameters
    ----------
    data_dir : str
        Directory holding ``train_features.csv``, ``train_targets_scored.csv``,
        ``train_targets_nonscored.csv`` and ``test_features.csv``.
    best_iter_path : str
        CSV with a ``target`` column and a ``best_iter`` column.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The transformed test frame with one prediction column per scored
        target appended (rows whose ``cp_type`` is not ``trt_cp`` get 0), and
        the list of scored target names.

    Raises
    ------
    KeyError
        If a scored target (or ``"acat_inhibitor"``, used in the sanity print)
        is missing from ``best_iter_path``.
    """
    df = pd.read_csv(f"{data_dir}/train_features.csv")

    target_df = pd.read_csv(f"{data_dir}/train_targets_scored.csv")
    targets = [col for col in target_df.columns if col != "sig_id"]

    tns_df = pd.read_csv(f"{data_dir}/train_targets_nonscored.csv")

    # NOTE(review): the non-scored columns are never read below; the merge is
    # kept only so the training rows stay exactly those of the original inner
    # join.
    df = df.merge(target_df, on="sig_id").merge(tns_df, on="sig_id")

    test_df = pd.read_csv(f"{data_dir}/test_features.csv")

    gene_features = [col for col in df.columns if col.startswith("g-")]
    cell_features = [col for col in df.columns if col.startswith("c-")]

    for data in (df, test_df):
        # Scale cp_time by 72 and encode the dose as 1.0 for "D1", else 0.0.
        data["cp_time"] = data["cp_time"]/72
        data["cp_dose"] = 1.0*(data["cp_dose"] == "D1")

    # .copy() makes the filtered frame independent of the merged one, so the
    # column assignments below do not trigger SettingWithCopyWarning.
    df = df[df["cp_type"] == "trt_cp"].copy()

    # Replace raw values by their rank within each row, per feature group.
    for group in (cell_features, gene_features):
        df[group] = df[group].rank(axis=1)
        test_df[group] = test_df[group].rank(axis=1)

    # Label smoothing through sample weights: real labels carry 0.999,
    # a copy with every label flipped carries 0.001.
    df["w"] = 0.999
    flipped = df.copy()
    flipped[targets] = 1 - flipped[targets]
    flipped["w"] = 0.001
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df, flipped])

    # n_estimators is supplied per target further down.
    params = {"objective": "binary:logistic",
              "learning_rate": 0.02,
              "max_depth": 4,
              "min_child_weight": 5,
              "colsample_bytree": 0.5,
              "tree_method": "gpu_hist", "gpu_id": 0}

    best_iter = pd.read_csv(best_iter_path).set_index("target")["best_iter"]
    print(best_iter.min())

    # Treat 0 as "unknown" and fall back to the smallest non-zero value.
    best_iter = best_iter.mask(best_iter == 0)
    best_iter = best_iter.fillna(best_iter.min())
    print(best_iter.min())

    n_estimators = best_iter.to_dict()
    print(len(n_estimators), n_estimators["acat_inhibitor"])

    features = gene_features + cell_features

    # The feature selections do not change between targets, so build them once.
    X_train = df[features]
    X_test = test_df[features]

    y_test = np.zeros((test_df.shape[0], len(targets)))
    # 0/1 mask: predictions for non-"trt_cp" test rows are zeroed.
    test_filter = (test_df["cp_type"] == "trt_cp").values

    for i, target in enumerate(tqdm(targets)):
        # Constant starting margin shared by fit and predict.
        # NOTE(review): this is log(mean(w*y)) over the doubled frame, not the
        # logit of the weighted positive rate - left unchanged because the
        # tuned best_iter values may depend on it; confirm intent.
        base_pred = np.log((df["w"]*df[target]).mean())

        xgb_model = xgb.XGBClassifier(**params, n_estimators=int(n_estimators[target] + 5))

        xgb_model.fit(X_train, df[target], sample_weight=df["w"],
                      base_margin=np.ones(df.shape[0])*base_pred)

        # Column 1 of predict_proba is the probability of the positive class.
        y_test[:, i] = test_filter*xgb_model.predict_proba(
            X_test, base_margin=np.ones(test_df.shape[0])*base_pred)[:, 1]

    # Attach all prediction columns in one step instead of inserting them one
    # by one, which fragments the frame.
    predictions = pd.DataFrame(y_test, columns=targets, index=test_df.index)
    test_df = pd.concat([test_df, predictions], axis=1)

    return test_df, targets

# Train the per-target XGBoost models; returns the pandas test frame with one
# prediction column per scored target, plus the list of target names.
xgb_sub_df, targets = get_xgb_sub()

In [None]:
# Weighted blend of the two models' predictions, stored in dfs[0].
# The XGBoost frame is copied first so that re-running this cell starts from
# the untouched xgb_sub_df instead of re-blending an already blended frame.
dfs = [xgb_sub_df.copy(), svm_sub_df.to_pandas()]
W = [0.6, 0.4]  # blend weights, in the same order as dfs (XGBoost, SVM)

for t in targets:
    dfs[0][t] = W[0]*dfs[0][t] + W[1]*dfs[1][t]

In [None]:
# Write the blended predictions: the id column followed by every scored target.
submission_columns = ["sig_id"] + targets
dfs[0].to_csv("submission.csv", index=False, columns=submission_columns)