In [None]:
import cudf

# Read the training parquet straight into a cuDF DataFrame, then show its
# shape and the first rows as a quick sanity check.
TRAIN_PARQUET = "../input/ubiquant-parquet/train_low_mem.parquet"

df = cudf.read_parquet(TRAIN_PARQUET)
print(df.shape)
df.head()

In [None]:
# Largest time_id present, shown next to the frame's shape.
latest_time_id = df["time_id"].max()
latest_time_id, df.shape

In [None]:
# Rows with time_id <= DROP_BEFORE are discarded; only later rows are kept.
DROP_BEFORE = 950

recent_rows = df["time_id"] > DROP_BEFORE
df = df[recent_rows].reset_index(drop=True)
df.shape

In [None]:
import cupy
import cuml

print("cuML version:", cuml.__version__)

# Time-based cross-validation. For fold i:
#   train = every row with time_id <= START + i*WINDOW
#   val   = rows in the next WINDOW time_ids, (START + i*WINDOW, START + (i+1)*WINDOW]
# so the training set grows from one fold to the next.
WINDOW = 20
START = 1100
N_SPLITS = 6

# The time_id array does not change inside the loop, so fetch it once.
time_ids = df["time_id"].values

cv = []
for i in range(N_SPLITS):
    train_end = START + i*WINDOW
    val_end = START + (i+1)*WINDOW

    train_ind = cupy.where(time_ids <= train_end)[0]
    val_ind = cupy.where((time_ids > train_end) & (time_ids <= val_end))[0]

    # Store host (NumPy) copies of the positional indices for later .iloc use.
    cv.append((cupy.asnumpy(train_ind), cupy.asnumpy(val_ind)))
    print(len(train_ind), len(val_ind))

In [None]:
# Columns of `df` that are never used directly as model inputs.
# "investment_te" is listed here on purpose: RAPIDSModel.fit() writes that
# column into the frame it is given, so once the model has been fitted on `df`
# the column already exists in df.columns. Excluding it keeps this cell
# idempotent -- re-running it cannot put "investment_te" into `features` twice.
NON_FEATURE_COLS = {"row_id", "target", "investment_id", "time_id", "investment_te"}

features = [col for col in df.columns if col not in NON_FEATURE_COLS]
# The target-encoded investment_id is created by the model at fit/predict time.
features.append("investment_te")
len(features)

In [None]:
class RAPIDSModel:
    """Blend of a cuML random forest and a cuML SVR over a target-encoded ``investment_id``.

    The feature columns are taken from the module-level ``features`` list,
    which must be defined before ``fit`` / ``predict`` are called.
    """

    def __init__(self):
        # Encoder that turns investment_id into the "investment_te" feature.
        self.te = cuml.preprocessing.TargetEncoder()
        self.rf = cuml.ensemble.RandomForestRegressor(
            n_estimators=256,
            split_criterion="mse",
            bootstrap=True,
            max_samples=0.6,
            min_samples_leaf=64,
            max_features=0.6,
            n_bins=512,
        )
        self.svr = cuml.SVR(C=0.1, epsilon=0.4)

    def calculate_sample_weight(self, train_df):
        """Return a new frame, sorted by (time_id, investment_id), with weight columns added.

        Added columns:
          * ``target_mean`` / ``target_std`` -- per-``time_id`` mean and std of ``target``
          * ``norm_target`` -- ``(target - target_mean) / target_std``
          * ``sw`` -- ``(|norm_target| + 1) / 2``

        The input frame itself is not modified (merge returns a new frame).
        """
        target_by_time = train_df.groupby("time_id")["target"]
        mean_per_time = target_by_time.mean().reset_index().rename(columns={"target": "target_mean"})
        std_per_time = target_by_time.std().reset_index().rename(columns={"target": "target_std"})

        weighted = train_df.merge(mean_per_time, on="time_id", how="left")
        weighted = weighted.merge(std_per_time, on="time_id", how="left")
        weighted = weighted.sort_values(["time_id", "investment_id"]).reset_index(drop=True)

        centred = weighted["target"] - weighted["target_mean"]
        weighted["norm_target"] = centred/weighted["target_std"]
        weighted["sw"] = (weighted["norm_target"].abs() + 1)/2

        return weighted

    def fit(self, train_df):
        """Fit the encoder, the SVR (with sample weights) and the random forest.

        Note: writes an ``investment_te`` column into ``train_df`` in place.
        Returns ``self`` so construction and fitting can be chained.
        """
        encoded_ids = self.te.fit_transform(train_df["investment_id"], train_df["target"])
        train_df["investment_te"] = encoded_ids.astype("float32")

        weighted = self.calculate_sample_weight(train_df)
        X, y = weighted[features], weighted["target"]

        # Only the SVR receives the per-row weights; the forest is fit unweighted.
        self.svr.fit(X, y, sample_weight=weighted["sw"])
        self.rf.fit(X, y)

        return self

    def predict(self, test_df):
        """Return ``0.7 * random-forest prediction + 0.3 * SVR prediction`` for ``test_df``.

        Note: writes an ``investment_te`` column into ``test_df`` in place.
        """
        encoded_ids = self.te.transform(test_df["investment_id"]).astype("float32")
        # .get() copies the encoded values to host memory before assignment --
        # presumably so this also works when test_df is a host (pandas) frame; TODO confirm.
        test_df["investment_te"] = encoded_ids.get()

        X = test_df[features]
        forest_pred = self.rf.predict(X)
        svr_pred = self.svr.predict(X)
        return 0.7*forest_pred + 0.3*svr_pred

In [None]:
from tqdm import tqdm


def evaluate(val_df):
    """Average, over every distinct ``time_id``, of the target/pred correlation.

    ``val_df`` must have ``time_id``, ``target`` and ``pred`` columns. The
    correlation is ``Series.corr`` with its default settings, computed
    separately on the rows of each ``time_id``. Returns the result of
    ``cupy.mean`` (callers use ``.item()`` to get a Python float).
    """
    def _corr_at(time_id):
        # Correlation restricted to the rows of a single time_id.
        rows = val_df[val_df["time_id"] == time_id]
        return rows["target"].corr(rows["pred"])

    distinct_time_ids = val_df["time_id"].unique().values_host
    per_time_corr = [_corr_at(time_id) for time_id in distinct_time_ids]

    return cupy.mean(cupy.array(per_time_corr))


# One score per CV fold: fit a fresh model on the fold's training rows, predict
# the validation rows and record the mean per-time_id correlation.
val_scores = []

for f, (train_ind, val_ind) in tqdm(enumerate(cv), total=len(cv)):
    train_df = df.iloc[train_ind]
    val_df = df.iloc[val_ind]

    model = RAPIDSModel().fit(train_df)

    y_pred = model.predict(val_df)
    # Assign the raw values so the predictions are attached positionally.
    val_df["pred"] = y_pred.values

    fold_score = evaluate(val_df)
    val_scores.append(fold_score.item())

val_scores = cupy.array(val_scores)

In [None]:
# Summary of the per-fold validation scores.
mean_score = cupy.mean(val_scores)
std_score = cupy.std(val_scores)

print("Validation scores:", val_scores)
print("Mean:", mean_score)
print("STD:", std_score)

In [None]:
# Final model for submission, trained on every remaining row of `df`.
# Note: fit() writes an "investment_te" column into `df` itself.
model = RAPIDSModel()
model.fit(df)

In [None]:
import ubiquant
# Competition inference loop: each iteration yields a test batch and the
# matching prediction template; fill in `target` and hand it back to the env.
env = ubiquant.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    batch_pred = model.predict(test_df)
    sample_prediction_df["target"] = batch_pred
    env.predict(sample_prediction_df)