# In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from argparse import Namespace
from collections import defaultdict

import cudf
import cupy as cp
from cuml.ensemble import RandomForestRegressor


# Seed used for NumPy's global random number generator below.
SEED = 1111

np.random.seed(SEED)


import warnings
# NOTE(review): this hides every warning category, including deprecation
# notices from the data/GPU libraries; narrow the filter if they matter.
warnings.filterwarnings('ignore')
# Use the fully-qualified option name. The abbreviated 'max_columns' pattern
# is matched as a regex against all option keys and is ambiguous (raising
# OptionError) on pandas releases that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed the stdlib and NumPy global RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed`` and
            stored, as a string, in the ``PYTHONHASHSEED`` environment
            variable.

    NOTE(review): ``PYTHONHASHSEED`` is read when an interpreter starts, so
    setting it here presumably only influences child processes, not the
    hashing of the process that is already running - confirm if relied upon.
    """
    # Order is kept: stdlib RNG first, then NumPy, then the environment.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})
    
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Per column:

    * ``object`` columns are converted to ``category``.
    * Signed integer columns are cast to the smallest of int8/16/32/64 whose
      range contains the column's min and max (bounds inclusive).
    * Unsigned integer columns are cast to the smallest of uint8/16/32/64
      in the same way, so they stay integers.
    * Float columns are cast to float16 (only when ``use_float16`` is true)
      or float32 when the column's min and max fit; otherwise they are left
      unchanged (e.g. values beyond float32 range, infinities, all-NaN).
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    The range check only looks at min/max, so float downcasting is lossy:
    float16 keeps roughly three significant decimal digits. Pass
    ``use_float16=False`` when that precision matters (targets, statistics
    computed on the result).

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: Allow float16 as a target dtype. Defaults to True, which
            matches the historical behaviour of this helper. When False,
            float columns that fit are cast to float32 instead.

    Returns:
        The same ``df`` object, after modification.

    Side effects:
        Prints the memory usage before and after, and the relative saving.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed/unsigned/float columns are downcast. Bool,
        # datetime, categorical and extension dtypes have no meaningful
        # numeric range here and used to crash or be turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        else:
            candidates, limits = signed_ints, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        # Candidates are ordered narrowest first; take the first that holds
        # the whole value range. NaN min/max (empty or all-NaN column) and
        # infinities fail every comparison, leaving the column unchanged.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


# Directory of the parquet copy of the training data. The pickle below is
# what is actually loaded; the parquet alternative would be
# pd.read_parquet(data_path.joinpath("train_low_mem.parquet")).
data_path = Path("../input/ubiquant-parquet/")
# NOTE(review): not referenced anywhere in the visible part of this file.
samples = 2500000

# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a source you trust.
train = pd.read_pickle("../input/ump-train-picklefile/train.pkl")
# Explicit raise instead of `assert` so the check survives `python -O`.
if train.isnull().any().sum() != 0:
    raise ValueError("null exists.")

# Model inputs: the investment id plus every column whose name contains "f_".
test_cols = ['investment_id']
featuresxx = [c for c in train.columns if "f_" in c]
features = test_cols + featuresxx
print(len(features))

train = reduce_mem_usage(train)
gc.collect()

# No fillna step here: the frame was verified to contain no nulls above, and
# downcasting finite values does not introduce any, so imputing with the
# column means would only add a full-frame mean computation.
X_train = train.loc[:, features]
y_train = train['target']

# Reuse the module-level SEED instead of repeating the literal.
model = RandomForestRegressor(n_estimators=50, max_depth=9, random_state=SEED)
model.fit(cudf.from_pandas(X_train), cudf.from_pandas(y_train))


import ubiquant
# Create the competition environment and its test-batch iterator.
# NOTE(review): presumably the environment can only be created/iterated once
# per session - confirm against the competition API before re-running.
env = ubiquant.make_env()  
iter_test = env.iter_test()

# Each iteration yields a pair: the test rows for the batch and the frame
# whose 'target' column has to be filled in.
for (test_df, sample_prediction_df) in iter_test:
    
    # Select the same columns, in the same order, that the model was fit on.
    x_tt = test_df.loc[:, features]
    
    # Predictions are cast to float32 before being stored in the submission
    # frame for this batch.
    
    sample_prediction_df['target'] = model.predict(x_tt).astype('float32')
    # Hand the filled-in frame back to the environment (presumably this
    # registers the batch's predictions - verify against the API docs).
    env.predict(sample_prediction_df) 
    # Notebook-only: render the frame for this batch in the cell output.
    display(sample_prediction_df)