In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import clone

import time
from functools import reduce
from operator import mul
import os
import glob
import math
import logging
# from utils.logging import log_and_warn


import argparse
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the order-book and trade data of stock 0 and keep a single time bucket
# (time_id == 5) as a small worked example.
stock_id = '0'
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
# .copy() detaches the filtered rows from the full frame, so the column
# assignments below (and in the following cells) write to an independent
# DataFrame instead of a slice of the loaded data (SettingWithCopyWarning).
book_example = book_example[book_example['time_id'] == 5].copy()
book_example['stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id'] == 5].copy()
trade_example['stock_id'] = stock_id

In [None]:
# Weighted average price of the best book level: each side's price is weighted
# by the size quoted on the opposite side.
book_example = book_example.assign(
    wap=lambda book: (
        book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']
    ) / (book['bid_size1'] + book['ask_size1'])
)

In [None]:
def log_return(list_stock_prices):
    """
    First difference of the natural logarithm of a price series
    :param list_stock_prices:           prices; the result of `np.log` on it must offer `.diff()` (e.g. a Series)
    :return:                            log returns, with NaN in the first position (no previous price)
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


In [None]:
# Log returns of the WAP; the first row has no previous price, so its NaN
# return is removed before any aggregation.
book_example['log_return'] = log_return(book_example['wap'])
book_example = book_example.dropna(subset=['log_return'])
book_example.head()

In [None]:
def realized_volatility(series_log_return):
    """
    Square root of the sum of squared log returns
    :param series_log_return:           log returns (Series or array)
    :return:                            realized volatility as a scalar
    """
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

# Realized volatility of the example bucket.
realized_vol = book_example['log_return'].pipe(realized_volatility)
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

In [None]:

# One entry per stock partition of the training order book. glob returns paths
# in arbitrary, filesystem-dependent order, so sort for reproducible processing.
list_order_book_file_train = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
)

In [None]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """
    Computes the realized volatility of every time bucket of one stock's order book
    :param file_path:                   path of a per-stock book partition; the stock id is the text after the
                                        last `=` (e.g. `.../stock_id=0`)
    :param prediction_column_name:      name of the output column holding the realized volatility
    :return:                            DataFrame with columns `row_id` (`<stock_id>-<time_id>`) and
                                        `prediction_column_name`
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])
    # transform() returns a result aligned with the original row index on every pandas version;
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0, which breaks the
    # column assignment.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # the first row of every time bucket has no previous price, hence a NaN return
    df_book_data = df_book_data[df_book_data['log_return'].notna()]
    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )
    # take the text after the LAST '=' so that an '=' in a parent directory cannot shift the result
    stock_id = file_path.rsplit('=', 1)[-1]
    df_realized_vol_per_stock['row_id'] = f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """
    Computes the per-time-bucket realized volatility for several stocks and stacks the results
    :param list_file:                   iterable of per-stock book partition paths
    :param prediction_column_name:      name of the output column holding the realized volatility
    :return:                            concatenation of the per-stock frames (empty DataFrame for no files)
    """
    # Collect first and concatenate once: concatenating inside the loop re-copies
    # all previously accumulated rows on every iteration.
    per_stock_frames = [
        realized_volatility_per_time_id(file, prediction_column_name) for file in list_file
    ]
    if not per_stock_frames:
        # pd.concat raises on an empty list; keep returning an empty frame in that case
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

# df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
#                                                            prediction_column_name='pred')
# df_past_realized_train

In [None]:
# Competition targets; the stock_id, time_id and target columns are used below.
train = pd.read_csv(filepath_or_buffer='../input/optiver-realized-volatility-prediction/train.csv')
train.sample(n=10)

In [None]:
# Key every target by `<stock_id>-<time_id>` (the format produced by the
# order-book aggregation) and keep only the key and the target.
train = train.assign(
    row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
)[['row_id', 'target']]
# df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')
train.sample(10)

In [None]:
# df_joined

In [None]:
# from sklearn.metrics import r2_score
# def rmspe(y_true, y_pred):
#     return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
# R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
# RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
# print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
def data_prep(list_order_book_file):
    """
    Builds the model input from order-book partitions: one realized-volatility value per stock / time bucket
    :param list_order_book_file:        iterable of per-stock book partition paths; the stock id is the text
                                        after the last `=` of each path (e.g. `.../stock_id=0`)
    :return:                            DataFrame with columns `row_id` (`<stock_id>-<time_id>`) and
                                        `log_return` (the realized volatility of the bucket's WAP log returns);
                                        an empty frame with these two columns when no files are given
    """
    per_stock_frames = []
    for file in list_order_book_file:
        df_book_data = pd.read_parquet(file)
        df_book_data['wap'] = (
            df_book_data['bid_price1'] * df_book_data['ask_size1']
            + df_book_data['ask_price1'] * df_book_data['bid_size1']
        ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])
        # transform() returns a result aligned with the original row index on every pandas version;
        # groupby(...).apply() prepends the group key to the index on pandas >= 2.0, which breaks the
        # column assignment.
        df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
        # the first row of every time bucket has no previous price, hence a NaN return
        df_book_data = df_book_data[df_book_data['log_return'].notna()]
        df_realized_vol_per_stock = (
            df_book_data.groupby('time_id')['log_return'].agg(realized_volatility).reset_index()
        )
        # take the text after the LAST '=' so that an '=' in a parent directory cannot shift the result
        stock_id = file.rsplit('=', 1)[-1]
        df_realized_vol_per_stock['row_id'] = f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
        per_stock_frames.append(df_realized_vol_per_stock[['row_id', 'log_return']])
    if not per_stock_frames:
        # No partitions (e.g. the glob matched nothing): return a typed empty frame so that
        # callers can still select and merge on the expected columns.
        return pd.DataFrame({
            'row_id': pd.Series(dtype='object'),
            'log_return': pd.Series(dtype='float64'),
        })
    # Single concatenation instead of growing a frame inside the loop (which is quadratic).
    return pd.concat(per_stock_frames)

In [None]:
"""
StratificationSplitter class
"""

class StratificationSplitter:
    """
    Stratified train/test/validation splitter that also builds K-fold splits of the train part.

    Rows are identified by the index labels of `data`; rows with a duplicated index label are dropped
    (the last occurrence is kept). Splits are computed lazily on first access of `train_ids`, `test_ids`,
    `val_ids` or `cv_ids`.
    """

    def __init__(self, data, stratification_column_names, max_categories=10,
                 test_size=None, val_size=None, n_train_cv_splits=3, random_state=None):
        """
        :param data:                        DataFrame whose index labels identify the samples
        :param stratification_column_names: names of the columns of `data` to stratify by
        :param max_categories:              maximum number of categories per stratification column (> 2);
                                            numeric columns with more unique values are quantile-discretized
        :param test_size:                   None (no test set), a fraction (< 1) of the de-duplicated rows
                                            (rounded up) or an absolute number of rows
        :param val_size:                    same as `test_size`, for the validation set
        :param n_train_cv_splits:           number of cross-validation folds of the train part (>= 2)
        :param random_state:                seed used for shuffling and for `train_test_split`
        :raises ValueError:                 when a stratification column is missing, or when test and
                                            validation sets together leave fewer than `n_train_cv_splits` rows
        """
        logging.info(
            "Initializing StratificationSplitter with test_size {}, val_size {} and {} train data cross-validation "
            "splits. Stratification by {} columns with max_categories {}".format(
                test_size if test_size else 0, val_size if val_size else 0, n_train_cv_splits,
                stratification_column_names, max_categories
            )
        )
        assert data.shape[0] > n_train_cv_splits
        self._data = data[~data.index.duplicated(keep="last")]
        self.stratification_column_names = stratification_column_names
        assert max_categories > 2
        self.max_categories = max_categories
        self.test_size = test_size
        self.val_size = val_size
        # sizes below 1 are fractions of the de-duplicated dataset, everything else is a row count
        if test_size is not None:
            self.test_size = math.ceil(self._data.shape[0] * test_size) if test_size < 1 else int(test_size)
        if val_size is not None:
            self.val_size = math.ceil(self._data.shape[0] * val_size) if val_size < 1 else int(val_size)
        assert self.test_size is None or 0 < self.test_size <= self._data.shape[0] - n_train_cv_splits
        assert self.val_size is None or 0 < self.val_size <= self._data.shape[0] - n_train_cv_splits
        if test_size is not None and val_size is not None:
            # Compare against the de-duplicated row count (like the checks above): the raw `data`
            # may contain duplicated index labels that are not available for splitting.
            if self.test_size + self.val_size > self._data.shape[0] - n_train_cv_splits:
                raise ValueError(
                    "Sum of test and validation sizes must be less than dataset size minus `n_train_cv_splits`"
                )
        assert n_train_cv_splits >= 2
        self.n_train_cv_splits = n_train_cv_splits
        self.random_state = random_state
        # one column per accepted stratification feature, indexed like the de-duplicated data
        self._stratification = pd.DataFrame([], index=self._data.index.values)
        self._train_ids = None
        self._test_ids = None
        self._val_ids = None
        self._cv_ids = None
        for col in stratification_column_names:
            if col not in self._data.columns:
                raise ValueError("Column `{}` wasn't found in dataset".format(col))
            self._add_stratification_column(self._data[col])
        # NOTE: this reseeds NumPy's global random generator
        np.random.seed(self.random_state)
        if self._stratification.empty:
            # no usable stratification column: fall back to a random, balanced binary column
            random_col = np.zeros_like(self._stratification.index.values, dtype=np.int64)
            random_col[:len(random_col) // 2] = 1
            np.random.shuffle(random_col)
            self._stratification["no_stratification (random)"] = random_col

    def _add_stratification_column(self, col):
        """
        Adds `col` to the stratification table, discretizing it when needed, or reports why it is skipped
        :param col:                         Series (column of the de-duplicated data)
        """
        unique, unique_counts = np.unique(col, return_counts=True)
        if len(unique) <= self.max_categories and np.all(unique_counts >= self.n_train_cv_splits):
            # few enough categories, each large enough to appear in every fold: use as is
            self._stratification[col.name] = col
        elif len(unique) > self.max_categories and col.dtype.kind in {"f", "u", "i"}:
            # numeric column with too many values: bucket into quantiles
            self._stratification[col.name] = pd.qcut(col, self.max_categories, duplicates="drop")
            buckets = list(zip(*np.unique(self._stratification[col.name], return_counts=True)))
            logging.warning(
                "Column `{}` is numeric with more than {} unique values, so it was quantile-discretized "
                "into {} buckets: {}".format(col.name, self.max_categories, len(buckets), buckets)
            )
        elif not np.all(unique_counts >= self.n_train_cv_splits):
            print(
                "Column `{}` was removed from stratification because it had categories with less than {} members. The "
                "minimum number of members in any category cannot be less than number of cross-validation "
                "splits".format(col.name, self.n_train_cv_splits)
            )
        else:
            print(
                "Column `{}` was removed from stratification because it had more than {} categories and couldn't be "
                "discretized".format(col.name, self.max_categories)
            )

    def _split(self):
        """
        Computes the train/test/validation ids and the cross-validation folds of the train ids.

        Test and validation sets are carved out with stratified `train_test_split` calls. The folds are
        built per combined stratification category: the members of each category are shuffled, cut into
        `n_train_cv_splits` chunks and distributed over the folds.
        """
        # train/test/validation split
        logging.info("Creating train/test/validation set splits...")
        train_ids = self._stratification.index.values
        all_train_test_val_ids = [self._stratification.index.values]
        all_train_test_val_ids_names = ["full dataset"]
        if self.test_size is not None:
            train_ids, test_ids = train_test_split(
                train_ids, stratify=self._stratification.loc[train_ids],
                test_size=self.test_size, random_state=self.random_state
            )
            self._test_ids = test_ids
            all_train_test_val_ids.append(test_ids)
            all_train_test_val_ids_names.append("test")
        if self.val_size is not None:
            # the validation set is taken from what is left after removing the test set
            train_ids, val_ids = train_test_split(
                train_ids, stratify=self._stratification.loc[train_ids],
                test_size=self.val_size, random_state=self.random_state
            )
            self._val_ids = val_ids
            all_train_test_val_ids.append(val_ids)
            all_train_test_val_ids_names.append("validation")
        self._train_ids = train_ids
        all_train_test_val_ids.append(train_ids)
        all_train_test_val_ids_names.append("train")
        # log stratification percentages
        for feature in self._stratification.columns:
            messages = list()
            messages.append("Stratification on feature: %s" % feature)
            for i, ids in enumerate(all_train_test_val_ids):
                messages.append(
                    "%s sample:  Shape: %s, statistics: %s" % (
                        all_train_test_val_ids_names[i].upper(), len(ids),
                        (self._stratification.loc[ids, feature].value_counts().sort_index() / len(ids)).to_dict()
                    )
                )
            logging.info("\n".join(messages))

        # train cross validation split
        logging.info("Creating KFold cross-validation train data splits...")
        cv = []
        np.random.seed(self.random_state)
        groups = self._stratification.loc[train_ids].groupby(list(self._stratification.columns)).groups.items()
        remainder = 0
        for key, idx in sorted(groups, key=lambda x: x[0]):
            group_arr = idx.values.copy()
            min_examples = int(len(group_arr) / self.n_train_cv_splits)
            if min_examples < 1:
                print(
                    "Combined category `{}` of columns {} cannot be split into cross-validation folds equally because "
                    "the minimum number of members in any category cannot be less than number of cross-validation "
                    "splits: {} < {}".format(
                        key, list(self._stratification.columns), len(group_arr), self.n_train_cv_splits
                    )
                )
            np.random.shuffle(group_arr)
            split = np.array_split(group_arr, self.n_train_cv_splits)
            # np.array_split puts the surplus elements into the leading chunks; rotating the chunks by the
            # running remainder spreads those larger chunks over different folds from group to group
            cv.append(split[self.n_train_cv_splits - remainder:] + split[0:self.n_train_cv_splits - remainder])
            remainder = (remainder + len(group_arr)) % self.n_train_cv_splits
            # log stratification percentages
            message = (
                "KFold cross-validation stratification on combined category `{}` of columns {} split "
                "sizes: {}".format(key, list(self._stratification.columns), list(map(len, split)))
            )
            logging.info(message)
        # fold i is the concatenation of the i-th chunk of every category
        cv = list(map(np.concatenate, zip(*cv)))
        fold_sizes = list(map(len, cv))
        assert sum(fold_sizes) == len(train_ids)
        message = "KFold cross-validation split sizes: {}".format(fold_sizes)
        logging.info(message)
        self._cv_ids = []
        for idx in range(len(cv)):
            val = cv[idx]
            train = np.concatenate(cv[0:idx] + cv[idx+1:])
            assert len(set(train).intersection(set(val))) == 0, "No such indices that are in both train and val sets"
            self._cv_ids.append((train, val))

    @property
    def train_ids(self):
        """Index labels of the train set (computed on first access)."""
        if self._train_ids is None:
            self._split()
        return self._train_ids

    @property
    def test_ids(self):
        """Index labels of the test set, or None when no `test_size` was given."""
        if self._test_ids is None and self.test_size is not None:
            self._split()
        return self._test_ids

    @property
    def val_ids(self):
        """Index labels of the validation set, or None when no `val_size` was given."""
        if self._val_ids is None and self.val_size is not None:
            self._split()
        return self._val_ids

    @property
    def cv_ids(self):
        """List of `(train_ids, val_ids)` pairs, one per cross-validation fold of the train set."""
        if self._cv_ids is None:
            self._split()
        return self._cv_ids


In [None]:
def format_time(seconds):
    """
    Render a duration as `M:SS`, or `H:MM:SS` from one hour upwards
    :param seconds:                     float, duration in seconds
    :return:                            formatted time string (fractions of a second are truncated)
    """
    if seconds < 60:
        return f"0:{int(seconds):0>2}"
    if seconds < 3600:
        return f"{int(seconds / 60)}:{int(seconds % 60):0>2}"
    within_hour = seconds % 3600
    return f"{int(seconds / 3600)}:{int(within_hour / 60):0>2}:{int(within_hour % 60):0>2}"

In [None]:
"""
Functions for hyper-parameter optimization
"""

def tune_hyperparameters_cv(estimator, x, y, search_spaces, optimizers, n_iter, scoring=None,
                            cv=3, n_jobs=1,
                            verbose_output=0,
                            random_state=None, sample_weight=None, **kwargs):
    """
    Step-wise hyper-parameter tuning with cross-validation followed by a refit on the whole train data.

    Every step searches its own parameter space with the best parameters of the accepted previous steps
    applied; a step's result is only accepted when its best score is not worse than the best score so far.
    :param estimator:                   scikit-learn compatible estimator; it is updated with the best
                                        parameters, refitted in place and returned
    :param x:                           train features
    :param y:                           train targets
    :param search_spaces:               list of parameter spaces, one per step
    :param optimizers:                  list of "grid" / "random", one per step
    :param n_iter:                      number of sampled candidates of a "random" step
    :param scoring:                     scoring passed to the search (greater is better)
    :param cv:                          number of folds or a sequence of (train, validation) index pairs
    :param n_jobs:                      number of parallel jobs of the search
    :param verbose_output:              verbosity of the search
    :param random_state:                seed of a "random" step
    :param sample_weight:               optional sample weights forwarded to `fit`
    :param kwargs:                      additional key-word arguments forwarded to `fit`
    :return:                            tuple (fitted estimator, dict of best parameters)
    :raises ValueError:                 on unknown optimizers or when the number of search spaces and
                                        optimizers differs
    """
    allowed_optimizers = {"grid", "random"}
    invalid_optimizers = [optimizer for optimizer in optimizers if optimizer not in allowed_optimizers]
    if len(invalid_optimizers) > 0:
        raise ValueError(
            "Values in `optimizers` must be one of {}, found: {}".format(allowed_optimizers, invalid_optimizers)
        )
    if len(search_spaces) != len(optimizers):
        raise ValueError(
            "`search_spaces` and `optimizers` must have the same length, found: "
            "{} != {}".format(len(search_spaces), len(optimizers))
        )
    # `fit` parameters are always passed by key-word: the third positional parameter of
    # GridSearchCV.fit / RandomizedSearchCV.fit is `groups` (key-word only in recent scikit-learn),
    # and recent XGBoost makes everything after `y` key-word only as well.
    fit_params = dict(kwargs)
    if sample_weight is not None:
        fit_params["sample_weight"] = sample_weight
    n_steps = len(search_spaces)
    cv_splits = cv if isinstance(cv, int) else len(cv)
    processed_best_params = {}
    processed_best_score = -float("inf")
    for step in range(n_steps):
        step_start_time = time.time()
        print(
            "Step {} of {} of hyper-parameter optimization "
            "(`{}` optimizer)".format(step + 1, n_steps, optimizers[step])
        )
        if optimizers[step] == "grid":
            # initial value 1 keeps `reduce` from failing on an empty parameter grid
            n_fits = reduce(mul, [len(par_values) for par_values in search_spaces[step].values()], 1) * cv_splits
            optimizer_kwargs = {}
            optimizer = GridSearchCV
        else:
            n_fits = n_iter * cv_splits
            optimizer_kwargs = {"random_state": random_state, "n_iter": n_iter}
            optimizer = RandomizedSearchCV
        print("Models fitting time start: {}".format(time.strftime('%H:%M:%S', time.gmtime(time.time()))))
        print("Optimizer is fitting {} models...".format(n_fits))
        # tune a clone so that the passed estimator is only touched by the final refit
        estimator_clone = clone(estimator)
        estimator_clone.set_params(**processed_best_params)
        model = optimizer(
            estimator_clone, search_spaces[step], scoring=scoring, cv=cv,
            n_jobs=n_jobs, refit=False, verbose=int(verbose_output), **optimizer_kwargs
        )
        model.fit(x, y, **fit_params)
        iter_params = model.cv_results_["params"]
        iter_scores = model.cv_results_["mean_test_score"]
        iter_score_stds = model.cv_results_["std_test_score"]
        for iter_idx in range(len(iter_params)):
            print(
                "Parameters {}: mean-score={:.12f}, "
                "std-score={:.12f}".format(iter_params[iter_idx], iter_scores[iter_idx], iter_score_stds[iter_idx])
            )
        print(
            "Step {} optimization time: {}".format(step + 1, format_time(time.time() - step_start_time))
        )
        print(
            "Step {} completed. Best parameter tuning score is {:.12f} with "
            "parameters: {}".format(step + 1, model.best_score_, model.best_params_)
        )
        if model.best_score_ >= processed_best_score:
            processed_best_score = model.best_score_
            processed_best_params.update(model.best_params_)
        else:
            print(
                "Step {} best tuning score {:.12f} is worse than the previous step score {:.12f}, so parameters "
                "will be discarded and the correspondent default parameter values from the previous step will "
                "be used instead".format(step + 1, model.best_score_, processed_best_score)
            )
    print(
        "Best parameter tuning score overall is {:.12f} with "
        "parameters: {}".format(processed_best_score, processed_best_params)
    )
    print("Refitting model on the whole train dataset with best parameters..")
    estimator.set_params(**processed_best_params)
    estimator.fit(x, y, **fit_params)
    print("Hyper-parameter tuning completed")
    return estimator, processed_best_params


In [None]:
def get_data_by_ids(features, ids, target_col):
    """
    Selects the rows with the given ids and separates the target column from the remaining columns
    :param features:                    DataFrame of all features (including the target) with id index
    :param ids:                         list of ids to select, in the order they should be returned
    :param target_col:                  name of the target column
    :return:                            tuple (DataFrame without the target column, array of target values)
    """
    selected = features.loc[ids, :]
    predictors = selected.drop(columns=[target_col])
    targets = selected[target_col].values
    return predictors, targets

def get_indices_by_ids(features, ids):
    """
    Retrieves indices of features for specified ids
    :param features:                    DataFrame of all features with id index
    :param ids:                         list of ids to retrieve indices for
    :return:                            array of indices for requested ids
    :raises KeyError:                   when an id is not present in the index of `features`
    """
    # Look the positions up in a separate Series instead of temporarily adding a column to
    # `features`: the caller's frame is never modified, not even when the lookup raises.
    positions = pd.Series(np.arange(features.shape[0], dtype=np.int64), index=features.index)
    return positions.loc[ids].values

In [None]:
# ap = argparse.ArgumentParser(description="Regression model training with cross-validation")
# ap.add_argument("-v", "--verbose", action="store_true", help="flag for verbose output")
# args = ap.parse_args()

In [None]:
# Realized volatility per stock / time bucket from the training order books,
# left-joined onto the targets so that every target row is kept.
df_book = data_prep(list_order_book_file=list_order_book_file_train)

df_train = (
    train
    .merge(df_book[['row_id', 'log_return']], on=['row_id'], how='left')
    .set_index('row_id')
)
df_train.head()

In [None]:
# df_train = df_book_data#.loc[df_book_data['time_id']==5]
# df_wap['row_id'] = df_wap['time_id'].apply(lambda x:f'{stock_id}-{x}')

# df_train["target"] = df_train["log_return"].copy()
# df_train.head()


In [None]:
# Stratify by the target (the splitter quantile-buckets a numeric column with more
# than `max_categories` distinct values): 20% test, 10% validation, 10 CV folds.
splitter = StratificationSplitter(
    data=df_train,
    stratification_column_names=["target"],
    max_categories=30,
    test_size=0.2,
    val_size=0.1,
    n_train_cv_splits=10,
    # random_state=42
)

In [None]:
# Columns that must not reach the model. "target" stays in df_train here because
# get_data_by_ids separates it from the features later.
NON_FEATURE_COLUMNS = [] #"time_id", "wap", "log_return"
feature_columns = [name for name in df_train.columns if name not in NON_FEATURE_COLUMNS]
non_feature_data = df_train[[*NON_FEATURE_COLUMNS, "target"]].rename(columns={"target": "y_true"})
df_train = df_train.loc[:, feature_columns]

In [None]:
# Materialise the three data sets, then translate the id-based CV folds into
# positional indices into x_train.
x_train, y_train = get_data_by_ids(df_train, splitter.train_ids, "target")
x_val, y_val = get_data_by_ids(df_train, splitter.val_ids, "target")
x_test, y_test = get_data_by_ids(df_train, splitter.test_ids, "target")
cv_splits = [
    (get_indices_by_ids(x_train, fold_train_ids), get_indices_by_ids(x_train, fold_val_ids))
    for fold_train_ids, fold_val_ids in splitter.cv_ids
]

In [None]:
# Base XGBoost configuration; the tuning steps below override single entries.
XGB_PARAMETERS = {
    "booster": "gbtree",
    "verbosity": 1,
    "objective": "reg:squaredlogerror",
    "eval_metric": ["rmse", "rmsle"],
    "importance_type": "gain",
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "max_depth": 5,
    "min_child_weight": 6,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.2,
    "reg_lambda": 10,
    "reg_alpha": 10,
}
# number of sampled candidates of a "random" tuning step
HYPER_PARAMETER_TUNE_RANDOM_N_ITER = 20

# Search spaces, one per tuning step. stats.uniform(loc, scale) samples from [loc, loc + scale].
XGB_TUNABLE_PARAMETERS_STEP_1 = {
    "max_depth": [4],
    "min_child_weight": [6],
}#list(range(1, 8))
XGB_TUNABLE_PARAMETERS_STEP_2 = {
    "gamma": stats.uniform(0, 10),
    "colsample_bytree": stats.uniform(0.01, 0.99),
    "subsample": stats.uniform(0.7, 0.3),
}
# XGB_TUNABLE_PARAMETERS_STEP_3 = {"reg_lambda": LogUniform(1, 1000), "reg_alpha": LogUniform(1, 1000)}
XGB_TUNABLE_PARAMETERS_STEP_4 = {
    "learning_rate": [0.4, 0.2, 0.1, 0.06, 0.03, 0.01, 0.005],
}

# Steps that are actually run, with the optimizer used for each of them.
XGB_TUNABLE_PARAMETERS = [
    XGB_TUNABLE_PARAMETERS_STEP_1,
#     XGB_TUNABLE_PARAMETERS_STEP_2,
#     XGB_TUNABLE_PARAMETERS_STEP_3,
#     XGB_TUNABLE_PARAMETERS_STEP_4
]
XGB_TUNING_OPTIMIZERS = ["grid"]# ,"random", "grid"

In [None]:
# Untuned regressor built from the base configuration.
estimator = xgb.XGBRegressor(**XGB_PARAMETERS)

In [None]:
# Tune on the pre-computed CV folds; scoring is the negated MSE because the search maximises the score.
model, best_params = tune_hyperparameters_cv(
    estimator=estimator,
    x=x_train,
    y=y_train,
    search_spaces=XGB_TUNABLE_PARAMETERS,
    optimizers=XGB_TUNING_OPTIMIZERS,
    n_iter=HYPER_PARAMETER_TUNE_RANDOM_N_ITER,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=cv_splits,
    n_jobs=1,
    # verbose_output=args.verbose,
    # model key-word params (forwarded to the estimator's fit):
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=50,
    verbose=False,
    # random_state=42
)
XGB_PARAMETERS.update(best_params)

In [None]:
# Predict all three data sets, limiting the trees to model.best_ntree_limit.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features, ntree_limit=model.best_ntree_limit)
    for features in (x_train, x_val, x_test)
)

In [None]:
# non_feature_data_test = non_feature_data.loc[splitter.test_ids].copy()
# non_feature_data_test["y_pred"] = y_test_pred

In [None]:
# MAE, MSE and R^2 of every data set, reported on one line (RMSE is derived from the MSE).
train_mae, train_mse, train_r_2 = (
    metric(y_train, y_train_pred) for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
val_mae, val_mse, val_r_2 = (
    metric(y_val, y_val_pred) for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
test_mae, test_mse, test_r_2 = (
    metric(y_test, y_test_pred) for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
report_template = (
    "Regression model, model trees={}, train_mae={:.12f}, train_mse={:.12f}, train_rmse={:.12f}, "
    "train_r_2={:.12f}, val_mae={:.12f}, val_mse={:.12f}, val_rmse={:.12f}, val_r_2={:.12f},"
    "test_mae={:.12f}, test_mse={:.12f}, test_rmse={:.12f}, test_r_2={:.12f}"
)
print(
    report_template.format(
        model.best_ntree_limit,
        train_mae, train_mse, np.sqrt(train_mse), train_r_2,
        val_mae, val_mse, np.sqrt(val_mse), val_r_2,
        test_mae, test_mse, np.sqrt(test_mse), test_r_2,
    )
)

In [None]:
def rmspe(y_true, y_pred):
    """
    Root mean squared percentage error
    :param y_true:                      true values (used as the denominator of the relative error)
    :param y_pred:                      predicted values
    :return:                            square root of the mean squared relative error
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))
# (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))



# Hold-out performance, rounded to three decimals.
R2 = round(r2_score(y_test, y_test_pred), 3)
RMSPE = round(rmspe(y_test, y_test_pred), 3)
print(f'Performance: R2 score: {R2}, RMSPE: {RMSPE}')

# submit data

In [None]:
# Build the submission: same feature preparation as for training, applied to the test order books.
# glob returns paths in arbitrary order, so sort for reproducible processing.
list_order_book_file_test = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
)
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

df_book = data_prep(list_order_book_file=list_order_book_file_test)
# rows without order-book data get a feature value of 0 after the left join
df_test = test.merge(df_book[['row_id','log_return']], on = ['row_id'], how = 'left').fillna(0)
df_test = df_test.set_index('row_id')

NON_FEATURE_COLUMNS = ['stock_id','time_id']
feature_columns = [col for col in df_test.columns if col not in NON_FEATURE_COLUMNS]
df_test = df_test[feature_columns]

# NOTE: this rebinds y_test_pred, which held the hold-out predictions in the cells above
y_test_pred = model.predict(df_test, ntree_limit=model.best_ntree_limit)

df_test = df_test.reset_index()
# assign the prediction array positionally; wrapping it in a DataFrame would make the
# assignment depend on index alignment between the two frames
df_test['pred'] = y_test_pred
df_test[['row_id','pred']].to_csv('submission.csv',index = False)
df_test[['row_id','pred']]
# df_test