# Universal Custom Estimators

### Cross Validator

In [None]:
import pandas as pd
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *
from sklearn.model_selection import BaseCrossValidator

class TimeBasedCV(BaseCrossValidator):
    '''
    Rolling-window cross-validator that splits records by calendar date.

    Each fold trains on ``train_period`` time units and tests on the
    ``test_period`` units that follow (optionally separated by a gap).
    After every fold the whole window moves forward by ``test_period``.

    Parameters
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        default is days

    Notes
    -----
    Window boundaries are compared against ``Series.dt.date`` values, i.e.
    at calendar-day resolution.
    NOTE(review): sub-day frequencies (hours, minutes, seconds) presumably
    do not work with this date-level comparison - confirm before relying
    on them.
    '''

    def __init__(self, train_period=30, test_period=7, freq='days'):
        self.train_period = train_period
        self.test_period = test_period
        self.freq = freq
        # Filled in by split(); None means no split has been generated yet.
        self.n_splits = None

    def _offset(self, periods):
        '''Return a relativedelta spanning *periods* units of ``self.freq``.'''
        return relativedelta(**{self.freq: periods})

    def split(self, data, validation_split_date=None, date_column='date', gap=0):
        '''
        Generate indices to split data into training and test set

        Parameters
        ----------
        data: pandas DataFrame
            your data, contain one column for the record date
        validation_split_date: datetime.date
            first date to perform the splitting on.
            if not provided will set to be the minimum date in the data after the first training set.
            a datetime.datetime (or pandas Timestamp) is truncated to its date.
        date_column: string, default='date'
            name of the column holding the date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* time units (in ``freq`` units) are left between train and test sets

        Returns
        -------
        index_output:
            list of tuples (train index, test index) similar to sklearn model selection.
            each element of a tuple is a list of index labels of ``data``.

        Raises
        ------
        KeyError
            if ``date_column`` cannot be read from ``data``
        ValueError
            if ``test_period`` is not positive (the window would never advance)
        '''

        # check that date_column exist in the data; keep raising KeyError as
        # before, but without trapping KeyboardInterrupt / SystemExit:
        try:
            date_series = data[date_column]
        except Exception as err:
            raise KeyError(date_column) from err

        # the window advances by test_period each fold, so a non-positive
        # value would make the loop below run forever:
        if self.test_period <= 0:
            raise ValueError("test_period must be a positive number of time units")

        train_span = self._offset(self.train_period)
        test_span = self._offset(self.test_period)
        gap_span = self._offset(gap)

        if validation_split_date is None:
            validation_split_date = date_series.min().date() + train_span
        elif isinstance(validation_split_date, datetime.datetime):
            # boundaries are compared with datetime.date values below, and
            # Python refuses to order a datetime against a date
            validation_split_date = validation_split_date.date()

        # loop-invariant values, computed once instead of on every fold:
        last_date = date_series.max().date()
        record_dates = date_series.dt.date

        start_train = validation_split_date - train_span
        end_train = start_train + train_span
        start_test = end_train + gap_span
        end_test = start_test + test_span

        index_output = []

        # a fold is produced only while the test window ends strictly before
        # the latest record date
        while end_test < last_date:
            # train indices (half-open interval [start_train, end_train)):
            in_train = (record_dates >= start_train) & (record_dates < end_train)
            cur_train_indices = list(data.index[in_train.to_numpy()])

            # test indices (half-open interval [start_test, end_test)):
            in_test = (record_dates >= start_test) & (record_dates < end_test)
            cur_test_indices = list(data.index[in_test.to_numpy()])

            print("Train period:",start_train,"-" , end_train, ", Test period", start_test, "-", end_test,
                  "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            # mimic sklearn output
            index_output.append((cur_train_indices, cur_test_indices))

            # update dates: slide the whole window forward by one test period
            start_train = start_train + test_span
            end_train = start_train + train_span
            start_test = end_train + gap_span
            end_test = start_test + test_span

        self.n_splits = len(index_output)

        return index_output

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X, y, groups : object
            Ignored; accepted so the method can be called with the same
            arguments as other scikit-learn cross-validators.

        Returns
        -------
        n_splits : int
            Number of (train, test) pairs produced by the most recent
            call to split().

        Raises
        ------
        AttributeError
            if split() has not been called yet
        """
        if self.n_splits is None:
            raise AttributeError(
                "n_splits is not available until split() has been called"
            )
        return self.n_splits

### Grid Search

In [None]:
from sklearn.datasets import make_classification
from dask_ml.datasets import random_date
from datetime import date

import numpy as np

# Build a small synthetic binary-classification data set with a date column
# so the time-based cross-validator has something to split on.
X, y = make_classification(
    n_samples=100,
    n_features=5,
    random_state=123,
)

# Bounds handed to random_date for every generated record date.
dates = (date(2020, 1, 1), date(2021, 1, 1))
columns = ["var" + str(i) for i in range(np.shape(X)[1])]
y_series = pd.Series(y, name='target')
X_df = pd.DataFrame(X, columns=columns)

# NOTE(review): random_date is presumably not governed by random_state above,
# so the dates may differ between runs - confirm if reproducibility matters.
# pd.to_datetime is used instead of astype('datetime64'): pandas 2.x rejects
# casting to a unit-less datetime64 dtype.
X_df['date'] = pd.to_datetime([random_date(*dates) for _ in range(len(X_df))])
X_df

In [None]:
# Rolling folds: 90 days of training data followed by 30 days of test data.
tscv = TimeBasedCV(train_period=90, test_period=30, freq='days')

# Precompute the (train, test) index pairs, with the first test window
# starting on 2020-06-01 and no gap between train and test.
index_output = tscv.split(
    data=X_df,
    validation_split_date=date(2020, 6, 1),
    date_column='date',
    gap=0,
)

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Base learner whose hyper-parameters are searched below.
est = xgb.XGBClassifier(use_label_encoder=False)

# Search space. Only min_child_weight is active; further candidates that can
# be switched on by adding them to the dict:
#   'objective': ['binary:logistic']
#   'eval_metric': ['logloss']
#   'gamma': [0.5, 1, 1.5, 2, 5]
#   'subsample': [0.6, 0.8, 1.0]
#   'colsample_bytree': [0.6, 0.8, 1.0]
#   'max_depth': [3, 4, 5]
params = {'min_child_weight': [1, 5, 10]}

# cv is given the precomputed list of (train, test) index pairs rather than a
# splitter object; candidates are ranked by ROC AUC.
grid_search = GridSearchCV(
    est,
    params,
    scoring='roc_auc',
    cv=index_output,
    n_jobs=-1,
    verbose=3,
)

In [None]:
%%time
grid_search.fit(X_df.drop('date', axis=1), y_series)

In [None]:
# Display the best parameter combination found by the search and its score.
grid_search.best_params_, grid_search.best_score_