In [54]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices
import pickle
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from itertools import product

In [55]:
#Function for loading data!

def load_data():
    
    df = pickle.load(open('df_down_sampled.p','rb'))
    df = df.drop(['resp_1', 'resp_2','resp_3','resp_4'], axis = 1)
    df = df.sort_values(by = 'ts_id')
    df['y'] = 0
    mask = df.resp > 0
    df.loc[mask,'y'] = 1    
    
    return df

In [56]:
#Function for splitting data into train/test set!

def train_test_split(test_share, data):
    
    #Split data into initial train/test
    
    train_share = 1 - test_share    
    train_size = int(len(data) * train_share)
    train_set = data[0:train_size]
    test_set = data[train_size:len(data)]    
    
    
    return (train_set, test_set)

In [57]:
def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        
        col_type = df[col].dtype
        
        if col_type != 'object':
           
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:                    
                    df[col] = df[col].astype(np.int8)
                
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
                
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [58]:
def generate_train_set(df):
    
    y = df.y
    X = df.drop(['date','weight','y','ts_id', 'resp'], axis = 1)
    
    return X, y

In [59]:
def get_n_most_recent_observations(df, n):
    
    """Function takes as input a dataframe df, and returns the n most recent observations!"""
    
    cut_off = len(df) - n
    
    return df[cut_off:]    

In [60]:
def train_model(X_train, y_train, model):
    
    model.fit(X_train, y_train)
    
    return model   

In [61]:
def score_train_set_prediction(X_train, y_train, model):
    
    y_pred_train = model.predict_proba(X_train)[:,1]
    train_roc = roc_auc_score(y_train, y_pred_train)
    
    return train_roc        

In [62]:
def transform_test_set(df):
    
    test_df = df.drop(['date','weight','ts_id', 'resp'], axis = 1)
    
    return test_df

In [63]:
def score_test_set_prediction(test_df, model):    
    
    y_pred_test = model.predict_proba(test_df.drop('y', axis = 1))[:,1]
    test_roc = roc_auc_score(test_df.y, y_pred_test)
    
    return test_roc

In [64]:
def back_test_strategy(df, n, model):
    
    """
    Function for back-testing trading-strategy. Takes as input the dataset (df) hyperparameter n, a model and a dictionary
    for the model hyper-parameters. Df corresponds to the dataset for which we want to evaluate the trading-strategy. The
    hyperparameter n corresponds to the number of new datapoints that we need to go through prior to using the (n) most
    recent data-points to retrain the model.        
    """

    train_set_predictions = []   
    train_set_true = []
    
    test_set_predictions = []
    test_set_true = []
    
    number_of_steps = int(len(df)/n)
    print("The strategy consists of {} steps.".format(number_of_steps))
    
    for i in range(number_of_steps):
    
        print(i)
    
        #Generate markers for where training starts and stops!
    
        start = i*n
        stop = (i + 1)*n

        #Split data into train/test set!
    
        train_df = df[start:stop]
        
        try:
    
            test_df = df[stop:(stop + n)]
        
        except:
        
            test_df = df[stop:]
            
        if len(test_df) == 0:
            
            break
        
        else:
            
            pass
    
        #Train model and collect train roc!
    
        X_train, y_train = generate_train_set(train_df)
        model = train_model(X_train, y_train, model)
        y_pred_train = model.predict_proba(X_train)[:,1]
        train_set_predictions = train_set_predictions + y_pred_train.tolist()
        train_set_true = train_set_true + y_train.tolist()
        
        #Predict on test set and collect test-roc!
    
        test_df = transform_test_set(test_df)
        y_pred_test = model.predict_proba(test_df.drop('y', axis = 1))[:,1]
        test_set_predictions = test_set_predictions + y_pred_test.tolist() 
        test_set_true = test_set_true + test_df.y.tolist()
    
    
    train_set_predictions = np.array(train_set_predictions)
    train_set_true = np.array(train_set_true)
    
    test_set_predictions = np.array(test_set_predictions)
    test_set_true = np.array(test_set_true)
    
    train_roc = roc_auc_score(train_set_true, train_set_predictions)
    test_roc = roc_auc_score(test_set_true, test_set_predictions)     
    
    return train_roc, test_roc

In [65]:
def find_optimal_strategy(df, n, model):
    
    """
    Function takes as input a dataframe df, model and a list (n) of strategy-hyperparameters 
    to test.
    
    Returns a dictionary with parameter values and the associated train/test roc:s.    
    """
    
    result_dict = {}
    
    for x in n:    
        
        train_roc, test_roc = back_test_strategy(df = df, n = x, model = model)
        result_dict[x] = (train_roc, test_roc)
        
    return result_dict  

In [72]:
def implement_strategy_on_test_set(train_df, test_df, n, model):
    
    train_df = train_df.tail(n)
    df = pd.concat([train_df, test_df], axis = 0)
    
    number_of_steps = int(len(df)/n)
    print("The strategy consists of {} steps.".format(number_of_steps))
    
    test_set_predictions = []
    test_set_true = []
    
      
    for i in range(number_of_steps):
    
        print(i)
    
        #Generate markers for where training starts and stops!
    
        start = i*n
        stop = (i + 1)*n

        #Split data into train/test set!
    
        train_df = df[start:stop]
        
        try:
    
            test_df = df[stop:(stop + n)]
        
        except:
        
            test_df = df[stop:]
            
        if len(test_df) == 0:
            
            break
        
        else:
            
            pass
    
        #Train model and collect train roc!
    
        X_train, y_train = generate_train_set(train_df)
        model = train_model(X_train, y_train, model)

        #Predict on test set and collect test-roc!
    
        test_df = transform_test_set(test_df)
        y_pred_test = model.predict_proba(test_df.drop('y', axis = 1))[:,1]
        test_set_predictions = test_set_predictions + y_pred_test.tolist() 
        test_set_true = test_set_true + test_df.y.tolist()
        
    
    test_set_predictions = np.array(test_set_predictions)
    test_set_true = np.array(test_set_true)
    
    test_roc = roc_auc_score(test_set_true, test_set_predictions)     
    
    return test_roc

In [73]:
#Load data!

df = load_data()

In [74]:
train_set, test_set = train_test_split(test_share = 0.3, data = df)

In [75]:
train_set = reduce_memory_usage(train_set)

Memory usage of dataframe is 369.0548095703125 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Memory usage of dataframe after reduction 94.29893112182617 MB
Reduced by 74.44852941176471 % 


In [76]:
test_set = reduce_memory_usage(test_set)

Memory usage of dataframe is 158.167236328125 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Memory usage of dataframe after reduction 40.41405487060547 MB
Reduced by 74.44852941176471 % 


In [None]:
res_dict = find_optimal_strategy(df = train_set, n = n, model = XGBClassifier(n_trees = 1))

Splitting at n = 3000, n_trees = 1 seems to increase performance. Test on hold out!

In [77]:
test_roc = implement_strategy_on_test_set(train_df = train_set, test_df = test_set, n = 3000, model = XGBClassifier(n_trees = 1))

The strategy consists of 51 steps.
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


0.5695695418421014