This kernel shows how to compute lag features separately for the train and test phases.

In [None]:
# Taken from Vadim's code https://www.kaggle.com/nareyko/fast-lags-calculation-concept-using-numpy-arrays
# comparing with my own code

# from twosigmanews import *
from kaggle.competitions import twosigmanews

In [None]:
import pandas as pd
import numpy as np
import gc
from resource import getrusage, RUSAGE_SELF
from datetime import date, datetime

import multiprocessing
from multiprocessing import Pool, cpu_count

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Reference timestamp; using() subtracts it from the current time to report elapsed time.
STARTED_TIME = datetime.now()

# Query the system for the core count instead of hard-coding it -
# who knows what the machine looks like during the test phase.
N_THREADS = cpu_count()

print(f'N_THREADS: {N_THREADS}')

In [None]:
# FILTERDATE: first date of the training data to keep.
FILTERDATE = date(year=2007, month=1, day=1)

# SAMPLEDATE: optional cut-off used for sampling / quick sanity runs of the script.
# None disables sampling; e.g. date(2007, 1, 30) gives a short run.
SAMPLEDATE = None

In [None]:
# Lag windows in ascending order; 252 lets us try 1-year lags.
N_LAG = np.array(sorted((5, 10, 20, 252)))

# Source columns the lag features are calculated from.
RETURN_FEATURES = [
    'returnsOpenPrevMktres10', 'returnsOpenPrevRaw10',
    'open', 'close',
]

In [None]:
# Time and memory tracking.
# MAXRSS holds the largest ru_maxrss value observed so far for this process.
MAXRSS = getrusage(RUSAGE_SELF).ru_maxrss


def using(point=""):
    """Print a one-line progress report and trigger garbage collection.

    The line contains the time elapsed since STARTED_TIME (fractional seconds
    stripped), the ``point`` label, and the running maximum of ``ru_maxrss``
    divided by 1024 twice.
    NOTE(review): the "Gib" label presumes ru_maxrss is reported in kilobytes;
    the unit is platform dependent - confirm for the target system.

    Args:
        point: free-form label identifying the step being reported.
    """
    global MAXRSS
    elapsed = str(datetime.now() - STARTED_TIME).partition('.')[0]
    print(elapsed, point, end=' ')
    # Keep the high-water mark across calls.
    MAXRSS = max(MAXRSS, getrusage(RUSAGE_SELF).ru_maxrss)
    print(f'max RSS {MAXRSS/1024/1024:.1f}Gib')
    gc.collect()

In [None]:
#Added this function
def charlies_calculation(market_train_df, only_last_day = False, cols = None,
                         windows = None, calculations = None, feat_funcs = None):
    """Add rolling-window features per asset to a market dataframe.

    Each source column is pivoted into a (time x codeint) matrix, rolling
    aggregations are computed along the time axis, and the results are merged
    back onto the input rows on ['time', 'codeint']. New columns are named
    ``<source column><window><aggregation name>``, e.g. ``open5median``.

    Args:
        market_train_df: DataFrame containing 'time', 'codeint' and every column
            listed in ``cols``. (time, codeint) pairs must be unique, otherwise
            ``DataFrame.pivot`` raises.
        only_last_day: when True, features are computed only for the most recent
            'time' value; all earlier rows are kept as they are. When False,
            features are computed for every row.
        cols: source columns; defaults to the four price/return columns.
        windows: rolling window lengths in rows (days); defaults to [5, 10, 20, 252].
        calculations: names of built-in rolling aggregations; defaults to
            ['median', 'max', 'min'].
        feat_funcs: optional callables applied through ``rolling(...).agg``;
            each one's ``__name__`` is used in the column name.

    Returns:
        A new DataFrame with the feature columns added and every remaining NaN
        (in any column) replaced by 0.
    """
    if cols is None:
        cols = ['returnsOpenPrevMktres10', 'returnsOpenPrevRaw10', 'open', 'close']
    if windows is None:
        windows = [5, 10, 20, 252]
    if calculations is None:
        calculations = ['median', 'max', 'min']
    if feat_funcs is None:
        feat_funcs = []

    keepcols = ['time', 'codeint']

    # Built-in aggregations first, then user functions - this fixes the column order.
    aggregations = [(str(calc), calc) for calc in calculations]
    aggregations += [(f.__name__, f) for f in feat_funcs]

    names = []
    series = []
    for c in cols:
        # Matrix ndays x nassetcode: this is what saves the time when calculating rolling features.
        pivoted = market_train_df.pivot(index = 'time', columns = 'codeint', values = c).astype('float32')
        for label, agg in aggregations:
            for r in windows:
                # For the last day only the trailing r rows can contribute to the window.
                window = pivoted.iloc[-r:] if only_last_day else pivoted
                rolled = window.rolling(r).agg(agg).astype('float32')
                if only_last_day:
                    rolled = rolled.iloc[-1:]
                names.append(c + str(r) + label)
                # unstack() turns the matrix into a Series indexed by (codeint, time).
                series.append(rolled.unstack())

    if not series:
        # Nothing requested: keep the documented "NaN -> 0" contract and return.
        return market_train_df.fillna(0)

    reshape = pd.concat(series, axis=1, keys=names)
    reshape.index.names = ['codeint', 'time']
    reshape = reshape.reset_index()

    # If the same feature columns already exist in the input, drop them before merging.
    todrop = [name for name in names if name not in keepcols]

    if only_last_day:
        is_last = market_train_df['time'] == market_train_df['time'].max()
        last_rows = (market_train_df[is_last]
                     .drop(columns = todrop, errors='ignore')
                     .merge(reshape, how='left', on=keepcols))
        # Keep the history untouched. The original dropped by the full index of a
        # boolean mask, which removed every row and left only the last day.
        history = market_train_df[~is_last]
        result = pd.concat([history, last_rows], ignore_index=True, sort=False)
    else:
        result = (market_train_df
                  .drop(columns = todrop, errors='ignore')
                  .merge(reshape, how='left', on=keepcols))
    return result.fillna(0)

In [None]:


class doctorstring():
    """Incremental encoder that maps items (e.g. asset code strings) to integers.

    Codes are assigned in first-seen order starting at 0 and never change once
    assigned, so the encoder can be extended with new items at any time.

    Attributes (plain instance attributes):
        mainlist: known items, position == code.
        dictionary: code -> item.
        inv_dict: item -> code.
    """

    def __init__(self):
        self.mainlist = []
        # Created up front so encode() works before the first newitems() call.
        self.dictionary = {}
        self.inv_dict = {}

    def newitems(self, new):
        """Register every not-yet-known item of ``new`` (any iterable, e.g. a DF series).

        Items are processed in iteration order, which keeps the assigned codes
        deterministic between runs; duplicates are ignored.
        """
        for item in new:
            if item in self.inv_dict:
                continue
            code = len(self.mainlist)
            self.mainlist.append(item)
            self.dictionary[code] = item
            self.inv_dict[item] = code

    def encode(self, toencode):
        """Return the integer code of ``toencode``, or None when it is unknown."""
        try:
            return self.inv_dict.get(toencode)
        except TypeError:
            # Unhashable input can never be a registered item.
            return None

    def encodeassetcodes(self, toencode):
        """Encode a Python-literal collection given as text, e.g. "['A', 'B']".

        ``str(toencode)`` is parsed with ``ast.literal_eval`` and each element is
        passed through encode(); unknown elements yield None. The output order
        follows the iteration order of the parsed literal.
        """
        import ast  # local import: not part of the notebook's import cell
        parsed = ast.literal_eval(str(toencode))
        return [self.encode(code) for code in parsed]


In [None]:
# Pre-processing of the dataframe; this function is the same for the train and test periods.
# In production we had more calculations.
# NOTE: no pre-processing function is defined in this cell.



Let's start

In [None]:
# Create the competition environment and pull the market and news training frames.
env = twosigmanews.make_env()
market_train_df, news_train_df = env.get_training_data()
using(point='Done')

In [None]:
print('Dataframe pre-processing')

# stocklist manages the conversion of all stock codes to integers.
stocklist = doctorstring()
asset_codes = market_train_df['assetCode']
stocklist.newitems(asset_codes)

market_train_df = market_train_df.drop(columns=['assetName'])
# Integer id per asset, stored as 'codeint'.
market_train_df['codeint'] = asset_codes.apply(stocklist.encode)

using(point='Done')

In [None]:
# Dataframe filtering
# NOTE(review): no filtering statement is present in this cell; it only reports progress.
using(point='Done')

In [None]:
print('Lag features generation')
# My own implementation: rolling features for every row of the training frame.
market_train_df = charlies_calculation(market_train_df, only_last_day=False)

using(point='Done')

In [None]:
print('keep only last n days to carry forward for test phase')

# Suppose training is done and the whole market train df is no longer needed:
# keep only the most recent `keepdays` distinct dates (252, our largest lag, i.e. one year).
keepdays = 252
all_dates = np.sort(market_train_df['time'].unique())
uniquedates = all_dates[-keepdays:]
is_recent = market_train_df['time'].isin(uniquedates)
market_train_df = market_train_df[is_recent]
using(point='Done')

In [None]:
print('Prediction')
# Walk through the prediction days one at a time, extending the rolling history
# with each day's observations and submitting a prediction for that day.
days = env.get_prediction_days()
n_days = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    if n_days % 100 == 0:
        using(f'{n_days}')

    # Test data preprocessing - same steps as applied to the training frame.
    market_obs_df = market_obs_df.drop(columns=['assetName'], errors='ignore')
    stocklist.newitems(market_obs_df['assetCode'])
    market_obs_df['codeint'] = market_obs_df['assetCode'].apply(stocklist.encode)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    market_train_df = pd.concat([market_train_df, market_obs_df],
                                ignore_index=True, sort=False)
    market_train_df = charlies_calculation(market_train_df, only_last_day = True)

    # keep saving only last n days
    uniquedates = np.sort(market_train_df['time'].unique())[-keepdays:]
    market_train_df = market_train_df[market_train_df['time'].isin(uniquedates)]

    # Placeholder prediction: a constant confidence of 0 for every observed asset.
    confidence = 0
    preds = pd.DataFrame({'assetCode': market_obs_df['assetCode'], 'confidence': confidence})

    # Replace the template's confidenceValue with ours; assets without a prediction get 0.
    predictions_template_df = (
        predictions_template_df.merge(preds, how='left')
        .drop('confidenceValue', axis=1)
        .fillna(0)
        .rename(columns={'confidence': 'confidenceValue'})
    )

    env.predict(predictions_template_df)
    gc.collect()

using('Prediction done')

# env.write_submission_file()
# using('Done')