# Getting Started Kernel

In [None]:
# Only market rows whose 'time' is later than this date are kept for training.
start_date = '2015-01-01'
# When True, prepare_market_data adds the rolling-volume features 'VA10' and 'NV'.
rolling_flag = True
# Number of trees passed to RandomForestRegressor(n_estimators=...).
num_of_estimators = 20

Import packages and create the special environment:

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import datetime
import time
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from kaggle.competitions import twosigmanews
# Competition environment object: used below to fetch the training data,
# iterate over the prediction days, submit predictions and write the
# submission file.
# NOTE(review): presumably make_env() may only be called once per kernel
# session, so re-running this cell may fail — confirm.
env = twosigmanews.make_env()

Get the training data, consisting of market prices and market news:

In [None]:
# Fetch the full market and news training sets from the competition environment.
(all_market_train_df, news_train_df) = env.get_training_data()

# Keep only rows later than `start_date`; .copy() gives an independent frame
# so later column additions never touch the full training set.
market_train_df = all_market_train_df[all_market_train_df['time'] > start_date].copy()

# Series reductions are vectorised, unlike the Python built-ins min()/max().
train_start_date = market_train_df['time'].min()
train_end_date = market_train_df['time'].max()

# shape[0] is the number of rows (one per asset per day), not the number of
# days, so report both figures explicitly.
n_train_rows = market_train_df.shape[0]
n_train_days = market_train_df['time'].dt.normalize().nunique()
print("Training data has {} rows covering {} days from {} to {}".format(
    n_train_rows, n_train_days, train_start_date, train_end_date))
print("There are {} unique assets in training data".format(len(market_train_df["assetCode"].unique())))

# Mean of the target column; reused later as the centre of the random baseline.
mean_return = market_train_df['returnsOpenNextMktres10'].mean()
print("Mean market return for this period was: {}".format(mean_return))

In [None]:
def prepare_market_data(market_df, feature_cols, train_test, rolling_flag):
    """Build the model-ready feature frame from raw market rows.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Raw market data. Must contain 'time', 'assetCode', 'open', 'close',
        every raw column named in ``feature_cols``, 'volume' when
        ``rolling_flag`` is true, and 'returnsOpenNextMktres10' when
        ``train_test == 'train'``. The frame is never modified.
    feature_cols : list of str
        Feature columns to keep. May already name the derived columns
        'CO', 'NV' and 'VA10'.
    train_test : {'train', 'test'}
        'train' additionally keeps the target column
        'returnsOpenNextMktres10'.
    rolling_flag : bool
        When true, adds 'VA10' (10-row rolling mean of volume, computed per
        asset in time order) and 'NV' (volume divided by 'VA10').

    Returns
    -------
    pandas.DataFrame
        'time', 'assetCode', the requested features (plus the target for
        'train'), each column exactly once, with every row containing a
        missing value removed. With ``rolling_flag`` the rows are grouped by
        asset (in order of first appearance) and sorted by time within each
        asset.

    Raises
    ------
    ValueError
        If ``train_test`` is neither 'train' nor 'test'.
    """
    if train_test == 'train':
        interest_cols = ['time', 'assetCode'] + list(feature_cols) + ['returnsOpenNextMktres10']
    elif train_test == 'test':
        interest_cols = ['time', 'assetCode'] + list(feature_cols)
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            "train_test must be 'train' or 'test', got {!r}".format(train_test))

    if rolling_flag:
        # One pass over the groups and a single concat, instead of one full
        # boolean scan per asset plus a concat that grows on every iteration.
        per_asset = []
        for _, asset_df in market_df.groupby('assetCode', sort=False):
            ordered = asset_df.sort_values(by='time', axis=0, ascending=True)
            per_asset.append(
                ordered.assign(VA10=ordered['volume'].rolling(window=10).mean()))

        if per_asset:
            market_df = pd.concat(per_asset)
        else:
            # Empty input: keep the schema consistent instead of crashing.
            market_df = market_df.assign(VA10=float('nan'))

        market_df = market_df.assign(NV=market_df['volume'] / market_df['VA10'])
        interest_cols = interest_cols + ['NV', 'VA10']

    # feature_cols may already list 'NV'/'VA10'; selecting a label twice would
    # duplicate the column in the output (and in every frame sliced from it).
    interest_cols = list(dict.fromkeys(interest_cols))

    # assign() returns a new frame, so the caller's data is left untouched.
    market_df = market_df.assign(CO=market_df['close'] / market_df['open'] - 1)
    return market_df[interest_cols].dropna(axis=0)

In [None]:
# Base feature columns: price levels, the derived 'CO' column and the
# previous-period return columns supplied with the market data.
feature_cols = [
    'close', 'open', 'CO',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
]

# Volume enters the model either as the rolling features or as the raw column.
feature_cols += ['NV', 'VA10'] if rolling_flag else ['volume']

# Build the training matrix and the target vector from the prepared frame.
new_market_train_df = prepare_market_data(
    market_train_df, feature_cols, 'train', rolling_flag
)
X = new_market_train_df.loc[:, feature_cols]
y = new_market_train_df["returnsOpenNextMktres10"].values

## Train our Model:

In [None]:
t = time.time()

# Fixed seeds make the split and the forest reproducible across
# Restart & Run All.
# NOTE(review): a random split mixes dates between the training and
# validation parts; a time-based split would give a less optimistic
# validation error for this kind of data.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=num_of_estimators, random_state=42)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

training_time = time.time() - t
# Convert to minutes first, then round (rounding the seconds and dividing
# afterwards printed long unrounded fractions).
print("The training time for the model is {} minutes".format(round(training_time / 60, 2)))

## Calculate Validation Error vs. Random Errors:
Create a dummy random-prediction function that uses only the mean market return:

In [None]:
def make_rand_pred(y_val, mean_return, way):
    """Draw random baseline predictions, one per element of ``y_val``.

    ``way == 1`` samples uniformly from [-1, 1); any other value samples
    from a normal distribution centred on ``mean_return`` with standard
    deviation 0.5. The result is limited to the range [-1, 1].
    Only the length of ``y_val`` is used.
    """
    n_samples = len(y_val)
    if way == 1:
        draws = 2.0 * np.random.rand(n_samples) - 1.0
    else:
        draws = np.random.normal(mean_return, 0.5, n_samples)
    return np.clip(draws, -1, 1)

In [None]:
# Two random baselines: uniform draws (way 1) and normal draws around the
# mean return (way 2).
y_val_pred_rand1 = make_rand_pred(y_val, mean_return, 1)
y_val_pred_rand2 = make_rand_pred(y_val, mean_return, 2)

# Mean absolute error of the model and of each baseline, to 3 decimals.
rf_err, rand1_err, rand2_err = (
    round(np.abs(pred - y_val).mean(), 3)
    for pred in (y_val_pred, y_val_pred_rand1, y_val_pred_rand2)
)
print("RF Mean Abs Error is {}, random errors are {} and {}".format(rf_err, rand1_err, rand2_err ))

We keep the last few weeks of the training data to seed the rolling window that is used when computing features at prediction time:

In [None]:
# Length of the history kept for the rolling features at prediction time.
delta_t = datetime.timedelta(days = 20)

# Derive the cutoff from the data instead of hard-coding a date: midnight of
# the last training day minus `delta_t`. For training data ending on
# 2016-12-30 this equals the previously hard-coded '2016-12-10', and it stays
# correct if the training range changes.
history_cutoff = market_train_df['time'].max().normalize() - delta_t

# Drop the columns that are not present in the daily prediction frames'
# feature set; drop() returns a new frame, so market_train_df is unchanged.
month_market_df = (
    market_train_df[market_train_df['time'] > history_cutoff]
    .drop(['returnsOpenNextMktres10', 'universe'], axis=1)
)
month_market_df.head(1)

## Test Set Prediction:
Let's iterate over all available dates and assets:

In [None]:
# Iterable of prediction days; each item is unpacked in the loop below as
# (market data, news data, predictions template) for one day.
days = env.get_prediction_days()

In [None]:
# Accumulated wall-clock time (seconds) spent in each stage of the loop.
prep_time = 0
prediction_time = 0
packaging_time = 0
n_days = 0

for (daily_market_df, daily_news_df, predictions_template_df) in days:

    # Current date. .iloc[0] is positional, so it does not depend on the
    # daily frame's index happening to contain the label 0.
    cur_date = daily_market_df.time.dt.date.iloc[0]

    n_days += 1
    if n_days % 20 == 0:
        print("{} days".format(n_days))
        print("Current Preparation time: {} seconds".format(round(prep_time, 0)))
        print("Current Prediction time: {} seconds".format(round(prediction_time, 0)))
        print("Current Packaging time: {} seconds".format(round(packaging_time, 0)))

    t = time.time()
    # Append today's prices to the rolling history and trim it to the last
    # `delta_t` days.
    month_market_df = pd.concat([month_market_df, daily_market_df], sort = False, ignore_index = True)
    month_market_df = month_market_df[month_market_df.time.dt.date > cur_date - delta_t].copy()

    # Compute the features over the whole history, then keep only today's rows.
    out_market_df = prepare_market_data(month_market_df, feature_cols, 'test', rolling_flag)
    out_market_df = out_market_df[out_market_df.time.dt.date == cur_date].copy()
    prep_time += time.time() - t

    t = time.time()
    X_test = out_market_df[feature_cols]
    y_test_code = out_market_df['assetCode'].values
    if len(X_test) > 0:
        # Limit the predictions to the range [-1, 1] before submitting.
        y_test_pred = np.clip(model.predict(X_test), -1, 1)
    else:
        # No asset has complete features today; predict() would raise on an
        # empty matrix, so submit the template with all-zero confidence.
        y_test_pred = np.empty(0, dtype=float)
    prediction_time += time.time() - t

    t = time.time()
    pred_df = pd.DataFrame({'assetCode': y_test_code, 'value': y_test_pred}, columns = ['assetCode', 'value'])
    # Left-merge onto the template so every requested asset gets a row;
    # assets without a prediction are filled with 0.
    merged_df = predictions_template_df.merge(pred_df, on = 'assetCode', how='left')
    submission_df = (
        merged_df
        .drop('confidenceValue', axis = 1)
        .fillna(0)
        .rename(columns = {'value':'confidenceValue'})
    )
    env.predict(submission_df)
    packaging_time += time.time() - t


print('Done!')
# Convert to minutes first, then round.
print("Total Preparation time: {} minutes".format(round(prep_time / 60, 2)))
print("Total Prediction time: {} minutes".format(round(prediction_time / 60, 2)))
print("Total Packaging time: {} minutes".format(round(packaging_time / 60, 2)))

In [None]:
# Ask the competition environment to write the submission file.
env.write_submission_file()
# List the entries of the working directory whose name contains '.csv'.
print([entry for entry in os.listdir('.') if '.csv' in entry])