Importar librerías

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
# NOTE(review): this line re-binds the name `time` to the `datetime.time`
# class, shadowing the `time` module imported above. Later cells reach the
# class through `dt.time`, so the module is no longer accessible as `time`.
from datetime import datetime, date, time, timedelta
import calendar

# Plot styling shared by every figure in the notebook.
plt.style.use('seaborn')
sns.set(font_scale=2)

import warnings 
# Silences every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
import os

Crear entorno

In [None]:
# Competition-provided module that hands out the training and test data.
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
print('Done!')

Cargar datos de training

In [None]:
# The environment returns the market frame and the news frame as a pair.
market_train_df, news_train_df = env.get_training_data()

Cargamos los datos de test para tener a mano la plantilla que, en principio, habría que rellenar para enviar las predicciones a la competición...
Nosotros no enviaremos ninguna predicción, pero vamos a basarnos en esa plantilla para calcular los rendimientos que se obtendrían aplicando el modelo

In [None]:
# `days` is an iterator; only its first element is consumed here, which yields
# the test market frame, the test news frame and the submission template.
days = env.get_prediction_days()
market_test_df, news_test_df, predictions_template_df = next(days)

Dadas las características del proyecto, de cara a poder medir la accuracy, simplemente consultamos las características de los ficheros de test... pero no vamos a usarlos en lo sucesivo, sino que vamos a coger un subconjunto de los ficheros de train, que tengan las mismas características: los datos de mercados del último día y los datos del fichero de noticias posteriores a las 22h del penúltimo día (más todos los del último día).

Para poder simular un seguimiento durante varios días en lugar del último día, cogeremos los días de la última semana para reservarlos como test.

In [None]:
# The official test frames were only loaded to inspect them; the hold-out set
# used from here on is carved out of the training frames instead.
del market_test_df, news_test_df

# Rows strictly after this instant form the simulated test set.
start = '2016-06-30 22:00:00+00:00'

market_test_df = market_train_df.loc[market_train_df['time'] > start]
news_test_df = news_train_df.loc[news_train_df['time'] > start]

# Training rows are re-indexed from 0 here, BEFORE the lower time bound is
# applied below, so the index keeps gaps after the second filter. The news
# index is used later as a join key, so the statement order matters.
market_train_df = market_train_df.loc[market_train_df['time'] <= start].reset_index(drop=True)
news_train_df = news_train_df.loc[news_train_df['time'] <= start].reset_index(drop=True)

# Lower time bound for the history kept; the news window starts a few days
# before the market window.
market_train_df = market_train_df.loc[market_train_df['time'] >= '2012-01-03 22:00:00+0000']
news_train_df = news_train_df.loc[news_train_df['time'] >= '2011-12-30 22:00:00+0000']
# Optional shorter window to make the notebook run faster:
#market_train_df = market_train_df.loc[market_train_df['time'] >= '2016-01-01 22:00:00+0000']
#news_train_df = news_train_df.loc[news_train_df['time'] >= '2015-12-31 22:00:00+0000']

# Keep only training rows whose |close / open| ratio lies strictly inside (0.5, 2).
market_train_df['close_to_open'] =  np.abs(market_train_df['close'] / market_train_df['open'])
market_train_df = market_train_df.loc[market_train_df['close_to_open'] > 0.5]
market_train_df = market_train_df.loc[market_train_df['close_to_open'] < 2]

# The test rows get the same derived column but are not filtered.
market_test_df['close_to_open'] =  np.abs(market_test_df['close'] / market_test_df['open'])

Para reproducir las condiciones reales, en las que el fichero de mercados de test no tendría las variables de los rendimientos en los siguientes 10 días y la de 'universe' (si la acción entraría en cotización), vamos a eliminar estas dos variables del fichero de test...
pero, para poder medir al final el resultado de la predicción, guardaremos estos datos en un fichero auxiliar.

In [None]:
# Keep the true targets of the hold-out rows aside so the predictions can be
# scored at the end, once these columns are removed from the test frame.
aux_columns = ['assetCode', 'time', 'returnsOpenNextMktres10', 'universe']
market_test_aux = market_test_df.loc[:, aux_columns]
market_test_aux.head(10)

In [None]:
# Remove the target and the universe flag so the hold-out frame has the same
# columns a real test frame would have.
drop_test = ['returnsOpenNextMktres10', 'universe']
market_test_df = market_test_df.drop(columns=drop_test)
market_test_df.head(10)

In [None]:
# Size of the training market frame before the test rows are appended back.
market_train_df.shape

De cara a ahorrar pasos, una vez preparado el fichero de mercados de test, volvemos a juntar los ficheros de training y test en uno sólo, para hacer las transformaciones una única vez

In [None]:
# Stack train and test market rows so every transformation is applied once.
# The test rows no longer have the two dropped columns, so those cells are
# filled with NaN in the combined frame.
market_dfs = [market_train_df, market_test_df]

market_train_df = pd.concat(market_dfs)

market_train_df.shape

Y hacemos lo mismo con los ficheros de noticias

In [None]:
# Stack train and test news rows so every transformation is applied once.
news_dfs = [news_train_df, news_test_df]

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# The training part was re-indexed after the split while the test part kept
# its original labels, so the two label sets are not guaranteed to be
# disjoint. The index is later used as the `news_index` join key, where a
# duplicated label would multiply rows in the merge.
news_train_df = pd.concat(news_dfs, ignore_index=True)

news_train_df.shape

In [None]:
# The test rows now live inside the combined frames; drop the standalone names.
del market_test_df, news_test_df

Preparamos los datos para hacer el merge que nos permitirá comenzar a ejecutar modelos

In [None]:
def preprocess_news(news_train):
    """Slim down and encode the raw news frame, modifying it in place.

    Drops six text/timestamp columns, replaces the categorical columns
    `headlineTag`, `provider` and `sourceId` with integer codes, and turns
    the `assetCodes` text (e.g. "{'A.N', 'B.N'}") into a plain
    comma-separated string ("A.N, B.N").

    Returns the same DataFrame object that was passed in.
    """
    def _strip_set_syntax(raw_codes):
        # Remove the surrounding braces, then every single quote.
        return raw_codes[1:-1].replace("'", "")

    news_train.drop(
        columns=[
            'audiences', 'subjects', 'assetName',
            'headline', 'firstCreated', 'sourceTimestamp',
        ],
        inplace=True,
    )

    # Integer-encode the categorical columns; only the codes are kept.
    for categorical in ('headlineTag', 'provider', 'sourceId'):
        news_train[categorical] = pd.factorize(news_train[categorical])[0]

    news_train['assetCodes'] = news_train['assetCodes'].map(_strip_set_syntax)
    return news_train

# Only the combined news frame is processed; the separate test-frame call is disabled.
news_train_df = preprocess_news(news_train_df)
#news_test_df = preprocess_news(news_test_df)

In [None]:
def unstack_asset_codes(news_train_df):
    """Expand the comma-separated `assetCodes` column into one row per code.

    Parameters
    ----------
    news_train_df : DataFrame
        Must have an `assetCodes` column of strings such as "A.N, B.N" and
        index labels that `int()` accepts.

    Returns
    -------
    DataFrame
        Columns `news_index` (index label of the originating news row,
        repeated once per code) and `assetCode` (a single asset code).
    """
    codes = []
    indexes = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for i, values in news_train_df['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        # Repeat the row label once for every code found in that row.
        indexes.extend([int(i)] * len(explode))
    return pd.DataFrame({'news_index': indexes, 'assetCode': codes})

# Build the (news row, asset code) lookup for the combined news frame.
index_df = unstack_asset_codes(news_train_df)
#index_df2 = unstack_asset_codes(news_test_df)

def merge_news_on_index(news_train_df, index_df):
    """Attach the news columns to every (news row, asset code) pair.

    Adds a `news_index` column holding the frame's own index to
    `news_train_df` (the caller's frame is modified), left-joins it onto
    `index_df`, and returns the joined frame without the `news_index` and
    `assetCodes` helper columns.
    """
    news_train_df['news_index'] = news_train_df.index.copy()

    # One output row per entry of index_df, carrying that news row's columns.
    joined = pd.merge(index_df, news_train_df, how='left', on='news_index')
    return joined.drop(columns=['news_index', 'assetCodes'])

# One row per (news item, asset code) with all the news columns attached.
news_unstack = merge_news_on_index(news_train_df, index_df)
#news_unstack2 = merge_news_on_index(news_test_df, index_df2)
#del news_train_df, index_df, news_test_df, index_df2
# The inputs are no longer needed once the unstacked frame exists.
del news_train_df, index_df
#gc.collect()
#news_unstack.head(3)


def group_news(news_frame):
    """Average the numeric news columns per asset and effective day.

    A news item time-stamped before 22:00 counts for its own calendar day;
    an item at or after 22:00 counts for the following day. Two columns are
    added to `news_frame` (the caller's frame is modified):

    * `date`    - the effective calendar date
    * `weekday` - the effective day of week, 0 (Monday) to 6 (Sunday)

    Parameters
    ----------
    news_frame : DataFrame
        Needs a datetime `time` column and an `assetCode` column.

    Returns
    -------
    DataFrame
        One row per (`assetCode`, `date`, `weekday`) holding the mean of
        every numeric/boolean column, cast to float32.
    """
    early = news_frame.time.dt.time < dt.time(22, 0, 0)
    calendar_day = news_frame.time.dt.date
    day_of_week = news_frame.time.dt.dayofweek

    news_frame['date'] = np.where(early, calendar_day, calendar_day + timedelta(days=1))
    # "% 7" wraps a late Sunday item (6 + 1) to Monday (0) instead of
    # producing the out-of-range weekday 7.
    news_frame['weekday'] = np.where(early, day_of_week, (day_of_week + 1) % 7)

    keys = ['assetCode', 'date', 'weekday']
    # Select the columns to average explicitly. Relying on the groupby to
    # silently skip non-numeric columns (such as `time`) no longer works on
    # recent pandas, where the float32 cast below would then fail.
    value_cols = [
        c for c in news_frame.select_dtypes(include=['number', 'bool']).columns
        if c not in keys
    ]
    gp = news_frame.groupby(keys)[value_cols].mean().reset_index()
    return gp.astype({c: 'float32' for c in value_cols})

# Daily per-asset averages of the news features.
news_agg = group_news(news_unstack)
del news_unstack

# Items published at or after 22:00 on a Sunday get weekday 6 + 1 = 7; wrap
# that to Monday (0). A single .loc assignment is used because the chained
# form `frame['weekday'][mask] = 0` may write to a temporary copy.
news_agg.loc[news_agg['weekday'] == 7, 'weekday'] = 0

Tratamiento sobre los festivos y sobre fines de semana

In [None]:
# News whose effective day is a Saturday (5) or a Sunday (6) is moved to the
# following Monday and re-labelled as weekday 0.
# .loc is used so the frame itself is updated; the chained form
# `frame['col'][mask] = ...` may write to a temporary copy.
# The statements for `news_agg2` were removed: that frame is never created
# (its construction is commented out), so they raised NameError.
on_saturday = news_agg['weekday'] == 5
on_sunday = news_agg['weekday'] == 6

news_agg.loc[on_saturday, 'date'] = news_agg.loc[on_saturday, 'date'] + timedelta(days=2)
news_agg.loc[on_sunday, 'date'] = news_agg.loc[on_sunday, 'date'] + timedelta(days=1)

news_agg.loc[on_saturday | on_sunday, 'weekday'] = 0

In [None]:
# Dates treated as non-trading days, in the order they are processed.
# NOTE(review): these look like US market holidays for 2010-2016 - verify
# against the exchange calendar. date(2015, 7, 4) is a Saturday, so after the
# weekend shift no row can carry it; the intended closure may be another day.
HOLIDAY_DATES = [
    date(2010, 1, 1), date(2012, 1, 2), date(2013, 1, 1), date(2014, 1, 1),
    date(2015, 1, 1), date(2016, 1, 1),

    date(2010, 1, 18), date(2011, 1, 17), date(2012, 1, 16), date(2013, 1, 21),
    date(2014, 1, 20), date(2015, 1, 19), date(2016, 1, 18),

    date(2010, 2, 15), date(2011, 2, 21), date(2012, 2, 20), date(2013, 2, 18),
    date(2014, 2, 17), date(2015, 2, 16), date(2016, 2, 15),

    date(2010, 4, 2), date(2011, 4, 22), date(2012, 4, 6), date(2013, 3, 29),
    date(2014, 4, 18), date(2015, 4, 3), date(2016, 3, 25),

    date(2010, 5, 31), date(2011, 5, 30), date(2012, 5, 28), date(2013, 5, 27),
    date(2014, 5, 26), date(2015, 5, 25), date(2016, 5, 30),

    date(2010, 7, 5), date(2011, 7, 4), date(2012, 7, 4), date(2013, 7, 4),
    date(2014, 7, 4), date(2015, 7, 4), date(2016, 7, 4),

    date(2010, 9, 6), date(2011, 9, 5), date(2012, 9, 3), date(2013, 9, 2),
    date(2014, 9, 1), date(2015, 9, 7), date(2016, 9, 5),

    date(2010, 11, 25), date(2011, 11, 24), date(2012, 11, 22), date(2013, 11, 28),
    date(2014, 11, 27), date(2015, 11, 26), date(2016, 11, 24),

    date(2010, 12, 24), date(2011, 12, 26), date(2012, 12, 25), date(2013, 12, 25),
    date(2014, 12, 25), date(2015, 12, 25), date(2016, 12, 26),
]

# Push news dated on each listed day to the next calendar day, advancing the
# weekday with it. A Friday entry therefore lands on a Saturday (weekday 5).
# The mask is computed once per date, before either column is changed.
# .loc is used so the frame itself is updated rather than a temporary copy.
# The statements for `news_agg2` were removed: that frame is never created
# (its construction is commented out), so they raised NameError.
for holiday in HOLIDAY_DATES:
    on_holiday = news_agg['date'] == holiday
    news_agg.loc[on_holiday, 'weekday'] = news_agg.loc[on_holiday, 'weekday'] + 1
    news_agg.loc[on_holiday, 'date'] = news_agg.loc[on_holiday, 'date'] + timedelta(days=1)

In [None]:
# Second weekend pass: rows that the holiday shift moved onto weekday 5 or 6
# are pushed on to the following Monday. Only `date` is changed here; the
# weekday value is left as it is.
# .loc is used so the frame itself is updated rather than a temporary copy.
# The statements for `news_agg2` were removed: that frame is never created
# (its construction is commented out), so they raised NameError.
on_saturday = news_agg['weekday'] == 5
on_sunday = news_agg['weekday'] == 6

news_agg.loc[on_saturday, 'date'] = news_agg.loc[on_saturday, 'date'] + timedelta(days=2)
news_agg.loc[on_sunday, 'date'] = news_agg.loc[on_sunday, 'date'] + timedelta(days=1)

In [None]:
def group_news(news_frame):
    """Collapse the news frame to one row per (`assetCode`, `date`).

    Every other column is replaced by its mean within the group and cast to
    float32. This definition replaces the earlier `group_news`, which also
    derived the `date` and `weekday` columns.
    """
    keys = ['assetCode', 'date']
    averaged = news_frame.groupby(keys).mean().reset_index()

    # Everything except the two grouping keys is stored as float32.
    value_cols = [name for name in averaged.columns if name not in keys]
    return averaged.astype(dict.fromkeys(value_cols, 'float32'))

# Re-aggregate after the date shifts: several rows of the same asset can now
# share a date, and the merge below needs one row per (assetCode, date).
news_aggr = group_news(news_agg)
del news_agg
#news_aggr2 = group_news(news_agg2)
#del news_agg2

Generamos el fichero con el que vamos a trabajar que resulta del merge de los ficheros de mercados y de noticias

In [None]:
# Join the daily news averages onto the market rows by asset and date. Market
# rows without matching news keep NaN in the news columns (left join).
market_train_df['date'] = market_train_df.time.dt.date
full_train_df = market_train_df.merge(news_aggr, how='left', on=['assetCode', 'date'])
del market_train_df, news_aggr

# The separate test-frame merge was removed: `market_test_df` was deleted
# earlier and `news_aggr2` is never created, so it raised NameError. The test
# rows are already part of the combined frame and are split out again later.
full_train_df.head(5)

## 5.2 Preparación de los datos

## pensar justificación en elección de variables... basada en el EDA... Y ver si quitar las variables calculadas y poner las que hay a pelo


Antes de nada, vamos a hacer encoding de los asset codes para poder incluirlos en nuestro modelo

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the asset codes so they can be fed to the model as a feature.
asset_encoder = LabelEncoder()
full_train_df["assetToken"] = asset_encoder.fit_transform(full_train_df["assetCode"])

A continuación, dividimos el fichero en train y test

In [None]:
# Same cut-off instant that was used to carve out the hold-out rows.
start = '2016-06-30 22:00:00+00:00'

# Rows strictly after the cut-off become the test frame; the rest stay as the
# training frame, re-indexed from 0.
full_test_df = full_train_df.loc[full_train_df['time'] > start]
full_train_df = full_train_df.loc[full_train_df['time'] <= start].reset_index(drop=True)

Eliminamos del fichero de test las columnas de los rendimientos en los siguientes 10 días y de la variable 'universe' (que, por otro lado, están rellenas por nulos, ya que antes de juntar los ficheros, habíamos eliminado esas variables del fichero de test)

In [None]:
# Remove the target and the universe flag from the test frame.
drop_test = ['returnsOpenNextMktres10', 'universe']
full_test_df = full_test_df.drop(columns=drop_test)
full_test_df.head(10)

Definimos una función para:

- eliminar variables que no consideramos relevantes
- generar variables nuevas que vemos que pueden resumir la información de las variables existentes... 

Y aplicamos la función sobre el fichero de train

In [None]:
def prepare_data(full_train_df):
    """Fill gaps, add two derived features and drop unused columns in place.

    * Missing market-residual returns (`returns*PrevMktres1/10`) are replaced
      by the raw return of the same kind and horizon.
    * `average` is the mean of `open` and `close`.
    * `pricevolume` is `volume * close`.
    * A fixed list of market and news columns is removed.

    The input frame itself is modified and returned.
    """
    residual_to_raw = (
        ('returnsClosePrevMktres1', 'returnsClosePrevRaw1'),
        ('returnsOpenPrevMktres1', 'returnsOpenPrevRaw1'),
        ('returnsClosePrevMktres10', 'returnsClosePrevRaw10'),
        ('returnsOpenPrevMktres10', 'returnsOpenPrevRaw10'),
    )
    for residual_col, raw_col in residual_to_raw:
        residual = full_train_df[residual_col]
        # Fall back to the raw return wherever the residual return is NaN.
        full_train_df[residual_col] = np.where(np.isnan(residual), full_train_df[raw_col], residual)

    # Derived features; computed before `open` and `close` are dropped.
    full_train_df['average'] = (full_train_df['close'] + full_train_df['open']) / 2
    full_train_df['pricevolume'] = full_train_df['volume'] * full_train_df['close']

    discarded = [
        'open', 'close',
        'takeSequence',
        'bodySize', 'companyCount',
        'sentenceCount', 'wordCount',
        'firstMentionSentence',
        'noveltyCount12H', 'noveltyCount24H',
        'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D',
        'weekday', 'assetName', 'time',
        'sourceId', 'urgency', 'provider', 'marketCommentary',
        'relevance', 'sentimentClass',
        'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
        'volumeCounts7D',
    ]
    full_train_df.drop(columns=discarded, inplace=True)
    return full_train_df

# prepare_data modifies the frame in place and returns it, so `cdf` is the
# same object; only the old name is removed.
cdf = prepare_data(full_train_df)    
del full_train_df

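Creamos una variable adicional, la variable "return", que es la que vamos a tomar como variable objetivo: le asignaremos un 1 cuando el rendimiento a 10 días en el futuro supere el umbral y un 0 en los demás casos. Nota: este texto mencionaba originalmente un umbral de 0.03 (percentil 75 de los rendimientos a 10 días en el futuro), pero el código de la celda siguiente usa 0 como umbral, es decir, marca con 1 los rendimientos positivos.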

In [None]:
# Binary target: 1 when the 10-day forward return is strictly positive, else 0
# (a missing return compares as False and therefore becomes 0).
is_positive_return = cdf['returnsOpenNextMktres10'] > 0
cdf['return'] = np.where(is_positive_return, 1, 0)

Dividimos en:

- training
- validation

Pero... inicialmente únicamente preparamos la variable objetivo... y la variable de rendimientos que nos servirá para calcular más adelante la bondad del modelo en validación

In [None]:
targetcols = ['return']
non_feature_cols = ['date', 'assetCode', 'universe', 'returnsOpenNextMktres10'] + targetcols
traincols = [col for col in cdf.columns if col not in non_feature_cols]

# Split by date: the first 85% of the distinct dates (in order of appearance)
# are used for training, the remaining 15% for validation.
dates = cdf['date'].unique()
n_train_dates = int(0.85 * len(dates))
train = range(len(dates))[:n_train_dates]
val = range(len(dates))[n_train_dates:]

# Row masks and NaN-filled columns are computed once and reused below.
in_train = cdf['date'].isin(dates[train])
in_val = cdf['date'].isin(dates[val])
labels = cdf[targetcols].fillna(0)
future_returns = cdf['returnsOpenNextMktres10'].fillna(0)

# Full data: binary label (Y) and the underlying 10-day return (Z).
Y0 = labels.values
Z0 = future_returns.values

# train data
Yt = labels.loc[in_train].values
Zt = future_returns.loc[in_train].values

# validation data
Yv = labels.loc[in_val].values
Zv = future_returns.loc[in_val].values

print(Y0.shape)
print(Yt.shape, Yv.shape)
print(Zt.shape, Zv.shape)

Comprobamos los tipos de las variables del fichero

In [None]:
# Column types of the modelling frame, to see which columns are numeric.
cdf.dtypes

Y hacemos la normalización de las variables numéricas para preparar los ficheros con las variables que nos van a servir para entrenar el modelo y para introducirlas como input para la predicción en validación

In [None]:
# Columns to standardise: everything except identifiers, the encoded asset
# token, the targets and the universe flag.
not_scaled = ('date', 'assetCode', 'assetToken', 'returnsOpenNextMktres10', 'return', 'universe')
numcols = [col for col in cdf.columns if col not in not_scaled]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the mean and variance from the training dates only, so that no
# statistic of the validation rows leaks into the features the model is
# trained on. The same transform is then applied to every row. The fitted
# scaler is kept in `scaler` so it can be reused on the test frame.
scaler = StandardScaler()
scaler.fit(cdf.loc[cdf['date'].isin(dates[train]), numcols])
cdf[numcols] = scaler.transform(cdf[numcols])

In [None]:
# Feature table with missing values replaced by 0, built once and sliced below.
features = cdf[traincols].fillna(0)

X0 = features.values

# train data
Xt = features.loc[cdf['date'].isin(dates[train])].values

# validation data
Xv = features.loc[cdf['date'].isin(dates[val])].values

print(Xt.shape, Xv.shape)

Utilizamos sklearn para entrenar un modelo con los datos de training

from sklearn.ensemble import GradientBoostingClassifier

# Yt has shape (n, 1); ravel() passes the 1-D label vector the estimator
# expects instead of relying on an implicit conversion.
# random_state pins the estimator's random choices so a re-run reproduces the
# same model.
clf = GradientBoostingClassifier(random_state=42).fit(Xt, Yt.ravel())

Definimos la score function, que predecirá la categoría para cada registro

In [None]:
def score_function(data, model):
    """Return `model.predict(data)`: the model's prediction for each row of `data`."""
    return model.predict(data)

Hacemos la predicción para training... medimos la precisión y comprobamos la matriz de confusión

from sklearn.metrics import accuracy_score, confusion_matrix

# Predictions on the rows the model was fitted on.
pred_trn = score_function(Xt, clf)
print('Accuracy train: ', accuracy_score(Yt, pred_trn))

# Confusion matrix built with true labels first and predictions second.
pd.DataFrame(confusion_matrix(Yt, pred_trn))

import numpy as np
# How many training rows were assigned to each predicted class.
x = pred_trn
unique, counts = np.unique(x, return_counts=True)

# Two-column table: predicted class, number of rows.
np.column_stack((unique, counts))

y, a continuación, medimos la precisión en validation

# Predictions on the held-out validation dates.
pred_val = score_function(Xv, clf)
print('Accuracy valid: ', accuracy_score(Yv, pred_val))

# Confusion matrix built with true labels first and predictions second.
pd.DataFrame(confusion_matrix(Yv, pred_val))

Usamos la función de evaluación de resultados propuesta en la competición para ver cómo de rentable hubiera resultado la inversión en el conjunto de validación

# calculation of actual metric that is used to calculate final score
prod_val = pred_val * Zv.ravel()
u_val = cdf['universe'].loc[cdf['date'].isin(dates[val])].values
x_t_i = prod_val * u_val
data = {'day' : cdf['date'].loc[cdf['date'].isin(dates[val])], 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print('Validation score', score_valid)

Ahora realizaremos la predicción sobre el conjunto de datos de test...
pero, antes, juntamos training y validation en "train" y volvemos a entrenar el modelo

In [None]:
# Training plus validation rows together form the final training set. These
# are new names for the same arrays; only the old names are removed.
Xtrain = X0
Ytrain = Y0
del X0, Y0

#from sklearn.ensemble import GradientBoostingClassifier
#clf1 = GradientBoostingClassifier().fit(Xtrain, Ytrain)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV

# Small grid: 2 learning rates x 2 losses x 2 ensemble sizes.
# Other settings noted as candidates: learning_rate=0.1, min_samples_split=500,
# min_samples_leaf=50, max_depth=8, max_features='sqrt', subsample=0.8,
# random_state=10
param_test1 = {
    'learning_rate': [0.1, 0.11],
    'loss': ['deviance', 'exponential'],
    'n_estimators': [20, 30],
}

# 5-fold search that selects the combination with the best recall.
# NOTE(review): the `iid` argument and the 'deviance' loss name depend on the
# scikit-learn version of the original kernel - confirm before running this
# on a newer release.
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_test1,
    scoring='recall',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(Xtrain, Ytrain)

gsearch1.best_params_