In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import matplotlib.pyplot as plt
import matplotlib
import re
from scipy import stats

matplotlib.rcParams['figure.figsize'] = (10, 5)
matplotlib.rcParams['font.size'] = 12

import random
random.seed(1)
import time

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import get_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/"))

# Any results you write to the current directory are saved as output.

In [None]:
# Competition-specific module that provides the environment object used below.
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# `env` is kept as a notebook-level global; a later cell calls env.get_training_data().
env = twosigmanews.make_env()
print('Done!')

In [None]:
# Pull the two training frames out of the competition environment.
# The next cell copies them into `market` (from mtrain) and `news` (from ntrain).
mtrain, ntrain = env.get_training_data()

In [None]:
# Work on copies so the frames returned by the environment stay untouched.
# NOTE: a later cell rebinds the name `mtrain` to a split frame, so this cell
# must run before that one (re-running it afterwards would copy the wrong data).
market = mtrain.copy()
news = ntrain.copy()

# Reduce timestamps to day resolution so market and news rows can be matched by date.
# `.dt.floor('D')` is used because 'datetime64[D, UTC]' is not a dtype that current
# pandas accepts (timezone-aware dtypes do not support a day unit).
market.time = mtrain.time.dt.floor('D')
# Each frame must take its timestamps from its own source: the news column was
# previously filled from the market frame.
news.time = ntrain.time.dt.floor('D')

print(market.shape)
print(news.shape)

In [None]:
# Preview the first rows of the market frame (rendered as the cell output).
market.head()

In [None]:
# Number of distinct asset codes present in the market frame.
print("Total asset count: {}".format(len(market['assetCode'].unique())))

In [None]:
def get_asset(df, asset=None):
    """Return the rows of ``df`` that belong to one asset, indexed by ``time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``assetCode`` and ``time`` columns.
    asset : optional
        Asset code to select. When ``None``, one of the distinct codes in
        ``df`` is drawn with the ``random`` module.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows. ``time`` remains a column and is also
        used as the index. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``asset`` is ``None`` and ``df`` has no asset codes to draw from.
    """
    code = asset
    if code is None:
        codes = df['assetCode'].unique()
        if len(codes) == 0:
            raise ValueError("cannot pick a random asset: 'assetCode' has no values")
        # randrange excludes its upper bound. random.randint(0, len(codes)) can
        # return len(codes) itself, which is one past the last valid position.
        code = codes[random.randrange(len(codes))]
    # Copy before touching the index so the assignment never targets a view of df.
    asset_rows = df[df['assetCode'] == code].copy()
    asset_rows.index = asset_rows['time']
    return asset_rows

In [None]:
# Closing-price series of one randomly drawn asset (get_asset picks one when no code is given).
plt.plot(get_asset(market)['close'])

In [None]:
def market_split(market, sample_size=100000, random_state=24):
    """Sample market rows and split them chronologically into train/val/test.

    Only the ``time`` and ``assetCode`` columns of rows with ``time > '2009'``
    are kept; the returned frames therefore act as row selectors (their index
    refers back to ``market``).

    Parameters
    ----------
    market : pandas.DataFrame
        Frame with ``time`` and ``assetCode`` columns.
    sample_size : int
        Maximum number of rows to draw. If fewer rows are available, all of
        them are used instead of raising.
    random_state : int or None
        Seed for the row sample, which is the only random step in this
        function. ``None`` gives a different sample on every call.

    Returns
    -------
    (market_train, market_val, market_test)
        Consecutive, time-ordered pieces of the sample. The test piece uses
        ``train_test_split``'s default test share; validation is the last 10%
        of the remaining rows.
    """
    candidates = market[market.time > '2009'][['time', 'assetCode']]
    # DataFrame.sample raises if asked for more rows than exist (without replacement).
    n_rows = min(sample_size, len(candidates))
    sampled = candidates.sample(n_rows, random_state=random_state)
    sampled = sampled.sort_values(by=['time'])

    # shuffle=False keeps chronological order; a random_state would be unused
    # here, which is why the seed is applied to the sampling step above instead.
    train_and_val, market_test = train_test_split(sampled, shuffle=False)
    market_train, market_val = train_test_split(train_and_val, test_size=0.1, shuffle=False)

    return market_train, market_val, market_test

In [None]:
# NOTE: from here on `mtrain` names the train split (time/assetCode rows),
# no longer the raw frame returned by the environment.
mtrain, mval, mtest = market_split(market)
print("market: ")
print(f"    train size: {len(mtrain)}")
print(f"    val size:   {len(mval)}")
print(f"    test size:  {len(mtest)}")


In [None]:
# Distinct asset codes that appear in the train split.
print(len(mtrain['assetCode'].unique()))

In [None]:
class MarketPrepro:
    """Turn raw market rows into scaled features and binary labels.

    Typical use: ``fit`` on training rows, then ``get_X`` / ``get_y`` on any rows.
    Features are an encoded asset code, calendar parts of ``time`` and the
    numeric market columns; calendar and numeric columns are standardised with
    a ``StandardScaler`` fitted in ``fit``.
    """

    # Value of ``number of distinct train asset codes + 1`` recorded by the last
    # encode_asset(..., istrain=True) call; 0 until then.
    assetcode_train_count = 0
    time_cols = ['year', 'week', 'day', 'dayofweek']
    num_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
                    'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
                    'returnsOpenPrevMktres10']
    #all features
    feat_cols = ['assetCode_encoded'] + time_cols + num_cols
    # Same list under the name that other cells of this notebook use.
    feature_cols = feat_cols

    label_cols = ['returnsOpenNextMktres10']

    def __init__(self):
        self.cats = {}
        # Asset codes in order of first appearance. Kept per instance: a list
        # defined on the class would be shared by every MarketPrepro object.
        self.assetcode_encoded = []

    def fit(self, mtrain):
        """Fit the scaler and the asset-code encoding on training rows.

        ``mtrain`` is not modified. Sets ``self.num_scaler`` and returns ``self``.
        """
        #fix/clean data (drops outlier rows, works on a copy)
        mtrain = self.fix_train(mtrain)

        #get time cols
        mtrain = self.prep_time_cols(mtrain)

        #standardize features by using z = (x - u) / s
        self.num_scaler = StandardScaler()
        self.num_scaler.fit(mtrain[self.num_cols + self.time_cols].astype(float))

        # Registers every training asset code and records the train count.
        self.encode_asset(mtrain, True)
        return self

    def fix_train(self, mtrain, max_ratio=2):
        """Return a cleaned copy of training rows.

        Rows whose ``|close / open|`` exceeds ``max_ratio`` are dropped (rows
        where the ratio is NaN fail the comparison and are dropped too), then
        ``safe_fix`` is applied to what remains.
        """
        ratio = (mtrain['close'] / mtrain['open']).abs()
        # Explicit copy so safe_fix can assign columns without touching the input.
        kept = mtrain[ratio <= max_ratio].copy()
        return self.safe_fix(kept)

    def safe_fix(self, mtrain):
        """Fill gaps and clip extremes in the numeric columns; no rows are removed.

        Modifies ``mtrain`` in place and returns it. Note that the clipping
        bounds are the 1% / 99% quantiles of the frame that is passed in.
        """
        cols = self.num_cols

        #fill na with the next available value of the same asset
        mtrain[cols] = mtrain.groupby('assetCode')[cols].bfill()
        #anything still missing (no later value for that asset) becomes 0
        mtrain[cols] = mtrain[cols].fillna(0)

        #fix outliers based on quantiles
        lower = mtrain[cols].quantile(0.01)
        upper = mtrain[cols].quantile(0.99)
        mtrain[cols] = mtrain[cols].clip(lower, upper, axis=1)

        return mtrain

    def get_X(self, mtrain):
        """Return the feature frame (columns ``feat_cols``) for the given rows.

        Requires ``fit`` to have been called (uses ``self.num_scaler``).
        The input frame is not modified.
        """
        mtrain = self.safe_fix(mtrain.copy())

        mtrain = self.prep_time_cols(mtrain)
        mtrain = self.encode_asset(mtrain, istrain=False)

        scaled_cols = self.num_cols + self.time_cols
        mtrain[scaled_cols] = self.num_scaler.transform(mtrain[scaled_cols].astype(float))

        return mtrain[self.feat_cols]

    def get_y(self, mtrain):
        """Return the label frame: 1.0 where the label column is >= 0, else 0.0."""
        y = (mtrain[self.label_cols]>=0).astype(float)
        return y

    def encode_asset(self, df, istrain):
        """Add an ``assetCode_encoded`` column to ``df`` (in place) and return ``df``.

        Each code is numbered by order of first appearance across all calls
        (starting at 1) and divided by ``assetcode_train_count + 1``. Codes not
        seen before are appended to ``self.assetcode_encoded``. With
        ``istrain=True`` the train count is refreshed from ``df`` first.
        """
        if istrain:
            self.assetcode_train_count = len(df['assetCode'].unique()) + 1

        # Dict lookup instead of list.index() per row; setdefault keeps the first
        # position should the list ever contain a repeated code.
        position = {}
        for number, code in enumerate(self.assetcode_encoded, start=1):
            position.setdefault(code, number)
        # unique() preserves order of first appearance, matching row-by-row encoding.
        for code in df['assetCode'].unique():
            if code not in position:
                self.assetcode_encoded.append(code)
                position[code] = len(self.assetcode_encoded)

        numbers = df['assetCode'].map(position).astype(float)
        df['assetCode_encoded'] = numbers / (self.assetcode_train_count + 1)
        return df

    def prep_time_cols(self, df):
        """Return a copy of ``df`` with ``year``, ``day``, ``week``, ``dayofweek`` columns."""
        #extract time cols, important for time series
        df = df.copy()
        accessor = df['time'].dt
        df['year'] = accessor.year
        df['day'] = accessor.day
        if hasattr(accessor, 'isocalendar'):
            # `.dt.week` is gone from current pandas; isocalendar() is its replacement.
            week = accessor.isocalendar().week
            # isocalendar() returns a nullable integer column; convert it to a
            # plain numpy dtype (float only when missing values force it).
            week = week.astype(float) if week.isna().any() else week.astype('int64')
        else:
            # pandas versions that predate isocalendar()
            week = accessor.week
        df['week'] = week
        df['dayofweek'] = accessor.dayofweek
        return df
    
# Only creates the preprocessor object; despite the message below, nothing is
# fitted or transformed until its fit()/get_X() methods are called.
market_prepro = MarketPrepro()
print('market preprocessed lmao')
        
    

In [None]:
class NewsPrepro():
    """Aggregate news rows per (day, asset code) and standardise the aggregates.

    ``fit`` learns a ``StandardScaler`` on aggregated training news and records
    the resulting column names in ``self.feat_cols``; ``get_X`` aggregates any
    news frame the same way and applies that scaler.
    """

    # column -> aggregations computed for every (time, assetCode) group
    news_cols_agg = {
        'urgency': ['min', 'count'],
        'takeSequence': ['max'],
        'bodySize': ['min', 'max', 'mean', 'std'],
        'wordCount': ['min', 'max', 'mean', 'std'],
        'sentenceCount': ['min', 'max', 'mean', 'std'],
        'companyCount': ['min', 'max', 'mean', 'std'],
        'marketCommentary': ['min', 'max', 'mean', 'std'],
        'relevance': ['min', 'max', 'mean', 'std'],
        'sentimentNegative': ['min', 'max', 'mean', 'std'],
        'sentimentNeutral': ['min', 'max', 'mean', 'std'],
        'sentimentPositive': ['min', 'max', 'mean', 'std'],
        'sentimentWordCount': ['min', 'max', 'mean', 'std'],
        'noveltyCount12H': ['min', 'max', 'mean', 'std'],
        'noveltyCount24H': ['min', 'max', 'mean', 'std'],
        'noveltyCount3D': ['min', 'max', 'mean', 'std'],
        'noveltyCount5D': ['min', 'max', 'mean', 'std'],
        'noveltyCount7D': ['min', 'max', 'mean', 'std'],
        'volumeCounts12H': ['min', 'max', 'mean', 'std'],
        'volumeCounts24H': ['min', 'max', 'mean', 'std'],
        'volumeCounts3D': ['min', 'max', 'mean', 'std'],
        'volumeCounts5D': ['min', 'max', 'mean', 'std'],
        'volumeCounts7D': ['min', 'max', 'mean', 'std']
            }
    news_cols_numeric = set(news_cols_agg.keys()) - set(['assetCode', 'time'])

    @property
    def feature_cols(self):
        """Alias of ``feat_cols`` (the name other cells use); available after ``fit``."""
        return self.feat_cols

    def fit(self, ntrain):
        """Fit the scaler on aggregated training news; ``ntrain`` is not modified.

        Sets ``self.numeric_scaler`` and ``self.feat_cols`` and returns ``self``.
        """
        # aggregate_news edits its argument, so hand it a copy.
        news_train_agg = self.aggregate_news(ntrain.copy()).fillna(0)

        self.numeric_scaler = StandardScaler()
        self.numeric_scaler.fit(news_train_agg)
        self.feat_cols = list(news_train_agg.columns.values)
        return self

    def get_X(self, df):
        """Return scaled aggregated news features indexed by (time, assetCode).

        Requires ``fit`` to have been called. ``df`` is left untouched, so the
        same raw frame can be passed again.
        """
        # The copy is what gets aggregated; passing `df` itself would rewrite
        # the caller's 'assetCodes' and 'time' columns.
        news_agg = self.aggregate_news(df.copy()).fillna(0)
        if not news_agg.empty:
            # Boolean aggregates (e.g. min/max of a bool column) are scaled too.
            numeric = news_agg.select_dtypes(include=['number', 'bool']).astype(float)
            news_agg[numeric.columns] = self.numeric_scaler.transform(numeric)
        return news_agg

    def aggregate_news(self, df):
        """Aggregate news rows into one row per (time, assetCode).

        ``df`` needs ``time``, ``assetCodes`` and every key of ``news_cols_agg``.
        ``assetCodes`` must still be the raw string form (quoted codes); it is
        parsed into lists here. NOTE: ``df`` is modified in place (``assetCodes``
        and ``time`` are overwritten) - pass a copy to keep the original.

        Returns a frame indexed by (time, assetCode) whose columns are named
        ``<column>_<aggregation>``.
        """
        # Imported here because the notebook's import cell does not provide it.
        from itertools import chain

        # Fix asset codes (str -> list); raw string so the regex escapes are literal.
        df['assetCodes'] = df['assetCodes'].str.findall(r"'([\w\./]+)'")

        # Leave only days in time ('datetime64[D, UTC]' is not accepted by current pandas).
        if not df.empty:
            df['time'] = df['time'].dt.floor('D')

        # Expand assetCodes: one entry per (news row, asset code) pair.
        assetCodes_expanded = list(chain(*df['assetCodes']))

        if not df.empty:
            # Repeat each row label once per code it mentions.
            assetCodes_index = df.index.repeat(df['assetCodes'].apply(len))
        else:
            assetCodes_index = df.index
        assert len(assetCodes_index) == len(assetCodes_expanded)
        df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

        # Create expanded news (will repeat every assetCodes' row)
        news_cols = ['time', 'assetCodes'] + sorted(list(self.news_cols_agg.keys()))
        df_expanded = pd.merge(df_assetCodes, df[news_cols], left_on='level_0', right_index=True,
                               suffixes=('', '_old'))

        # Aggregate numerical news features
        df_aggregated = df_expanded.groupby(['time', 'assetCode']).agg(self.news_cols_agg)

        # Flat columns: ('bodySize', 'mean') -> 'bodySize_mean'
        df_aggregated.columns = ['_'.join(col).strip() for col in df_aggregated.columns.values]

        return df_aggregated
    
# Only creates the news preprocessor object (note the name: `news_prep`);
# despite the message below, no news is processed until fit()/get_X() are called.
news_prep = NewsPrepro()
print("news preprocessed!")
    

In [None]:
class JoinedPreprocessor:
    """Combine a market preprocessor and a news preprocessor into one feature set.

    Both wrapped objects are expected to provide ``fit``, ``get_X`` and a
    ``feat_cols`` list; the market one additionally ``get_y`` and ``fix_train``.
    All methods use the objects passed to the constructor, never notebook globals.
    """

    def __init__(self, market_prepro, news_prepro):
        self.market_prepro = market_prepro
        self.news_prepro = news_prepro

    def fit(self, market_train_idx, market, news):
        """Fit both preprocessors on the training selection.

        ``market_train_idx`` is a frame whose index selects training rows of
        ``market`` and which has a ``time`` column. Returns ``self``.
        """
        market_train_df = market.loc[market_train_idx.index]
        self.market_prepro.fit(market_train_df)
        # We select news in train time interval.
        # NOTE(review): this is a join on `time` only, so each news row is
        # repeated once per training market row with the same time - confirm
        # that the duplication (and its memory cost) is intended.
        news_train_df = news.merge(market_train_idx, on=['time'])
        self.news_prepro.fit(news_train_df)
        return self

    def get_X(self, market_df, news_df):
        """Return market features joined with news features, one row per market row.

        ``market_df`` must have ``time`` and ``assetCode`` columns. Market rows
        without matching news get 0 for every news feature.
        """
        # Preprocess market X and attach the join keys (assign returns a new frame).
        market_X = self.market_prepro.get_X(market_df).assign(
            time=market_df['time'],
            assetCode=market_df['assetCode'],
        )

        #news_X will have index [time, assetCode]
        news_X = self.news_prepro.get_X(news_df)
        # Left join of the market key columns against the news index. Only
        # right_index is given for the right side: it takes precedence over a
        # right_on list, so passing both was redundant.
        X = market_X.merge(news_X, how='left', left_on=['time', 'assetCode'], right_index=True)

        # Some market data can be without news, fill nans
        X = X.fillna(0)
        # Return features market + news from joined df
        return X[self.market_prepro.feat_cols + self.news_prepro.feat_cols]

    def get_y(self, market_df):
        """Return the labels produced by the market preprocessor."""
        return self.market_prepro.get_y(market_df)

    def get_Xy(self, market_df, news_df):
        """Return ``(features, labels)`` for the given market and news frames."""
        return self.get_X(market_df, news_df), self.get_y(market_df)

    def fix_train(self, market_df, news_df):
        """
        Clean train data. Here we can remove bad rows.

        Returns ``(cleaned market frame, news_df)``; the news frame is passed through unchanged.
        """
        return self.market_prepro.fix_train(market_df), news_df

    
# Market and news preprocessor instance
prepro = JoinedPreprocessor(market_prepro, news_prep)
# Pass the train split itself (time/assetCode rows): fit() looks the full market
# rows up through its index. Handing over `market.loc[mtrain.index]` selected the
# same rows but also dragged every market column through the news merge on `time`.
prepro.fit(mtrain, market, news)
print('Preprocessor created, it is fit')

In [None]:
def get_merged_Xy(idx):
    """
    Build one frame holding the preprocessed features and labels for a selection.

    `idx` is a frame whose index picks rows of the global `market` frame and
    which has a `time` column; the global `news` frame is joined to it on `time`.
    Uses the global `prepro` and returns features and labels concatenated column-wise.
    """
    # News rows that share a `time` value with the selection (joined on time only).
    matching_news = news.merge(idx, on=['time'])
    selected_market = market.loc[idx.index]
    features, labels = prepro.get_Xy(selected_market, matching_news)
    return pd.concat([features, labels], axis=1)

# Look at statistics of preprocessed sample.
# The test split returned by market_split is named `mtest`; no `market_test_idx`
# is defined in this notebook. The sample size is capped at the split's length.
get_merged_Xy(mtest.sample(min(10000, len(mtest)))).describe()

In [None]:
class ModelFactory:
    """
    Generate different models. Actually only one of them is used in the kernel,
    this factory is for experiments when debugging.
    """
    # LSTM look back window size
    look_back=90
    # In windows size look back each look_back_step days
    look_back_step=10

    @staticmethod
    def lstm_128():
        """Build and compile a three-layer LSTM binary classifier.

        Input width is the number of market features plus the number of news
        features, read from the notebook-level `market_prepro` and `news_prep`
        objects (the latter only has `feat_cols` after it has been fitted).

        NOTE(review): `Sequential`, `LSTM` and `Dense` are not imported anywhere
        in the visible notebook; they presumably come from Keras - add the
        import to the import cell before running this.
        """
        model = Sequential()
        # Add an input layer market + news. The preprocessors expose their
        # feature lists as `feat_cols`, and the news instance is `news_prep`.
        input_size = len(market_prepro.feat_cols) + len(news_prep.feat_cols)
        # input_shape=(timesteps, input features)
        model.add(LSTM(units=128, return_sequences=True, input_shape=(None,input_size)))
        model.add(LSTM(units=64, return_sequences=True ))
        model.add(LSTM(units=32, return_sequences=False))

        # Add an output layer
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

        return model

# Build the LSTM model through the factory and print its layer summary.
model = ModelFactory.lstm_128()
model.summary()