### A - Market
### B - News
### C - Combinational
### news data needs to be combined with market first to map with date and target.
### data_processing() outputs a df with combined data first.
### data_slice() then outputs df_news for B and a cleaned df for C.
### All rows with missing data are dropped after the merge, because once news is mapped onto market rows by date and assetCode, many market rows have no matching news and therefore contain missing values.
### Both classifiers' predict() returns bin_target in {0, 1}; predict_proba() returns a confidence value that can be converted to the competition confidence value with predict_proba()[:, 1]*2-1.

In [None]:
import numpy as np
import pandas as pd
import gc

from sklearn import *
import time
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import seaborn as sns
%matplotlib inline

import matplotlib as mpl
# Global matplotlib font sizes used by every plot in this notebook.
mpl.rcParams.update({
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

In [None]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# `env` is the handle used below to fetch the training data.
env = twosigmanews.make_env()

In [None]:
# Fetch the training data from the competition environment as a
# (market, news) pair of frames.
(market_train_df, news_train_df) = env.get_training_data()

## Functions for data processing.

In [None]:
### Process market data.
def market_process(market_train_df):
    """Add derived price/volume columns to the market frame and downcast it.

    The input frame is modified in place: ``time`` is reduced to a calendar
    date, ``bartrend`` (close / open), ``average`` ((close + open) / 2) and
    ``pricevolume`` (volume * close) are appended, and ``assetName`` is
    dropped.

    Returns a new frame in which every column except ``assetCode`` and
    ``time`` has been cast to float32 to save space.
    """
    frame = market_train_df  # alias only; all edits below hit the caller's frame
    close_px = frame['close']
    open_px = frame['open']

    frame['time'] = frame['time'].dt.date
    frame['bartrend'] = close_px / open_px
    frame['average'] = (close_px + open_px) / 2
    frame['pricevolume'] = frame['volume'] * close_px

    # Rows with NaNs are deliberately kept here; they are dropped after the
    # market/news merge instead.
    frame.drop(columns='assetName', inplace=True)

    # Everything that is not an identifier column becomes float32.
    identifier_cols = ('assetCode', 'time')
    dtype_map = dict.fromkeys(
        (name for name in frame.columns if name not in identifier_cols),
        'float32',
    )
    return frame.astype(dtype_map)

### process news data.
def news_process(news_train_df):
    """Clean the raw news frame in place and return it.

    Steps, all applied to the caller's frame:
      * ``time`` is reduced to a calendar date;
      * ``position`` = firstMentionSentence / sentenceCount and
        ``coverage`` = sentimentWordCount / wordCount are added;
      * free-text / timestamp columns not used for now are dropped;
      * four categorical columns are replaced by integer codes;
      * the surrounding braces and the quotes are stripped from
        ``assetCodes``, leaving a ", "-separated string of codes.
    """
    frame = news_train_df

    frame['time'] = frame['time'].dt.date
    frame['position'] = frame['firstMentionSentence'] / frame['sentenceCount']
    frame['coverage'] = frame['sentimentWordCount'] / frame['wordCount']

    unused_for_now = ['sourceTimestamp', 'firstCreated', 'subjects',
                      'audiences', 'headline', 'assetName']
    frame.drop(columns=unused_for_now, inplace=True)

    # Replace each categorical column by its integer factor codes.
    for name in ('headlineTag', 'provider', 'sourceId', 'marketCommentary'):
        codes, _ = pd.factorize(frame[name])
        frame[name] = codes

    def _strip_set_syntax(raw):
        # "{'A.N', 'A.O'}" -> "A.N, A.O": drop first/last char, then quotes.
        return raw[1:-1].replace("'", "")

    frame['assetCodes'] = frame['assetCodes'].map(_strip_set_syntax)
    return frame

## Unstack assetCodes.
def unstack_asset_codes(news_train_df):
    """Explode the ", "-separated ``assetCodes`` strings into one row per code.

    Parameters
    ----------
    news_train_df : DataFrame with an ``assetCodes`` column of strings such
        as ``"A.N, A.O"`` and an index whose labels are convertible to int.

    Returns
    -------
    DataFrame with columns ``news_index`` (the originating row's index label,
    repeated once per code) and ``assetCode`` (one code per row), in the
    original row order.
    """
    codes = []
    indexes = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for row_label, joined_codes in news_train_df['assetCodes'].items():
        explode = joined_codes.split(", ")
        codes.extend(explode)
        indexes.extend([int(row_label)] * len(explode))
    index_df = pd.DataFrame({'news_index': indexes, 'assetCode': codes})
    # Release the (potentially very large) intermediate lists right away.
    del codes, indexes
    gc.collect()
    return index_df

## Merge news on index
def merge_news_on_index(news_train_df, index_df):
    """Attach the news columns to every (news_index, assetCode) pair.

    A ``news_index`` column holding the frame's own index is added to
    ``news_train_df`` in place and used as the join key. The result has one
    row per row of ``index_df`` (left join) and no longer carries the
    ``news_index`` key or the original ``assetCodes`` string.
    """
    news_train_df['news_index'] = news_train_df.index.copy()

    # Left join keeps exactly the unstacked (index, code) pairs.
    per_asset = pd.merge(index_df, news_train_df, how='left', on='news_index')
    return per_asset.drop(columns=['news_index', 'assetCodes'])

## Combine multiple news reports for the same asset on the same day.
def group_news(news_frame):
    """Average all news rows that share the same asset and day.

    Groups ``news_frame`` by (``assetCode``, ``time``), takes the mean of
    every remaining column, names each result ``<column>_mean`` and returns a
    flat frame whose aggregated columns are float32.
    """
    key_cols = ['assetCode', 'time']
    grouped = news_frame.groupby(key_cols).agg(['mean'])

    # Flatten the (column, statistic) MultiIndex into "column_statistic".
    grouped.columns = [f"{feature}_{stat}" for feature, stat in grouped.columns]
    grouped = grouped.reset_index()

    # Set datatype to float32 for everything except the grouping keys.
    aggregated = [name for name in grouped.columns if name not in key_cols]
    return grouped.astype(dict.fromkeys(aggregated, 'float32'))

### Merge market and news data
def merge(market_train_df,news_agg_df):
    """Left-join the aggregated news onto the market rows.

    The join keys are ``time`` and ``assetCode``. Every market row is kept;
    rows without matching news get NaN in the news columns (they are not
    dropped here).
    """
    return pd.merge(
        market_train_df,
        news_agg_df,
        how='left',
        on=['time', 'assetCode'],
    )

######################################################

def data_processing(market_train_df, news_train_df):
    """Run the full preprocessing pipeline and return the merged frame.

    Market data is processed, news data is cleaned, exploded to one row per
    asset code and averaged per (assetCode, day), then the two are joined.
    Rows with any missing value are dropped from the joined frame; the
    surviving rows keep their index labels from the merge. Progress shapes
    are printed along the way.
    """
    ## Market
    market = market_process(market_train_df)
    print("Market data shape: ", market.shape)

    ## News
    cleaned_news = news_process(news_train_df)
    code_index = unstack_asset_codes(cleaned_news)
    per_asset_news = merge_news_on_index(cleaned_news, code_index)
    del cleaned_news, code_index  # drop local references to large intermediates
    daily_news = group_news(per_asset_news)
    del per_asset_news
    print('News data shape: ', daily_news.shape)

    ## Merge
    combined = merge(market, daily_news)
    print('Merged shape: ', combined.shape)

    # Keep only rows with complete market AND news information.
    combined = combined.dropna(axis=0)
    print('wo missing shape: ', combined.shape)

    gc.collect()
    return combined

### Functions for A

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn import svm

def _volume_bin_classes(volume, n_bins=20):
    """Return (class per value, bin edges) for an equal-width volume histogram."""
    values = np.asarray(volume)
    _, edges = np.histogram(values, bins=n_bins)
    # Digitizing against the interior edges reproduces np.histogram's own
    # convention: bins are half-open [edge_i, edge_i+1) and the last bin also
    # contains the right-most edge, so the maximum falls in class n_bins - 1.
    classes = np.digitize(values, edges[1:-1]).astype(float)
    return classes, edges


def analysis_get_features(market_data, byday=False, trainInfo=None):
    """Build the hand-crafted market features used by model A.

    Parameters
    ----------
    market_data : frame exposing ``open``, ``close`` and ``volume`` columns.
    byday : when False (full training-set mode) the volume bin edges computed
        here are returned as the new ``trainInfo``; when True (one-off mode)
        the ``trainInfo`` passed in is returned unchanged.
    trainInfo : opaque training information handed back in ``byday`` mode.

    Returns
    -------
    (X, trainInfo) where ``X`` is a DataFrame indexed like ``market_data``
    with integer column names:
        1 - gain: close - open
        2 - gainb: 1.0 where gain > 0, else 0.0
        3 - volumeb: index (0..19) of the equal-width volume bin, with the
            bins computed from ``market_data`` itself

    Notes
    -----
    In both modes the bins are derived from the data passed in, not from the
    training edges. The previous implementation compared with strict
    inequalities on both sides, so the maximum volume and any value lying
    exactly on a bin edge were assigned to class 0; they now get the class
    of the histogram bin that contains them.
    """
    # feature 1, 2 - gain, gainb
    gain = market_data.close - market_data.open
    gainb = np.zeros(gain.shape[0])
    gainb[np.asarray(gain > 0)] = 1

    # feature 3 - volumeb
    # TODO consider using the same bin alignment as the training data in
    # byday mode; it may be better to leave it as-is, it would be proportionate
    volumeb, ved = _volume_bin_classes(market_data.volume)

    # features to dataframe
    X = pd.DataFrame({1: gain, 2: gainb, 3: volumeb})

    if not byday:
        # save off training information
        trainInfo = ved

    return X, trainInfo

def analysis_train(features, target):
    """Fit a linear SVM classifier on three-way thresholded targets.

    ``target`` is mapped to class labels: +1 where it exceeds 1e-3, -1 where
    it is below -1e-3, and 0 otherwise. A ``svm.LinearSVC`` with default
    settings is fitted on ``features`` against those labels and returned.
    """
    # Dead-band of +/-1e-3 around zero becomes its own class (0).
    labels = np.where(target > 1e-3, 1.0,
                      np.where(target < -1e-3, -1.0, 0.0))

    # Linear SVM classifier (not a regressor).
    return svm.LinearSVC().fit(features, labels)

def analysis_predict(market_obs, predictions, trainInfo, trainedModel, toggle=True):
    """Predict with model A and store the result on ``predictions``.

    Features are built from ``market_obs`` via ``analysis_get_features``
    (``toggle`` is forwarded as its ``byday`` flag), passed to
    ``trainedModel.predict`` and the raw predicted classes are written to
    ``predictions.confidenceValue``. Nothing is returned.
    """
    # The training info handed back alongside the features is not needed here.
    features, _ = analysis_get_features(market_obs, toggle, trainInfo)

    # set
    predictions.confidenceValue = trainedModel.predict(features)

## Data processing.
### Get df_news and df.

In [None]:
# Build the merged, NaN-free market + news frame.
df = data_processing(market_train_df, news_train_df)

# Pull out the non-feature columns before they are dropped from df.
dates = df['time']
# Regression-style target and its binary version (1 when the return is >= 0).
num_target = df['returnsOpenNextMktres10'].astype('float32')
bin_target = (df['returnsOpenNextMktres10'] >= 0).astype('int8')
universe = df['universe'].astype('int8')

#Slice out df_news for LR. Clean df for lgb.
def data_slice(df):
    """Split the merged frame into news-only features and combined features.

    ``df`` is modified in place: the target, universe flag and identifier
    columns are removed first. The first 14 remaining columns are treated as
    market features and everything after them as news features.

    Returns ``(df_news, df)`` where ``df_news`` is a separate frame holding
    all news columns (for model B) and ``df`` is the same object that was
    passed in, with a fixed list of news columns removed (for model C).
    """
    n_market_cols = 14

    # Drop columns that are not features
    non_features = ['returnsOpenNextMktres10', 'universe', 'assetCode', 'time']
    df.drop(columns=non_features, inplace=True)

    # df_news for B: every column after the market block.
    news_cols = list(df.columns)[n_market_cols:]
    df_news = df[news_cols]
    print('df_news shape: ', df_news.shape)

    # df for C: market columns plus the news columns that are not excluded.
    excluded_for_combo = [
        'takeSequence_mean', 'provider_mean', 'firstMentionSentence_mean',
        'headlineTag_mean', 'marketCommentary_mean',
        'noveltyCount12H_mean', 'noveltyCount24H_mean', 'noveltyCount3D_mean',
        'sourceId_mean',
        'noveltyCount5D_mean', 'noveltyCount7D_mean', 'urgency_mean',
        'sentimentClass_mean',
    ]
    df.drop(columns=excluded_for_combo, inplace=True)
    print('df shape: ', df.shape)

    return df_news, df

# Split the merged frame: df_news feeds model B, the trimmed df feeds models A and C.
df_news, df = data_slice(df)

In [None]:
# Preview the first rows of the news-only feature frame.
df_news.head()

In [None]:
# Preview the first rows of the combined market + news feature frame.
df.head()

## Split data for training.

In [None]:
# random sample split
# random sample split
# 75% / 25% split of df's index labels (not positions), with a fixed seed so
# the same split is reused by every model below.
train_index, test_index = model_selection.train_test_split(df.index.values, test_size=0.25, 
                                                           random_state = 11)

## A - Train LinearSVC classifier for market only.

Custom features are slow to compute, so we provide a test mode that trains and evaluates on only the first 20000 rows of each split.

In [None]:
test_mode = False

# In test mode only the first 20000 entries of each split are used;
# otherwise the complete train and test splits are used.
split_train_index = 20000 if test_mode else train_index.shape[0]
split_test_index = 20000 if test_mode else test_index.shape[0]

train_market_train_df = df.loc[train_index[:split_train_index]]
#
test_market_train_df = df.loc[test_index[:split_test_index]]
test_target_raw = bin_target[test_index[:split_test_index]]
# Same labels re-coded as -1 / +1.
test_target_class = np.where(test_target_raw > 0, 1.0, -1.0)

# train analytical
features, trainInfo = analysis_get_features(train_market_train_df)
trainedModel_A = analysis_train(
    features,
    bin_target[train_index[:split_train_index]],
)
print(trainedModel_A)

# predict
# Container that analysis_predict fills in; starts as all zeros.
predictions = pd.DataFrame(
    {'confidenceValue': np.zeros(test_market_train_df.shape[0])}
)

# get model A predictions
analysis_predict(test_market_train_df, predictions, trainInfo, trainedModel_A, False)
# save
predA = predictions['confidenceValue'].values.astype('int8')

print("SVC clf accuracy : %f" % accuracy_score(predA, test_target_raw))

## B - Train logistic regression classifier for news only.
It will not converge even with 400 iterations, so don't bother adding more.

In [None]:
def train_news_model(df_news):
    """Fit and return the news-only logistic regression (model B).

    Trains on the rows of ``df_news`` selected by the module-level
    ``train_index`` against the matching ``bin_target`` labels, printing
    progress and the elapsed fitting time.
    """
    started = time.time()
    print('Fitting Up')

    # Stochastic Average Gradient: fast
    model = LogisticRegression(solver='sag', max_iter=200, n_jobs=4)
    train_features = df_news.loc[train_index]
    train_labels = bin_target.loc[train_index]
    model.fit(train_features, train_labels)

    print('Done')
    print(f'Done, time = {time.time() - started}')
    return model

# Fit model B on the news-only features and print the fitted estimator.
trainedModel_B = train_news_model(df_news)
print(trainedModel_B)

## B - LR Evaluation

In [None]:
# Hard 0/1 class predictions of model B on the held-out rows; kept in predB
# for the final vote.
predB = trainedModel_B.predict(df_news.loc[test_index])

# Accuracy is computed from the hard predictions.
print("LR clf accuracy : %f" % \
      accuracy_score(predB,
                     bin_target.loc[test_index]))
# AUC is computed from column 1 of predict_proba (presumably the probability
# of class 1, given the 0/1 labels in bin_target).
print("LR clf AUC : %f" % \
      roc_auc_score(bin_target.loc[test_index].values,
                    trainedModel_B.predict_proba(df_news.loc[test_index])[:, 1]))

In [None]:
# Histogram of model B's confidence on the held-out rows: column 1 of
# predict_proba is rescaled from [0, 1] to [-1, 1] via p*2-1.
plt.hist(trainedModel_B.predict_proba(df_news.loc[test_index])[:, 1]*2-1, 
         bins='auto', alpha=0.3, color='darkorange')
#plt.legend(['Ground truth', 'Predicted'])
plt.xlabel("Confidence")
plt.ylabel("Count")
plt.title("predicted confidence")
plt.show()

In [None]:
# Confusion matrix of model B on the held-out rows.
# NOTE: `confusion_matrix` here is the mlxtend function (it is imported after,
# and therefore shadows, the sklearn one), hence the y_target / y_predicted
# keyword names.
cfm = confusion_matrix(y_target=np.array(bin_target.loc[test_index]), 
                       y_predicted=trainedModel_B.predict(df_news.loc[test_index]).tolist())
fig, ax = plot_confusion_matrix(conf_mat=cfm)
plt.show()

## C - Train lgb classifier for market+news combo.

In [None]:
def train_combo_model(df):
    """Fit and return the LightGBM classifier on market + news (model C).

    Trains on the rows of ``df`` selected by the module-level ``train_index``
    against the matching ``bin_target`` labels, printing progress and the
    elapsed fitting time.
    """
    ## best parameters for lgb.
    # NOTE(review): per the LightGBM docs bagging_fraction only takes effect
    # when bagging_freq is set to a non-zero value - confirm whether bagging
    # was intended here.
    params = dict(
        objective='binary',
        boosting='gbdt',
        learning_rate=0.05,
        max_depth=8,
        num_leaves=80,
        n_estimators=400,
        bagging_fraction=0.8,
        feature_fraction=0.9,
    )
    model = LGBMClassifier(**params)

    started = time.time()
    print('Fitting Up')
    train_features = df.loc[train_index]
    train_labels = bin_target.loc[train_index]
    model.fit(train_features, train_labels)
    print('Done')
    print(f'Done, time = {time.time() - started}')
    return model

# Fit model C on the combined market + news features and print the fitted estimator.
trainedModel_C = train_combo_model(df)
print(trainedModel_C)

In [None]:
# Hard 0/1 class predictions of model C on the held-out rows; kept in predC
# for the final vote.
predC = trainedModel_C.predict(df.loc[test_index])

# Accuracy is computed from the hard predictions.
print("lgb accuracy : %f" % \
      accuracy_score(predC,
                     bin_target.loc[test_index]))
# AUC is computed from column 1 of predict_proba (presumably the probability
# of class 1, given the 0/1 labels in bin_target).
print("lgb AUC : %f" % \
      roc_auc_score(bin_target.loc[test_index].values,
                    trainedModel_C.predict_proba(df.loc[test_index])[:, 1]))

## Vote with all model types

In [None]:
# lets add some weighting to C as we know its our best model
# A, B = 2 / 7 : C = 3 / 7
# Stack one row per ballot: A and B get two ballots each, C gets three,
# so C carries the most weight (A, B = 2 / 7 : C = 3 / 7).
pred = np.array(
    [predA[:split_test_index]] * 2
    + [predB[:split_test_index]] * 2
    + [predC[:split_test_index]] * 3
)
# Per sample (column): count the ballots for each class, then pick the class
# with the most ballots.
tally = np.apply_along_axis(np.bincount, 0, pred, minlength=np.max(pred) + 1)
vote = tally.argmax(axis=0)

# Ground-truth labels for the same held-out rows.
test = bin_target.loc[test_index[:split_test_index]]

print("total accuracy : %f" % accuracy_score(vote, test))

cfm = confusion_matrix(y_target=np.array(test), y_predicted=vote)
fig, ax = plot_confusion_matrix(conf_mat=cfm)
plt.show()

# The End