# Ensemble Model
Here we attempt to combine a number of different modelling approaches into a single prediction.


In [None]:
from kaggle.competitions import twosigmanews
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import minmax_scale
from sklearn import svm

# You can only call make_env() once, so don't lose it!
# `env` is reused further down to fetch the prediction days, submit predictions
# and write the submission file.
env = twosigmanews.make_env()

# returns the training data DataFrames as a tuple of:
# (market data, news data)
(market_train_df, news_train_df) = env.get_training_data()

# size of total data (rows, columns) for each frame
print("Market Train Size: ", market_train_df.shape)
print("News Train Size: ", news_train_df.shape)

# we only care about the market data here
# (last expression of the cell, so the notebook displays the first rows)
market_train_df.head()

## Models

#### A - Analytical
#### B - News
#### C - Combinational

## B - Pre-Process Functions

In [None]:
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from datetime import datetime, timedelta

def preprocess_market_data(market_df):
    """Select the market columns used for training and encode `time` as an int.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market data containing at least the columns 'time' (datetime-like),
        'assetCode', 'assetName' and 'returnsOpenNextMktres10'.

    Returns
    -------
    pandas.DataFrame
        A new frame with only those four columns, where 'time' has been
        converted to an integer of the form YYYYMMDD. The input frame is left
        untouched.
    """
    market_features = ['time', 'assetCode', 'assetName', 'returnsOpenNextMktres10']
    # Take an explicit copy: assigning into a column subset of the caller's
    # frame is a chained assignment (SettingWithCopyWarning) and may either
    # leak into the caller's data or be silently lost.
    market_df = market_df[market_features].copy()

    # Day-level integer key (YYYYMMDD) so market rows can be joined to news rows.
    market_df['time'] = market_df['time'].dt.strftime("%Y%m%d").astype(int)

    return market_df

def drop_news_wo_news(news_train):
    """Remove, in place, news rows that carry no content, then drop 'headline'.

    A row is removed when any of 'headlineLen', 'sentenceCount', 'bodySize'
    or 'wordCount' equals 0. The frame passed in is modified and also
    returned.
    """
    size_columns = ("headlineLen", "sentenceCount", "bodySize", "wordCount")
    for size_col in size_columns:
        # Labels of the rows whose size measure is zero in the current frame.
        is_empty = (news_train[size_col] == 0).values
        news_train.drop(news_train.index[is_empty], inplace=True)

    # The raw headline text is not needed once its length has been recorded.
    news_train.drop(columns=["headline"], inplace=True)
    return news_train

def preprocess_news(news_train_df1):
    """Clean the news frame and derive the per-article features.

    Steps, in order:
      1. drop the columns in `drop_list` (on a new frame, not in place);
      2. add 'headlineLen' and remove content-free rows via
         `drop_news_wo_news` (which also drops 'headline');
      3. encode 'time' as an integer YYYYMMDD;
      4. add the ratio features 'position' and 'coverage';
      5. parse 'assetCodes' into a single 'assetCode' and add 'assetCodesLen';
      6. add the group statistics 'asset_sentiment_count' and
         'asset_sentence_mean';
      7. integer-encode 'headlineTag', 'provider' and 'sourceId', and add
         'headlineTagT'.

    Parameters
    ----------
    news_train_df1 : pandas.DataFrame
        Raw news data. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed news frame.
    """
    # Function-scope import keeps this block self-contained (ast is stdlib).
    import ast

    drop_list = [
        'audiences', 'subjects', 'firstCreated', 'sourceTimestamp', 'marketCommentary'
    ]
    news_train_df1 = news_train_df1.drop(drop_list, axis=1, inplace=False)
    news_train_df1['headlineLen'] = news_train_df1['headline'].apply(len)
    news_train_df1 = drop_news_wo_news(news_train_df1)

    news_train_df1['time'] = news_train_df1['time'].dt.strftime("%Y%m%d").astype(int)
    # Rows with sentenceCount == 0 or wordCount == 0 were dropped above, so
    # neither ratio divides by zero.
    news_train_df1['position'] = news_train_df1['firstMentionSentence'] / news_train_df1['sentenceCount']
    news_train_df1['coverage'] = news_train_df1['sentimentWordCount'] / news_train_df1['wordCount']

    # 'assetCodes' is parsed from its textual form. ast.literal_eval only
    # accepts Python literals, so unlike eval() it cannot execute arbitrary
    # code embedded in the data.
    # NOTE(review): this keeps "the first element" of the parsed collection;
    # if the values are set literals that order is arbitrary - confirm whether
    # a deterministic choice (e.g. sorted(...)[0]) is wanted.
    news_train_df1['assetCode'] = news_train_df1['assetCodes'].map(
        lambda codes: list(ast.literal_eval(codes))[0]
    )

    # NOTE(review): despite its name this is the length of the selected
    # 'assetCode' value, not the number of codes in 'assetCodes'.
    news_train_df1['assetCodesLen'] = news_train_df1['assetCode'].apply(len)

    news_train_df1['asset_sentiment_count'] = (
        news_train_df1.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    )
    news_train_df1['asset_sentence_mean'] = (
        news_train_df1.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    )

    # Replace the categorical columns with their integer codes.
    for col in ['headlineTag', 'provider', 'sourceId']:
        news_train_df1[col] = pd.factorize(news_train_df1[col])[0]

    # Re-number the (already encoded) headline tags by order of appearance.
    lbl = {k: v for v, k in enumerate(news_train_df1['headlineTag'].unique())}
    news_train_df1['headlineTagT'] = news_train_df1['headlineTag'].map(lbl)

    return news_train_df1

def group_news(newsdf):
    """Average the numeric news features per (time, assetCode) pair.

    Parameters
    ----------
    newsdf : pandas.DataFrame
        Processed news data with 'time' and 'assetCode' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (time, assetCode) in order of first appearance, with
        'time' and 'assetCode' as regular columns followed by the mean of
        every numeric column. Non-numeric columns are left out.
    """
    # numeric_only=True: averaging a text column is meaningless and raises a
    # TypeError on current pandas, so restrict the mean to numeric columns.
    newsgp = (
        newsdf.groupby(['time', 'assetCode'], sort=False)
        .mean(numeric_only=True)
        .reset_index()
    )
    return newsgp

def preprocess_market_test_data(market_df):
    """Select the market key columns for prediction and encode `time` as an int.

    Same as the training-time preprocessing but without the target column,
    which is not used at prediction time.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market observations with at least 'time' (datetime-like),
        'assetCode' and 'assetName'.

    Returns
    -------
    pandas.DataFrame
        A new frame with those three columns, 'time' converted to an integer
        of the form YYYYMMDD. The input frame is left untouched.
    """
    market_features = ['time', 'assetCode', 'assetName']
    # Explicit copy so the assignment below never writes through to (or is
    # silently dropped from) a view of the caller's frame.
    market_df = market_df[market_features].copy()

    market_df['time'] = market_df['time'].dt.strftime("%Y%m%d").astype(int)

    return market_df

def prepare_data(mark_df, new_df):
    """Build the prediction-time feature frame from market and news data.

    The market observations are reduced to their key columns, the news is
    processed and averaged per (time, assetCode), and the two are combined
    with a left join so every market row is kept.
    """
    market_part = preprocess_market_test_data(mark_df)
    news_part = group_news(preprocess_news(new_df))
    return market_part.merge(news_part, how='left', on=['assetCode', 'time'])

## B - Fitting Model

In [None]:
def evaluate_model(df, target, train_index, test_index, params):
    """Fit an LGBMClassifier on the train rows and score it on the test rows.

    `train_index` and `test_index` are positional row selectors (used with
    `.iloc`). Returns the log loss of the predicted class probabilities on
    the test rows, so lower is better.
    """
    classifier = LGBMClassifier(**params)
    classifier.fit(df.iloc[train_index], target.iloc[train_index])

    holdout_truth = target.iloc[test_index]
    holdout_proba = classifier.predict_proba(df.iloc[test_index])
    return log_loss(holdout_truth, holdout_proba)

In [None]:
def train_news_model(market_train_df, news_train_df):
    """Train the news-based classifier (model B).

    The market and news frames are preprocessed and left-joined on
    (assetCode, time). The target is whether 'returnsOpenNextMktres10' is
    non-negative. Five random hyper-parameter draws are scored by hold-out
    log loss, and the best draw is refitted on all rows.

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Market training data.
    news_train_df : pandas.DataFrame
        News training data.

    Returns
    -------
    LGBMClassifier
        The classifier fitted on the full joined training frame.
    """
    mkt_df = preprocess_market_data(market_train_df)
    news_df = preprocess_news(news_train_df)
    newsgp = group_news(news_df)
    # Left join: market rows without matching news keep NaN news features.
    cdf = mkt_df.merge(newsgp, how='left', on=['assetCode', 'time'])

    # Binary target: 1 when the next 10-day market-residual return is >= 0.
    bin_target = (cdf.returnsOpenNextMktres10 >= 0).astype('int8')

    # Everything except the target and the join/identity columns is a feature.
    cdf_train = cdf.drop(
        ['returnsOpenNextMktres10', 'time', 'assetCode', 'assetName'], axis=1
    ).astype('float32')

    # The merged frame carries a default 0..n-1 index, so these index values
    # double as row positions for `.iloc` inside evaluate_model.
    train_index, test_index = train_test_split(cdf_train.index.values, test_size=0.2)

    param_grid = {
        'learning_rate': [0.05, 0.02, 0.01],
        'num_leaves': [25, 38, 63],
        'n_estimators': [100, 200, 400],
        'min_child_samples': [5, 10, 20, 40, 100],
        'colsample_bytree': [0.8, 0.9, 1],
        'subsample': [0.8, 0.9, 1],
        'reg_alpha': [0.1, 0.2, 0.4, 0.6, 0.8],
        'reg_lambda': [0.1, 0.2, 0.4, 0.6, 0.8],
    }

    # Random search: keep the draw with the lowest hold-out log loss.
    best_eval_score = float('inf')
    best_params = None
    for _ in range(5):
        params = {k: np.random.choice(v) for k, v in param_grid.items()}
        score = evaluate_model(cdf_train, bin_target, train_index, test_index, params)
        # The first draw is always accepted so best_params is never left unset.
        if best_params is None or score < best_eval_score:
            best_eval_score = score
            best_params = params
        print(best_eval_score)
    print("Best evaluation logloss", best_eval_score)

    # Refit on every row with the winning hyper-parameters.
    clf = LGBMClassifier(**best_params)
    clf.fit(cdf_train, bin_target)

    return clf

## B - Evaluation Functions

In [None]:
def preprocess_market_test_data(market_df):
    """Select the market key columns for prediction and encode `time` as an int.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market observations with at least 'time' (datetime-like),
        'assetCode' and 'assetName'.

    Returns
    -------
    pandas.DataFrame
        A new frame with those three columns, 'time' converted to an integer
        of the form YYYYMMDD. The input frame is left untouched.
    """
    market_features = ['time', 'assetCode', 'assetName']
    # Explicit copy: assigning into a column subset of the caller's frame is
    # a chained assignment whose effect pandas does not guarantee.
    market_df = market_df[market_features].copy()

    market_df['time'] = market_df['time'].dt.strftime("%Y%m%d").astype(int)

    return market_df

def prepare_data(mark_df, new_df):
    """Build the prediction-time feature frame from market and news data.

    Market observations are reduced to their key columns, the news is
    processed and averaged per (time, assetCode), and both are combined with
    a left join on (assetCode, time) so every market row is kept.
    """
    market_part = preprocess_market_test_data(mark_df)
    # Disabled extra market features, kept for reference:
    #   market_part = mean_volume(market_part)
    #   market_part = process_ma(market_part)
    news_part = group_news(preprocess_news(new_df))

    return market_part.merge(news_part, how='left', on=['assetCode', 'time'])

## B - Prediction Functions

In [None]:
def news_predict(market_obs_df, news_obs_df, predictions, trainedModel):
    """Write model B's confidence values into `predictions` in place.

    Parameters
    ----------
    market_obs_df : pandas.DataFrame
        Market observations to predict for.
    news_obs_df : pandas.DataFrame
        News observations for the same period.
    predictions : pandas.DataFrame
        Frame with 'assetCode' and 'confidenceValue' columns; rows whose
        asset code appears in the prepared data get their 'confidenceValue'
        overwritten, all other rows are left as they were.
    trainedModel :
        Fitted classifier exposing `predict_proba`.

    Returns
    -------
    None
    """
    cdf_test = prepare_data(market_obs_df, news_obs_df)
    cdf_test = cdf_test[cdf_test.assetCode.isin(predictions.assetCode)]

    # Everything that is not a key/identity column is a model feature.
    non_features = ('date', 'assetCode', 'assetName', 'time')
    feats = [col for col in cdf_test.columns if col not in non_features]

    # Map the positive-class probability from [0, 1] onto [-1, 1].
    preds = trainedModel.predict_proba(cdf_test[feats])[:, 1] * 2 - 1

    # Single .loc assignment instead of predictions['confidenceValue'][mask] = ...:
    # the chained form may write to a temporary copy and leave `predictions`
    # unchanged.
    # NOTE(review): values are assigned positionally, which assumes the
    # selected rows of `predictions` are in the same order (and number) as
    # the rows of the prepared data - confirm against the callers.
    covered = predictions['assetCode'].isin(cdf_test['assetCode'])
    predictions.loc[covered, 'confidenceValue'] = preds

## A - Feature Function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def _analysis_gain_volume_features(market_data, n_bins=20):
    """Return the (gain, gain sign, volume bin) feature frame and the bin edges."""
    # feature 1, 2 - gain, gainb (1 when the asset closed above its open)
    gain = market_data.close - market_data.open
    gainb = np.zeros(gain.shape[0])
    gainb[np.asarray(gain > 0)] = 1

    # feature 3 - volumeb: index (0 .. n_bins-1) of the equal-width volume bin
    npv = np.array(market_data.volume)
    _, ved = np.histogram(npv, bins=n_bins)
    # Digitizing against the interior edges gives half-open bins
    # [edge_i, edge_i+1) with the last bin closed, matching np.histogram.
    # Values lying exactly on an edge - including the maximum volume - now
    # get their proper bin instead of falling through to class 0.
    volumeb = np.digitize(npv, ved[1:-1]).astype(float)

    # features to dataframe (columns are labelled 1, 2, 3)
    X = pd.DataFrame({1: gain, 2: gainb, 3: volumeb})
    return X, ved


def analysis_get_features(market_data, byday=False, trainInfo=None):
    """Create the analytical model's features from market data.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Market rows with 'assetCode', 'open', 'close' and 'volume' columns.
    byday : bool, default False
        False: build features for a training set and create fresh training
        information. True: build features for a later batch and extend the
        supplied `trainInfo` with any asset codes not seen before.
    trainInfo : tuple or None
        `(uid_list, uid_map, volume_bin_edges)` as returned by a previous
        call. Only used when `byday` is True; None is treated as empty.

    Returns
    -------
    X : pandas.DataFrame
        Columns 1 (close - open), 2 (1.0 if that gain is positive else 0.0)
        and 3 (volume bin index), indexed like `market_data`.
    trainInfo : tuple
        `(uid_list, uid_map, volume_bin_edges)`. In `byday` mode the bin
        edges are the ones passed in; the volume bins of the batch itself
        are computed from the batch's own volume range.
    """
    X, ved = _analysis_gain_volume_features(market_data)

    # for full training set feature creation
    if not byday:
        # assign uids 1.0 .. n to each asset, in order of first appearance
        uAssestCode = pd.unique(market_data.assetCode)
        uidList = np.arange(1, uAssestCode.shape[0] + 1, dtype=float)
        uidMap = dict(zip(uAssestCode, uidList))

        # save off training information
        return X, (uidList, uidMap, ved)

    # for one off feature creation
    if trainInfo is None:
        uidList, uidMap, train_ved = np.zeros(0), {}, None
    else:
        uidList = np.asarray(trainInfo[0], dtype=float)
        uidMap = trainInfo[1]
        train_ved = trainInfo[2]

    # Highest uid handed out so far. The map is consulted as well because it
    # is updated in place, whereas callers may keep an older uid array.
    highest = float(uidList.max()) if uidList.size else 0.0
    if uidMap:
        highest = max(highest, float(max(uidMap.values())))

    # Give every unseen asset code its own new uid. (The result of np.append
    # used to be discarded, so all new codes shared a single uid.)
    new_uids = []
    for assetCode in pd.unique(market_data.assetCode):
        if assetCode not in uidMap:
            highest += 1
            uidMap[assetCode] = highest
            new_uids.append(highest)
    if new_uids:
        uidList = np.append(uidList, new_uids)

    # TODO consider using the same bin alignment as the training data
    # it may be better to leave it as-is; it would be proportionate
    return X, (uidList, uidMap, train_ved)

In [None]:
# debug 
# subset = market_train_df.head()
# features, trainInfo = analysis_get_features(market_train_df)

# features.head()

## A - Training Function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn import svm

def analysis_train(features, target):
    """Fit the analytical model (model A) on three-way direction labels.

    The continuous target is turned into a class label: 1 above 1e-3,
    -1 below -1e-3 and 0 in between. A linear support-vector classifier is
    fitted on those labels and returned.
    """
    dead_band = 1e-3
    labels = np.where(target > dead_band, 1.0,
                      np.where(target < -dead_band, -1.0, 0.0))

    # Linear SVM classifier; other kernels/regressors were tried previously
    # (SVR with rbf or linear kernel, SVC with gamma='auto').
    model = svm.LinearSVC()
    model.fit(features, labels)
    return model

## A - Prediction Function

In [None]:
def analysis_predict(market_obs, predictions, trainInfo, trainedModel, toggle=True):
    """Write model A's predicted classes into `predictions.confidenceValue`.

    `toggle` is forwarded as the `byday` flag of `analysis_get_features`.
    The model's class predictions are used directly as confidence values
    (no rescaling). `predictions` is modified in place; nothing is returned.
    """
    feature_frame, _ = analysis_get_features(market_obs, toggle, trainInfo)
    predictions.confidenceValue = trainedModel.predict(feature_frame)

## Main Section
Let's loop through all the prediction days and submit the ensemble's predictions for each one.  The `days` generator (returned from `get_prediction_days`) will simply stop returning values once you've reached the end.

In [None]:
# break up the data for faster run times
# When True, the models are trained on the first rows of the training data only
# and evaluated on a small slice held out right after them.
test_mode = True

In [None]:
if test_mode:
    # Train on the first `split_index` rows only, to keep run times short.
    split_index = 100000
    train_market_train_df = market_train_df[0:split_index]
    train_news_train_df   = news_train_df[0:split_index]

    # Hold out the next 2000 rows for evaluation. The slice starts exactly at
    # split_index so the row right after the training rows is not skipped.
    # NOTE(review): market and news frames are cut at the same row numbers,
    # not at the same point in time - confirm this is intended.
    test_market_train_df = market_train_df[split_index:split_index + 2000]
    test_news_train_df   = news_train_df[split_index:split_index + 2000]
    test_target_raw      = test_market_train_df['returnsOpenNextMktres10']
    # Evaluation labels: 1 for a positive return, -1 otherwise.
    test_target_class    = np.ones(test_target_raw.shape[0]) * -1
    test_target_class[test_target_raw > 0] = 1
else:
    # Full run: train on all of the training data.
    train_market_train_df = market_train_df
    train_news_train_df   = news_train_df

# train analytical
features, trainInfo = analysis_get_features(train_market_train_df)
trainedModel_A      = analysis_train(features, train_market_train_df['returnsOpenNextMktres10'])
print(trainedModel_A)

# train news
trainedModel_B = train_news_model(train_market_train_df, train_news_train_df)
print(trainedModel_B)

In [None]:
if test_mode:
    from sklearn.metrics import classification_report, confusion_matrix

    # Prediction frame shaped like the competition template: one row per
    # held-out market row, confidence initialised to zero.
    predictions = pd.DataFrame({
        'assetCode': test_market_train_df['assetCode'],
        'confidenceValue': np.zeros(test_market_train_df.shape[0]),
    })

    # Model A writes its class predictions into the frame; snapshot them
    # before model B overwrites the same column.
    analysis_predict(test_market_train_df, predictions, trainInfo, trainedModel_A, False)
    predA = predictions['confidenceValue'].values.copy()

    # Model B yields continuous confidences; threshold them at their mean to
    # obtain -1 / 1 classes comparable with the target.
    news_predict(test_market_train_df, test_news_train_df, predictions, trainedModel_B)
    predB_raw = predictions['confidenceValue'].values.copy()
    predB = np.where(predB_raw > predB_raw.mean(), 1.0, -1.0)

    # Report both models against the held-out target classes.
    print('A results')
    print(confusion_matrix(test_target_class, predA))
    print(classification_report(test_target_class, predA))

    print('B results')
    print(confusion_matrix(test_target_class, predB))
    print(classification_report(test_target_class, predB))

In [None]:
# Generator of per-day (market observations, news observations, predictions
# template) tuples; it is consumed by the prediction loop below.
days = env.get_prediction_days()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("generating predictions...")

# For every prediction day: let each model fill in the template, average the
# two sets of confidence values and submit the result.
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    # get model A predictions
    analysis_predict(market_obs_df, predictions_template_df, trainInfo, trainedModel_A)
    # save a snapshot, because model B writes into the same column
    predA = predictions_template_df['confidenceValue'].values.copy()

    # get model B predictions (rows it does not cover keep model A's value)
    news_predict(market_obs_df, news_obs_df, predictions_template_df, trainedModel_B)
    # save
    predB = predictions_template_df['confidenceValue'].values.copy()

    # average predictions
    pred = (predA + predB) / 2
    # np.clip returns a new array; keep its result so the submitted values
    # are guaranteed to stay within [-1, 1].
    pred = np.clip(pred, -1, 1)
    predictions_template_df.confidenceValue = pred

    # make prediction
    env.predict(predictions_template_df)
print('Done!')

In [None]:
# Show the first rows of the template left over from the last loop iteration.
predictions_template_df.head()

## **`write_submission_file`** function

Writes your predictions to a CSV file (`submission.csv`) in the current working directory.

In [None]:
# Write the submitted predictions to submission.csv in the working directory.
env.write_submission_file()

In [None]:
# We've got a submission file!
import os
# Match on the file extension itself: a substring test would also list names
# that merely contain ".csv" somewhere (e.g. "data.csv.bak").
print([filename for filename in os.listdir('.') if filename.endswith('.csv')])

In [None]:
# lets check out that CSV file
import pandas as pd
from datetime import datetime
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Load the submission file with explicit column names.
# NOTE(review): with names= and no header= argument, a header line in the file
# (if there is one) is read as an ordinary data row - confirm the file layout.
headers = ['time', 'assetCode', 'confidenceValue']
df_in = pd.read_csv('submission.csv', names=headers)

print(df_in)

# Pick the asset found on the row labelled 3 and keep only its rows.
code = df_in['assetCode'][3]
df = df_in[df_in['assetCode'] == code]

# Confidence values of that asset against a running 1..n counter.
y = np.array(df['confidenceValue'], dtype=float)
n_points = y.shape[0]
x = np.linspace(1.0, n_points, num=n_points)

# plot
plt.plot(x, y)
plt.show()