# Cross Validation

In [None]:
# import libraries
import gc
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import *
from xgboost import XGBClassifier
from xgboost import plot_importance
from kaggle.competitions import twosigmanews

In [None]:
# Create the competition environment and pull the two training tables
# (market data and news data) from it.
env = twosigmanews.make_env()
market_train, news_train = env.get_training_data()

In [None]:
def data_prep(market_train, news_train):
    '''
    Data preparation function.

    Averages the numeric news columns per (day, first asset code) and
    left-joins them onto the market rows of the same day and asset.
    The inputs are copied first, so the caller's frames are left untouched
    and the function can safely be called again on the same data.

    Input:
        - market_train: DataFrame with a datetime ``time`` column and an
          ``assetCode`` column.
        - news_train: DataFrame with datetime ``time``, ``sourceTimestamp``
          and ``firstCreated`` columns, and an ``assetCodes`` column holding
          the string form of a collection of asset codes.
    Output:
        - merged_train
            - rows that contain any missing value are dropped, which
              includes market rows that found no matching news
            - gains an integer ``assetCodeT`` column encoding ``assetCode``
            - the columns named in the exclusion list at the bottom are
              removed; the market ``time`` column is only excluded under its
              merge-suffixed name ``time_x``, which exists when
              ``news_train`` also has a ``time`` column
    '''
    # Copy so that the column overwrites below do not leak into the caller's
    # DataFrames (a second call would otherwise fail on `.dt`).
    market = market_train.copy()
    news = news_train.copy()

    # Market rows are keyed by calendar day; news timestamps are reduced to
    # hour-of-day features, and `firstCreated` becomes the join day.
    market['time'] = market['time'].dt.date
    news['time'] = news['time'].dt.hour
    news['sourceTimestamp'] = news['sourceTimestamp'].dt.hour
    news['firstCreated'] = news['firstCreated'].dt.date

    # NOTE(security): `eval` executes arbitrary expressions. It is kept
    # because the column is expected to hold literal collections from the
    # competition data set; do not feed untrusted data through this path.
    # Each cell is parsed once and reused for both derived columns.
    parsed_codes = news['assetCodes'].map(lambda x: eval(x))
    news['assetCodesLen'] = parsed_codes.map(len)
    # Only the first code of each collection is kept as the join key.
    news['assetCodes'] = parsed_codes.map(lambda codes: list(codes)[0])

    kcol = ['firstCreated', 'assetCodes']
    # numeric_only=True keeps the aggregation limited to numeric columns on
    # every pandas version; newer releases raise on text columns otherwise.
    news = news.groupby(kcol, as_index=False).mean(numeric_only=True)

    merged_train = pd.merge(market, news, how='left',
                            left_on=['time', 'assetCode'],
                            right_on=['firstCreated', 'assetCodes'])

    # Integer-encode the asset code in order of first appearance.
    lbl = {k: v for v, k in enumerate(merged_train['assetCode'].unique())}
    merged_train['assetCodeT'] = merged_train['assetCode'].map(lbl)

    merged_train = merged_train.dropna(axis=0)

    excluded = ['assetCodes', 'assetCodesLen', 'assetName', 'audiences',
                'headline', 'headlineTag', 'marketCommentary', 'provider',
                'sourceId', 'subjects', 'time_x', 'sourceTimestamp']
    fcol = [c for c in merged_train if c not in excluded]

    gc.collect()
    return merged_train[fcol]

In [None]:
def generate_x(df):
    '''
    Build the min-max scaled feature matrix from a prepared frame.

    Input:
        - df: DataFrame; every column except the identifier/target columns
          listed below is used as a feature.
    Output:
        - x: 2-D array with each feature column rescaled to [0, 1] using
          that column's own min and max; a column whose values are all
          identical is set to 0 instead of producing NaN.
        - x_columns: list of the feature column names, in column order.

    Note: the min/max used for scaling are computed from ``df`` itself and
    are not returned, so this exact scaling cannot be reapplied to other data.
    '''
    # Remove columns not to use to learning
    excluded = ['assetCode', 'time', 'firstCreated',
                'returnsOpenNextMktres10', 'universe']
    x_columns = [c for c in df if c not in excluded]
    x = df[x_columns].values

    # Scaling of X values
    mins = np.min(x, axis=0)
    maxs = np.max(x, axis=0)
    rng = maxs - mins

    # A constant column has zero range; dividing by it would yield NaN
    # (0 / 0), so divide by 1 there and pin the column to 0 afterwards.
    constant = rng == 0
    safe_rng = np.where(constant, 1, rng)
    x = 1 - ((maxs - x) / safe_rng)
    x[:, constant] = 0

    return x, x_columns


def data_split(x, y, r, u, index, size=0.25):
    '''
    Split five parallel sequences into a leading and a trailing part.

    No shuffling is done: the first ``round(len * (1 - size))`` items of
    each sequence form the first part and the remainder the second part.

    Input:
        - x, y, r, u, index: sliceable sequences of equal length.
        - size: fraction of the items that goes to the trailing part.
    Output:
        - 10-tuple ``(x_head, x_tail, y_head, y_tail, r_head, r_tail,
          u_head, u_tail, index_head, index_tail)``.
    Raises:
        - ValueError: if the five inputs do not all have the same length.
    '''
    length = len(x)
    if not (length == len(y) == len(r) == len(u) == len(index)):
        raise ValueError(
            'x, y, r, u and index must all have the same length, got '
            f'{length}, {len(y)}, {len(r)}, {len(u)}, {len(index)}')

    # Compute the cut position once and reuse it for every sequence.
    cut = round(length * (1 - size))
    return (
        x[:cut], x[cut:],
        y[:cut], y[cut:],
        r[:cut], r[cut:],
        u[:cut], u[cut:],
        index[:cut], index[cut:],
    )
    
def format_cv_test_train(df):
    '''
    Turn the prepared frame into train/test arrays plus a prediction template.

    Input:
        - df: DataFrame with ``returnsOpenNextMktres10``, ``universe``,
          ``firstCreated`` and ``assetCode`` columns plus the feature columns.
    Output (in this order):
        - train_x, train_y, train_r, train_u: features, boolean labels
          (return >= 0), raw returns and universe flags of the leading 75%.
        - test_x, test_y, test_r, test_u: the same for the trailing 25%.
        - x_columns: feature column names as reported by ``generate_x``.
        - test_y_template: DataFrame with ``time`` (taken from
          ``firstCreated``), ``assetCode`` and a zero ``confidenceValue``
          column, one row per test sample.
    '''
    features, x_columns = generate_x(df)
    target = df.returnsOpenNextMktres10
    labels = (target >= 0).values
    returns = target.values
    universe = df.universe.values
    keys = df[['firstCreated', 'assetCode']].values

    # Check data shape
    assert features.shape[0] == labels.shape[0] == returns.shape[0] == universe.shape[0]

    # Unshuffled head/tail split; the training part of the keys is not needed.
    (train_x, test_x,
     train_y, test_y,
     train_r, test_r,
     train_u, test_u,
     _train_keys, test_keys) = data_split(
        features, labels, returns, universe, keys, 0.25)

    test_y_template = pd.DataFrame(test_keys, columns=['time', 'assetCode'])
    test_y_template.loc[:, 'confidenceValue'] = 0

    return (train_x, train_y, train_r, train_u,
            test_x, test_y, test_r, test_u,
            x_columns, test_y_template)

In [None]:
def calculate_score(y_df, r, u=None):
    '''
    Score predictions as mean / standard deviation of the per-time sums.

    Input:
        - y_df: DataFrame with ``time`` and ``confidenceValue`` columns;
          it is copied, not modified.
        - r: returns aligned with the rows of ``y_df``.
        - u: optional per-row weights; every row is weighted 1 when omitted.
    Output:
        - mean of the per-``time`` sums of ``confidenceValue * r * u``
          divided by their standard deviation. Rows with ``r`` above 1000
          are left out of the sums.
    '''
    scored = y_df.copy()
    scored.loc[:, 'r'] = r
    scored.loc[:, 'u'] = 1 if u is None else u
    scored.loc[:, 'x'] = scored['confidenceValue'] * scored['r'] * scored['u']

    # Keep only rows whose return does not exceed 1000, then total per time.
    within_limit = scored.loc[:, 'r'] <= 1000
    per_time = scored.loc[within_limit, :].groupby('time')['x'].sum()

    return np.mean(per_time) / np.std(per_time)

In [None]:
# Merge market and news data into a single modelling frame.
train_df = data_prep(market_train, news_train)

# Derive the train/test arrays, the feature names and the prediction template.
(train_x, train_y, train_r, train_u,
 test_x, test_y, test_r, test_u,
 x_columns, test_y_template) = format_cv_test_train(train_df)

In [None]:
# Fitting
# Train the classifier on the training split; train_y is the boolean
# "return >= 0" label produced by format_cv_test_train.
xgb_up = XGBClassifier(n_jobs=4,n_estimators=200,max_depth=8,eta=0.1)
# Wall-clock timestamp taken just before fitting, used for the elapsed-time
# report printed below.
t = time.time()
print('Fitting Up')
xgb_up.fit(train_x, train_y)
print(f'Done, time = {time.time() - t}')

In [None]:
# Predict and calculate accuracy
pred_y = xgb_up.predict(test_x)
# NOTE(review): sklearn documents the argument order as (y_true, y_pred);
# the arguments are swapped here, which does not change the accuracy value.
accuracy = metrics.accuracy_score(pred_y, test_y)
print(f'Accuracy: {accuracy}')

# Convert the probability in column 1 (presumably the "True" / positive-return
# class — confirm against xgb_up.classes_) from [0, 1] to a confidence
# value in [-1, 1].
pred_proba = xgb_up.predict_proba(test_x)
test_y_template.confidenceValue = 2 * pred_proba[:,1] - 1
# Score the held-out predictions, weighting each row by its universe flag.
score = calculate_score(test_y_template, test_r, test_u)
# score = calculate_score(test_y_template, test_r, None)
print(f'Score: {score}')

In [None]:
# Visualize feature importance
# IPython magic: render figures inline in the notebook (not valid plain Python).
%matplotlib inline
plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
# One bar per feature, in the same order as x_columns.
plt.bar(range(len(xgb_up.feature_importances_)), xgb_up.feature_importances_)
# Label each bar with its feature name; the trailing ';' suppresses the
# notebook's echo of the return value.
plt.xticks(range(len(xgb_up.feature_importances_)), x_columns, rotation='vertical');