**Kernel inspired by**
https://www.kaggle.com/vasumani/simple-xgboost-with-only-few-years-data

In [None]:
import pandas as pd
import numpy as np
import gc

In [None]:
from kaggle.competitions import twosigmanews
# Build the competition environment object; later cells use it to fetch the
# training data, iterate prediction days, and write the submission file.
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()

In [None]:
# Unpack the pair returned by the environment into the market frame and the
# news frame used throughout the rest of the notebook.
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# Peek at the first rows of the raw market data.
market_train_df.head()

In [None]:
# Peek at the last rows of the raw market data.
market_train_df.tail()

In [None]:
# Missing-value count per column, a separator, then the row count of the raw
# market data.
print(
    market_train_df.isna().sum(),
    "---------------",
    market_train_df.shape[0],
    sep="\n",
)

In [None]:
# Keep only rows flagged as in-universe and dated 2012 or later, then renumber
# the index from zero.
market_train_df = market_train_df[
    (market_train_df['universe'] == 1)
    & (market_train_df['time'].dt.year >= 2012)
].reset_index(drop=True)

In [None]:
# Missing-value count per column and row count after the universe/year filter.
print(
    market_train_df.isna().sum(),
    "---------------",
    market_train_df.shape[0],
    sep="\n",
)

In [None]:
# Impute gaps in the four market-residual return features with each column's
# own mean.
#
# The fill is done on the frame (keyed by column name) and the result is
# re-bound, instead of calling `fillna(..., inplace=True)` on a selected
# column: that chained form operates on an intermediate object and does not
# update the parent frame when pandas Copy-on-Write is active.
mktres_cols = [
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]
# A Series of means indexed by column name fills only those columns.
market_train_df = market_train_df.fillna(market_train_df[mktres_cols].mean())

# Re-check missing values and row count after imputation.
print(market_train_df.isna().sum())
print("---------------")
print(market_train_df.shape[0])

In [None]:
# First rows of the market data after filtering and imputation.
market_train_df.head()

In [None]:
# Last rows of the market data after filtering and imputation.
market_train_df.tail()

In [None]:
# Peek at the first rows of the raw news data.
news_train_df.head()

In [None]:
# Peek at the last rows of the raw news data.
news_train_df.tail()

In [None]:
# Missing-value count per column, a separator, then the row count of the raw
# news data.
print(
    news_train_df.isna().sum(),
    "---------------",
    news_train_df.shape[0],
    sep="\n",
)

In [None]:
# Columns kept from the news feed: the two join keys (time, assetName)
# followed by the numeric article features. Also reused by make_predictions().
news_var = [
    'time', 'assetName',
    'bodySize', 'companyCount', 'sentenceCount', 'wordCount',
    'firstMentionSentence', 'relevance',
    'sentimentClass', 'sentimentNegative', 'sentimentNeutral',
    'sentimentPositive', 'sentimentWordCount',
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
    'noveltyCount5D', 'noveltyCount7D',
    'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
    'volumeCounts5D', 'volumeCounts7D',
]

# Restrict to those columns and to articles from 2012 onwards in one step.
news_train_df = news_train_df.loc[news_train_df.time.dt.year >= 2012, news_var]

# Calendar date of each article; used as the daily join key with market data.
news_train_df = news_train_df.assign(date=news_train_df.time.dt.date)

news_train_df.info()

In [None]:
# Collapse the news to one row per (date, assetName) by averaging the numeric
# article features.
#
# The raw `time` column is dropped first: only `date` is needed from here on,
# and if the aggregation keeps an averaged `time`, the later merge with the
# market frame (which has its own `time`) would produce `time_x`/`time_y`
# and break every downstream `train_df.time` access.
news_train_df = (
    news_train_df.drop('time', axis=1)
    .groupby(['date', 'assetName'])
    .mean()
    .reset_index()
)

news_train_df.info()

In [None]:
# Give the market rows the same calendar-date key as the aggregated news.
market_train_df['date'] = market_train_df['time'].dt.date

# Left join: every market row is kept; rows with no news that day get NaN in
# the news feature columns.
market_train_df = market_train_df.merge(
    news_train_df,
    on=['assetName', 'date'],
    how='left',
)

# From here on the combined frame is referred to as train_df.
train_df = market_train_df
train_df.head()

In [None]:
# Last rows of the merged market + news frame.
train_df.tail()

In [None]:
# Drop the names no longer needed (train_df still references the merged
# frame) and run a garbage-collection pass.
del news_train_df, market_train_df

gc.collect()

In [None]:
# Fraction of missing values per column in the merged frame, a separator,
# then the row count.
print(
    train_df.isna().sum() / train_df.shape[0],
    "-------------",
    train_df.shape[0],
    sep="\n",
)

In [None]:
# Impute the news-derived features (NaN wherever a market row had no matching
# news) with each column's own mean.
#
# The fill is done on the frame (keyed by column name) and the result is
# re-bound, instead of calling `fillna(..., inplace=True)` on a selected
# column: that chained form operates on an intermediate object and does not
# update the parent frame when pandas Copy-on-Write is active.
news_feature_cols = [
    'bodySize', 'companyCount', 'sentenceCount', 'wordCount',
    'firstMentionSentence', 'relevance',
    'sentimentClass', 'sentimentNegative', 'sentimentNeutral',
    'sentimentPositive', 'sentimentWordCount',
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D',
    'noveltyCount5D', 'noveltyCount7D',
    'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
    'volumeCounts5D', 'volumeCounts7D',
]
# A Series of means indexed by column name fills only those columns.
train_df = train_df.fillna(train_df[news_feature_cols].mean())

# Re-check the missing-value fractions and row count after imputation.
print(train_df.isna().sum() / train_df.shape[0])
print("-------------")
print(train_df.shape[0])

In [None]:
# Train/test split: calendar year 2016 is held out, every other year trains.
id_test = train_df.time.dt.year == 2016
id_train = ~id_test

# Target column and the feature columns fed to the model.
dep_var = 'returnsOpenNextMktres10'
ind_var = [
    'volume', 'close', 'open',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
    'bodySize', 'companyCount', 'wordCount',
    'firstMentionSentence', 'relevance', 'sentimentClass',
    'sentimentWordCount', 'noveltyCount7D', 'volumeCounts7D',
]

# Feature matrices.
df_train = train_df.loc[id_train, ind_var]
df_test = train_df.loc[id_test, ind_var]

print("{0} training rows and {1} testing rows".format(len(df_train), len(df_test)))

# Targets aligned with the feature matrices above.
y_train = train_df.loc[id_train, dep_var]
y_test = train_df.loc[id_test, dep_var]

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Baseline XGBoost regressor with hand-picked (untuned) hyper-parameters.
# NOTE(review): newer xgboost releases reportedly deprecate 'reg:linear' in
# favour of 'reg:squarederror' — confirm against the installed version before
# changing it.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# Fit on the non-2016 rows; the fitted estimator is the cell's output.
xg_reg.fit(df_train, y_train)

In [None]:
import matplotlib.pyplot as plt

# The default figure size is read when a figure is created, so it has to be
# set before plot_importance() draws; setting it afterwards only affects
# later figures.
plt.rcParams['figure.figsize'] = [5, 5]

# Bar chart of the model's feature importances (at most 25 features).
xgb.plot_importance(xg_reg, max_num_features=25)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt


def _rmse(actual, predicted):
    """Return the square root of the mean squared error of the two inputs."""
    return sqrt(mean_squared_error(actual, predicted))


# Predictions on both splits; pred_test is reused by the scoring cell.
pred_train = xg_reg.predict(df_train)
pred_test = xg_reg.predict(df_test)

rms_train = _rmse(y_train, pred_train)
rms_test = _rmse(y_test, pred_test)

print('Train RMSE: {0} Test RMSE: {1}'.format(rms_train,rms_test))

In [None]:
# Score the 2016 hold-out: per-day sum of universe * return * confidence, then
# mean over days divided by standard deviation over days.
pred_test_df = train_df.loc[
    id_test,
    ['time', 'assetCode', 'universe', 'returnsOpenNextMktres10'],
]
pred_test_df['dayofyear'] = pred_test_df['time'].dt.dayofyear

# Only the sign of the prediction is used: +1 for >= 0, otherwise -1.
pred_test_df['confidence'] = np.where(pred_test >= 0, 1, -1)
pred_test_df['score'] = (
    pred_test_df['universe']
    * pred_test_df['returnsOpenNextMktres10']
    * pred_test_df['confidence']
)
print(pred_test_df['confidence'].value_counts())

# All hold-out rows are from one year, so day-of-year identifies a single day.
score_1 = pred_test_df.groupby('dayofyear')['score'].sum()
score_2 = score_1.mean() / score_1.std()
print("\n Competition Score: ",np.round(score_2,4))

In [None]:
def make_predictions(market_obs_df,news_obs_df,predictions_df,ind_var=ind_var,news_var=news_var,xg_reg=xg_reg,fill_values=0):
    """Fill ``predictions_df['confidenceValue']`` with +1/-1 model signals.

    The news observations are averaged per (date, assetName), left-joined onto
    the market observations, and the ``ind_var`` columns of the result are fed
    to ``xg_reg``. A prediction >= 0 becomes +1, anything else -1.

    Parameters
    ----------
    market_obs_df : DataFrame
        Market observations; must provide ``time``, ``assetName`` and the
        market columns listed in ``ind_var``. It is not modified.
    news_obs_df : DataFrame
        News observations; must provide every column in ``news_var``. It is
        not modified.
    predictions_df : DataFrame
        Modified in place: its ``confidenceValue`` column is overwritten.
        Rows are assumed to be in the same order as ``market_obs_df`` —
        TODO confirm against the environment's contract.
    ind_var : list of str
        Feature columns passed to the model, in training order.
    news_var : list of str
        News columns to keep; must include ``time`` and ``assetName``.
    xg_reg : fitted regressor exposing ``predict``.
    fill_values : scalar, dict or Series, default 0
        Passed to ``DataFrame.fillna`` for missing feature values. The default
        keeps the original behavior (zeros). NOTE: the training cells impute
        with column means instead, so passing e.g. ``train_df[ind_var].mean()``
        makes inference consistent with training.

    Returns
    -------
    None
    """
    # Process news data on a private copy so the caller's frame is untouched.
    news_features = news_obs_df.loc[:, news_var].copy()
    news_features['date'] = news_features['time'].dt.date
    # `time` is dropped before averaging: only `date` is needed as a key, and
    # an averaged `time` would collide with the market frame's own `time`.
    news_daily = (
        news_features.drop('time', axis=1)
        .groupby(['date', 'assetName'])
        .mean()
        .reset_index()
    )

    # Merge the market and news data; assign() returns a new frame, so the
    # caller's market_obs_df does not gain a `date` column.
    market_features = market_obs_df.assign(date=market_obs_df['time'].dt.date)
    merged = pd.merge(market_features, news_daily, how='left', on=['assetName', 'date'])

    # Fill NA's only in the model's feature columns rather than across the
    # whole merged frame (which also holds non-numeric columns).
    test = merged.loc[:, ind_var].fillna(fill_values)

    # Item assignment always writes a real column; attribute assignment would
    # silently create a plain Python attribute if the column were missing.
    predictions_df['confidenceValue'] = np.where(xg_reg.predict(test) >= 0, 1, -1)

In [None]:
# Iterate over the prediction days supplied by the environment; each item is
# a (market observations, news observations, predictions template) triple.
days = env.get_prediction_days()

for day_frames in days:
    market_obs_df, news_obs_df, predictions_template_df = day_frames
    # Fill the template in place, then hand it back to the environment.
    make_predictions(market_obs_df, news_obs_df, predictions_template_df)
    env.predict(predictions_template_df)
print('Done!')

In [None]:
# Ask the competition environment to write out the submission file.
env.write_submission_file()

In [None]:
# We've got a submission file!
# List every entry of the working directory whose name contains '.csv'.
import os
print(list(filter(lambda entry: '.csv' in entry, os.listdir('.'))))