In [None]:
import numpy as np
import pandas as pd
from kaggle.competitions import twosigmanews
from tqdm import tqdm

# Create the competition environment, then unpack the two training frames it returns
env = twosigmanews.make_env()
training_frames = env.get_training_data()
market_train_df, news_train_df = training_frames

Let's take a look at what the data looks like.

In [None]:
# Preview the first rows of the market training frame
market_train_df.head()

In [None]:
# Preview the first rows of the news training frame
news_train_df.head()

I tried to encode the subjects as categorical data, but it took too long: encoding everything takes around 1 hour in total. Hence, I decided that it is not worth it.

In [None]:
# # Get set of subjects
# subjectSet = set()
# for row in tqdm(news_train_df['subjects']):
#     myset = eval(row)
#     subjectSet = subjectSet.union(myset)

In [None]:
# # Convert subjects to columns
# for subject in tqdm(subjectSet):
#     news_train_df[subject] = news_train_df["subjects"].str.contains(subject)

I would like to join the news and market data into one table for easy training. First, I create a date column in both the market and the news data to use as a join key.

In [None]:
# Give both frames a `date` column taken from the date part of their `time` timestamps
for frame in (news_train_df, market_train_df):
    frame['date'] = frame['time'].dt.date

After studying the news and market data, I noticed that there can be multiple news articles for the same asset on the same day. This results in multiple rows for the same asset, so we need to aggregate them somehow. I can think of three ways to do it for now.

1) Just take the mean of all the numerical data, combine the non-numerical columns, and keep a column that counts how many news articles we have.
2) Aggregate only the rows with news articles we deem significant.
3) Use weighted means of the numerical data, where the weights are the relevance values.

I have decided to go with weighted means for now. And I think I should aggregate before merging the two dataframes together. 

In [None]:
%%time

# Build a slimmer copy of the news frame that leaves out the columns listed below
unused_news_cols = [
    "time", "sourceTimestamp", "firstCreated",
    "sourceId", "headline", "takeSequence",
    "provider", "subjects", "audiences",
    "companyCount", "marketCommentary", "assetCodes",
]
news_small = news_train_df.drop(columns=unused_news_cols)

In [None]:
# Preview the slimmed-down news frame
news_small.head()

In [None]:
%%time
# Numeric news features that get scaled by the article's relevance
weighted_cols = [
    'urgency',
    'bodySize',
    'sentenceCount',
    'wordCount',
    'firstMentionSentence',
    'sentimentClass',
    'sentimentNegative',
    'sentimentNeutral',
    'sentimentPositive',
    'sentimentWordCount',
    "noveltyCount12H",
    "noveltyCount24H",
    "noveltyCount3D",
    "noveltyCount5D",
    "noveltyCount7D",
    "volumeCounts12H",
    "volumeCounts24H",
    "volumeCounts3D",
    "volumeCounts5D",
    "volumeCounts7D",
]

# Scale one column per iteration: handling every column in a single
# operation ran into a memory error
relevance_weight = news_small['relevance']
for feature in weighted_cols:
    news_small[feature] = news_small[feature].mul(relevance_weight)

In [None]:
%%time
# Collapse the relevance-weighted rows into one row per (date, assetName).
# After the sum, `relevance` holds the group's total relevance and every column
# in `weighted_cols` holds sum(value * relevance) for that group.
#
# The aggregation spec is derived from `weighted_cols` so the two cannot drift
# apart. The string "sum" selects pandas' built-in group sum; recent pandas
# releases deprecate passing NumPy reducers such as np.sum to `.agg`.
sumFunctions = {col: "sum" for col in ["relevance"] + weighted_cols}

# observed=True keeps only key combinations that actually occur when a grouping
# key is categorical; it has no effect on non-categorical keys.
# NOTE(review): the dtype of `assetName` is not visible here - confirm.
news_small = news_small.groupby(["date", "assetName"], observed=True).agg(sumFunctions)

# Divide each weighted sum by the total relevance to obtain the weighted average.
# A group whose total relevance is 0 yields NaN (0 / 0) rather than an error.
for col in weighted_cols:
    news_small[col] = news_small[col] / news_small['relevance']


In [None]:
# Join the aggregated news features onto the market rows.
# `relevance` only served as the averaging weight, so it is not carried into the
# merged frame. errors='ignore' keeps this cell re-runnable: on a second run the
# column has already been removed and a plain drop would raise KeyError.
news_small = news_small.drop(columns='relevance', errors='ignore')

# Left join: every market row is kept, and rows without a matching
# (date, assetName) entry in the news features get NaN in the news columns.
df = pd.merge(market_train_df, news_small, how='left', on=['date', 'assetName'])
df.head()