In [None]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): the 'seaborn' style name was deprecated in matplotlib 3.6 (renamed
# 'seaborn-v0_8') — confirm it exists in the matplotlib version this kernel runs on.
mpl.style.use('seaborn')
%matplotlib inline

In [None]:
# Global plot defaults: figure size plus font sizes for titles, axis labels,
# tick labels and legends, applied in a single update.
mpl.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 12,
})

## Loading data

In [None]:
from kaggle.competitions import twosigmanews
# make_env() can only be called once per kernel run, so keep the returned
# environment in `env`; the kernel must be restarted before calling it again.
env = twosigmanews.make_env()

In [None]:
# Unpack the training data into its market and news frames.
market_train_df, news_train_df = env.get_training_data()

## Market data EDA

In [None]:
# Preview the first three rows of the market data.
market_train_df.head(3)

In [None]:
# Dimensions of the market data as (rows, columns).
market_train_df.shape

In [None]:
# Column labels available in the market data.
market_train_df.columns

In [None]:
# Report the earliest and latest timestamps present in the market data.
market_start = market_train_df['time'].min()
market_end = market_train_df['time'].max()
print("The market data cover stock stats from: {} to {}.".format(market_start, market_end))

In [None]:
# Order the rows chronologically, then derive a calendar-date column from the
# timestamps (used as the grouping key by the daily plots).
market_train_df = market_train_df.sort_values(by='time')
market_train_df['date'] = market_train_df.time.dt.date

Trends of closing prices by quantiles

In [None]:
# inspired by https://www.kaggle.com/dmdm02/complete-eda-voting-lightgbm
# Plot selected quantiles of the daily closing price, one line per quantile.
quantile_levels = [0.05, 0.25, 0.5, 0.75, 0.95]

# The grouping does not depend on the quantile, so build it once and reuse it
# instead of re-grouping the whole frame on every iteration.
close_by_date = market_train_df.groupby('date')['close']

for level in quantile_levels:
    # Series indexed by date holding that day's `level` quantile of 'close'.
    close_quantile = close_by_date.quantile(level)
    plt.plot(close_quantile.index, close_quantile.values,
             label='%.2f quantile' % level)

plt.legend(loc='best')
plt.xlabel('Time')
plt.ylabel('Closing Price')
plt.title('Market closing price by quantile')
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

Number of assets by trading day.

In [None]:
# Count the distinct asset codes that appear on each date.
assets_by_day = market_train_df.groupby('date')['assetCode'].nunique()

# Shade the area between zero and the daily asset count.
plt.fill_between(assets_by_day.index, 0, assets_by_day.values)

plt.xlabel('Time')
plt.ylabel('# of assets')
plt.title('Number of assets by trading day')


Trading volumes by day.

In [None]:
# Sum the volume column over all rows sharing the same date.
volume_by_day = market_train_df.groupby('date')['volume'].sum()

# A bar chart (x=volume_by_day.index, y=volume_by_day.values) would work as well.
plt.plot(volume_by_day.index, volume_by_day.values)
plt.xlabel('Time')
plt.ylabel('Volume')
plt.title('Trading volumes by day')

Target: returnsOpenNextMktres10 (Market-residualized open-to-open returns in the next 10 days).


In [None]:
# Summary statistics for the target column.
market_train_df['returnsOpenNextMktres10'].describe()

In [None]:
# Histogram of a 10,000-row random subsample of the target, restricted to the
# [-1, 1] range. random_state pins the draw so the figure is the same on every
# run; without it each execution samples different rows.
target_hist = plt.hist(
    market_train_df['returnsOpenNextMktres10'].sample(n=10000, random_state=42),
    bins=1000,
    range=(-1, 1),
)

Outliers do exist, but most are in the -1 to 1 range.

## News data EDA

In [None]:
# Preview the first three rows of the news data.
news_train_df.head(3)

In [None]:
# Dimensions of the news data as (rows, columns).
news_train_df.shape

In [None]:
# Column labels available in the news data.
news_train_df.columns

In [None]:
# Report the earliest and latest timestamps present in the news data.
news_start = news_train_df['time'].min()
news_end = news_train_df['time'].max()
print("The news data cover news stats from: {} to {}.".format(news_start, news_end))

In [None]:
# Order the news rows chronologically, then derive a calendar-date column
# from the timestamps.
news_train_df = news_train_df.sort_values('time')
news_train_df['date'] = news_train_df.time.dt.date

Top 10 news providers.

In [None]:
# The ten most frequent values in the provider column, with their counts.
news_train_df['provider'].value_counts().head(10)

News count by day.

In [None]:
# Number of non-null sourceId entries per date.
count_by_day = news_train_df['sourceId'].groupby(news_train_df['date']).count()
# Optional smoothing: count_by_day = count_by_day.ewm(span=10).mean()

# One bar per date; width=1 makes adjacent days touch.
plt.bar(count_by_day.index, count_by_day.values, width=1)
plt.xlabel('Time')
plt.ylabel('Count')
plt.title('Count by day')

Top 10 most-mentioned assets in the news.

In [None]:
# Count news rows per asset name and keep the ten largest counts, biggest first.
news_per_asset = news_train_df.groupby('assetName')['sourceId'].count()
asset_name = news_per_asset.sort_values(ascending=False).head(10)

plt.barh(asset_name.index, asset_name.values, height=0.5, color='green')
# Flip the y-axis so the first (largest) entry is drawn at the top.
plt.gca().invert_yaxis()
plt.xlabel('News count')

Word and sentence counts in the news.

In [None]:
# Words per sentence for each news row.
news_train_df['sentence_word_count'] = news_train_df['wordCount'] / news_train_df['sentenceCount']

# Only values strictly below this bound are drawn in each boxplot.
upper_bound = 40

# (column, x-axis label) for each of the side-by-side panels.
panels = [
    ('wordCount', 'Word'),
    ('sentenceCount', 'Sentence'),
    ('sentence_word_count', 'Word per sentence'),
]

# Draw one boxplot per panel in a single row, instead of repeating the same
# subplot/boxplot/xlabel code for every column.
boxplots = []
for position, (column, label) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    values = news_train_df[column]
    boxplots.append(plt.boxplot(values[values < upper_bound]))
    plt.xlabel(label)

# Expose each panel's boxplot result under its own name.
word_count, sen_count, sen_word_count = boxplots


Which sentiment prevails in the news?

In [None]:
# Mean of each sentiment column over all news rows, drawn as a bar chart.
sentiment_columns = ['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']
news_train_df[sentiment_columns].mean().plot.bar()
plt.title("News sentiment chart")
plt.show()

## Restart the Kernel to run your code again
In order to combat cheating, you are only allowed to call `make_env` or iterate through `get_prediction_days` once per Kernel run.  However, while you're iterating on your model it's reasonable to try something out, change the model a bit, and try it again.  Unfortunately, if you try to simply re-run the code, or even refresh the browser page, you'll still be running on the same Kernel execution session you had been running before, and the `twosigmanews` module will still throw errors.  To get around this, you need to explicitly restart your Kernel execution session, which you can do by pressing the Restart button in the Kernel Editor's bottom Console tab:
![Restart button](https://i.imgur.com/hudu8jF.png)