In [None]:
import numpy as np
import pandas as pd

import plotly.offline as py
# Set up plotly's offline notebook mode ahead of the py.iplot calls further down.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from kaggle.competitions import twosigmanews

# Notebook-wide pandas settings: turn off the chained-assignment check and
# allow up to 999 columns when a frame is displayed.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)

# Handle to the competition environment; the training frames are read from it.
env = twosigmanews.make_env()

In [None]:
# Fetch the training frames from the environment: the first is the market
# frame (mt_df), the second the news frame (nt_df).
training_frames = env.get_training_data()
mt_df, nt_df = training_frames
# Preview the first market rows.
mt_df.head()

In [None]:
# Row count of the market frame, with thousands separators.
print(f"{len(mt_df):,} market samples.")

In [None]:
# Column dtypes of the market frame.
mt_df.dtypes

In [None]:
# Number of missing values in each column of the market frame.
mt_df.isnull().sum()

In [None]:
# Per-row move between open and close, stored as a new column on mt_df.
mt_df['price_diff'] = mt_df['close'] - mt_df['open']
# For every distinct `time` value: standard deviation and minimum of price_diff
# across the rows sharing that timestamp (columns come back as a two-level index).
grouped = mt_df.groupby('time').agg({'price_diff': ['std', 'min']}).reset_index()
# Mean of the per-timestamp standard deviations, reported to four decimals.
print(f"Average standard deviation of price change within a day is {grouped['price_diff']['std'].mean():.4f}.")

In [None]:
# Number of distinct asset codes in the market frame, with thousands separators.
print(f"There are {mt_df['assetCode'].nunique():,} unique assets in the training set")

In [None]:
# Count nulls in the column the message actually names (`assetCode`), not `time`.
print("There are {} missing values in the `assetCode` column".format(mt_df['assetCode'].isna().sum()))

In [None]:
# Reduce each timestamp to its calendar date, then total the volume per date.
trading_day = mt_df['time'].dt.date
volumesByTradingDay = mt_df.groupby(trading_day)['volume'].sum()

In [None]:
# One bar per trading date; bar height is the summed volume for that date.
trace1 = go.Bar(x=volumesByTradingDay.index, y=volumesByTradingDay.values)

# Figure title and axis captions.
layout = {
    "title": "Trading volumes by date",
    "xaxis": {"title": "Year"},
    "yaxis": {"title": "Volume"},
}
data = [trace1]

py.iplot({"data": data, "layout": layout}, filename='basic-line')

In [None]:
# Summary statistics of the `open` column.
mt_df['open'].describe()

In [None]:
# Summary statistics of the `returnsOpenNextMktres10` column.
mt_df['returnsOpenNextMktres10'].describe()

In [None]:
# Rows whose returnsOpenNextMktres10 lies strictly outside [-1, 1]
# (|value| > 1 is the same test as value > 1 or value < -1; nulls never match).
target_returns = mt_df['returnsOpenNextMktres10']
outliers = mt_df[target_returns.abs() > 1]
outliers['returnsOpenNextMktres10'].describe()

In [None]:
# Keep every row whose returnsOpenNextMktres10 is inside [-1, 1], bounds included.
# With strict inequalities, rows sitting exactly on -1 or 1 belonged neither to this
# frame nor to the "> 1 or < -1" outlier selection; an inclusive range makes the two
# selections exact complements over the non-null values (nulls are still excluded).
woOutliers = mt_df[mt_df['returnsOpenNextMktres10'].between(-1, 1)]
woOutliers['returnsOpenNextMktres10'].describe()

In [None]:
# Histogram of a random subsample of the outlier-free target values.
# - The sample size is capped at the number of available rows, because sampling
#   without replacement raises when n exceeds the population.
# - The seed is pinned so a Restart & Run All reproduces the same figure.
sample_size = min(10000, len(woOutliers))
trace1 = go.Histogram(
    x = woOutliers.sample(n=sample_size, random_state=42)['returnsOpenNextMktres10'].values
)

layout = dict(title = "returnsOpenNextMktres10 (random 10.000 sample; without outliers)")
data = [trace1]

py.iplot(dict(data=data, layout=layout), filename='basic-line')

In [None]:
# Preview the first rows of the news frame.
nt_df.head()

In [None]:
# Row and column counts of the news frame.
news_rows, news_cols = nt_df.shape
print(f'{news_rows} samples and {news_cols} features in the training news dataset.')

In [None]:
import matplotlib.pyplot as plt

# Words per sentence for each news row, stored as a new column on nt_df.
nt_df['sentence_word_count'] = nt_df['wordCount'] / nt_df['sentenceCount']
# Box plot limited to rows averaging fewer than 40 words per sentence.
below_forty = nt_df['sentence_word_count'] < 40
plt.boxplot(nt_df.loc[below_forty, 'sentence_word_count']);

In [None]:
# Ten most frequent headline tags (value_counts sorts descending), in thousands.
# `kind` is passed by keyword: Series.plot raises TypeError for positional
# arguments on pandas >= 1.0, while the keyword form works on older releases too.
(nt_df['headlineTag'].value_counts() / 1000)[:10].plot(kind='barh');
plt.title('headlineTag counts (thousands)');