# importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score

# load data

In [None]:
from kaggle.competitions import twosigmanews
# The data for this competition cannot be downloaded and can only be read once,
# so the environment is created only when `env` is not already a notebook global.
if 'env' not in globals():
    env = twosigmanews.make_env()
    # market_train_df holds the stock (market) information,
    # news_train_df holds the news information.
    (market_train_df, news_train_df) = env.get_training_data()

# market data

In [None]:
# Preview the first rows of the market training data.
market_train_df.head()

### 市场数据说明:
每个asset都由assetCode标识（注意：单个公司可能有多个assetCode）。可以使用assetCode、assetName或time作为将市场数据与新闻数据关联（join）的键。

marketdata包含通过不同时间跨度计算的各种回报。这组marketdata中的所有回报都具有以下属性：
1. 回报总是计算为open-to-open（从一个交易日的开盘时间到另一个交易日的开盘时间）或close-to-close（从一个交易日的收盘时间到另一个交易日的收盘时间）。
2. 回报要么是raw（数据未针对任何benchmark做调整），要么是market-residualized (Mktres)（整个市场的变动已被扣除，只留下该资产自身固有的变动）。
3. 回报可以是1天和10天的。
4. 如果它们向后看，则回报被标记为'Prev'，如果向前看，则回报被标记为'Next'。

例如：returnsOpenNextMktres10 means Market-residualized open-to-open returns in the next 10 days.

## Possible data errors

In [None]:
# Sort both training frames chronologically by their 'time' column.
# sort_values() already returns a new DataFrame, so the names are rebound
# directly; the previous extra .copy() of each sorted frame (and the temporary
# *_orig names that then had to be deleted) only duplicated the data in memory.
market_train_df = market_train_df.sort_values('time')
news_train_df = news_train_df.sort_values('time')

统计数据显示，大多数数据在2009年后表现均匀（成交量增加，价格上涨等）。然而，在2009年之前，由于房地产泡沫破灭引发了2008年金融危机，数据表现不同。因此我们只保留2009年1月1日及之后的数据。

In [None]:
# Keep only the rows dated 1 January 2009 or later.
# Comparing the vectorised calendar year selects exactly the same rows as
# comparing `.dt.date` against datetime.date(2009, 1, 1), without building a
# Python date object for every row.
START_YEAR = 2009
market_train_df = market_train_df.loc[market_train_df['time'].dt.year >= START_YEAR]
news_train_df = news_train_df.loc[news_train_df['time'].dt.year >= START_YEAR]

## fill the data
我们把数据中的空数据进行填充

In [None]:
# Show the number of missing values in each column of the market data.
print('Check null data:')
market_train_df.isna().sum()

In [None]:
# Fill missing market-residualized returns with the raw return from the same row.
# The two lists are position-aligned: each Mktres column is paired with the raw
# column at the same index.
column_market = [
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]
column_raw = [
    'returnsClosePrevRaw1',
    'returnsOpenPrevRaw1',
    'returnsClosePrevRaw10',
    'returnsOpenPrevRaw10',
]
for mktres_col, raw_col in zip(column_market, column_raw):
    fallback = market_train_df[raw_col]
    market_train_df[mktres_col] = market_train_df[mktres_col].fillna(fallback)

## Delete the abnormal growth or decline data

In [None]:
# Report how many rows show a close/open move at or beyond the threshold.
# `threshold` is the fractional move treated as abnormal (0.5 == 50%); it now
# actually drives both the comparisons and the printed percentage instead of
# sitting unused next to hard-coded 1.5 / 0.5 / 50% values.
threshold = 0.5
threshold_pct = round(threshold * 100)
# Kept as a standalone Series so this reporting cell does not write a scratch
# column into market_train_df.
close_open_ratio = np.abs(market_train_df['close'] / market_train_df['open'])
n_increases = (close_open_ratio >= 1 + threshold).sum()
n_decreases = (close_open_ratio <= 1 - threshold).sum()
print('In %i lines price increases by %i%% or more in a day' % (n_increases, threshold_pct))
print('In %i lines price decreases by %i%% or more in a day' % (n_decreases, threshold_pct))

In [None]:
# Drop the rows whose close/open ratio is not strictly between 0.5 and 1.5.
ratio = np.abs(market_train_df['close'] / market_train_df['open'])
ratio_in_bounds = (ratio > 0.5) & (ratio < 1.5)
# The helper column may still be present from the reporting cell; make sure it
# is gone afterwards either way.
market_train_df = market_train_df.loc[ratio_in_bounds].drop(
    columns=['close_open_ratio'], errors='ignore'
)

经过查看数据的曲线发现， 'PGN.N', 'EBRYY.OB'两个asset的数据的returnsOpenPrevRaw1异常，大部分是0，所以删除掉

In [None]:
# Remove every row belonging to the two excluded asset codes and report how
# many rows that dropped.
# NOTE: a filter on assetName == 'Unknown' was left disabled here, so those
# rows are kept.
orig_len = len(market_train_df)
excluded_assets = ['PGN.N', 'EBRYY.OB']
is_excluded = market_train_df['assetCode'].isin(excluded_assets)
market_train_df = market_train_df[~is_excluded]
new_len = len(market_train_df)
rmv_len = orig_len - new_len
print('There were %i lines removed' % rmv_len)

删除回报变化异常的数据

In [None]:
# Keep only the rows in which every return column lies within [-2, 2].
# One combined mask replaces the former per-column loop, which re-filtered (and
# re-copied) the whole frame twice per column. Rows with a missing value in any
# of these columns are still dropped, because NaN fails both comparisons.
column_return = column_market + column_raw + ['returnsOpenNextMktres10']
orig_len = market_train_df.shape[0]
returns_in_range = (
    (market_train_df[column_return] >= -2) & (market_train_df[column_return] <= 2)
).all(axis=1)
market_train_df = market_train_df.loc[returns_in_range]
new_len = market_train_df.shape[0]
rmv_len = np.abs(orig_len - new_len)
print('There were %i lines removed' % rmv_len)

# news data

In [None]:
# Preview the first rows of the news DataFrame.
news_train_df.head()

### 新闻数据说明：
新闻数据包含关于资产的新闻文章/警报的信息，例如文章细节、观点和其他评论。新闻数据中的每一行同时处于新闻文章级别和资产级别（换句话说，该表有意未做规范化）。

In [None]:
# Report the number of rows (samples) and columns (features) of the news training data.
print(f'{news_train_df.shape[0]} samples and {news_train_df.shape[1]} features in the training news dataset.')

这个文件太大，无法直接处理全部文本，所以让我们来看看最近100万个标题的wordcloud。

In [None]:
# Word cloud of the most recent headlines (the frame is sorted by time earlier
# in the notebook, so the tail holds the latest rows).
N_HEADLINES = 1000000
# Slice first, then lower-case: only the selected headlines are processed
# instead of lower-casing the entire column and discarding most of the result.
recent_headlines = news_train_df['headline'].iloc[-N_HEADLINES:]
text = ' '.join(recent_headlines.str.lower().values)
wordcloud = WordCloud(max_font_size=None, stopwords=stop, background_color='white',
                      width=1200, height=1000).generate(text)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.title('Top words in headline')
plt.axis("off")
plt.show()

### deal with possible data errors.

In [None]:
# remove outliers.
def remove_outliers(data_frame, column_list, low=0.02, high=0.98):
    for column in column_list:
        this_column = data_frame[column]
        quant_df = this_column.quantile([low,high])
        low_limit = quant_df[low]
        high_limit = quant_df[high]
        data_frame[column] = data_frame[column].clip(lower=low_limit, upper=high_limit)
    return data_frame

In [None]:
# Numeric news columns whose extreme values get clipped to the default
# quantile range of remove_outliers.
columns_outlier = [
    'takeSequence',
    'bodySize',
    'sentenceCount',
    'wordCount',
    'sentimentWordCount',
    'firstMentionSentence',
    'noveltyCount12H',
    'noveltyCount24H',
    'noveltyCount3D',
    'noveltyCount5D',
    'noveltyCount7D',
    'volumeCounts12H',
    'volumeCounts24H',
    'volumeCounts3D',
    'volumeCounts5D',
    'volumeCounts7D',
]
print('Clipping news outliers ...')
news_train_df = remove_outliers(news_train_df, columns_outlier)

In [None]:
# Result of the data cleaning: report the frame's dimensions, then show
# summary statistics.
print(f'{news_train_df.shape[0]} samples and {news_train_df.shape[1]} features in the training news dataset.')
news_train_df.describe()