In [1]:
import pandas as pd
import matplotlib as plt
import datetime
import time
import numpy as np

In [2]:
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

## Preprocess News Data

In [3]:
#Using 2010 to 2016 data
def process_news_data(news_data):
    print("Shape : {}".format(news_data.shape))
    news_data = news_data.drop(news_data.columns[0], axis=1)
    news_data['time'] = pd.to_datetime(news_data['time'])
    news_data['sourceTimestamp'] = pd.to_datetime(news_data['sourceTimestamp'])
    news_data['firstCreated'] = pd.to_datetime(news_data['firstCreated'])
    news_data['provider'] = news_data['provider'].astype('category')
    news_data['subjects'] = news_data['subjects'].astype('category')
    news_data['audiences'] = news_data['audiences'].astype('category')
    news_data['assetCodes'] = news_data['assetCodes'].astype('category')
    news_data['assetName'] = news_data['assetName'].astype('category')
    news_data = news_data[news_data['time'].dt.year > 2009]
    return news_data

In [4]:
df1 = pd.read_csv("train_news/news_train1.csv")
df1 = process_news_data(df1)
df2 = pd.read_csv("train_news/news_train2.csv")
df2 = process_news_data(df2)
df3 = pd.read_csv("train_news/news_train3.csv")
df3 = process_news_data(df3)
df4 = pd.read_csv("train_news/news_train4.csv")
df4 = process_news_data(df4)
df5 = pd.read_csv("train_news/news_train5.csv")
df5 = process_news_data(df5)
df6 = pd.read_csv("train_news/news_train6.csv")
df6 = process_news_data(df6)
df7 = pd.read_csv("train_news/news_train7.csv")
df7 = process_news_data(df7)
df8 = pd.read_csv("train_news/news_train8.csv")
df8 = process_news_data(df8)
df9 = pd.read_csv("train_news/news_train9.csv")
df9 = process_news_data(df9)
df10 = pd.read_csv("train_news/news_train10.csv")
df10 = process_news_data(df10)
news_data = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10])

Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (1000000, 36)
Shape : (328749, 36)


In [5]:
news_data.shape

(7003399, 35)

In [6]:
news_data = news_data[pd.notnull(news_data['headline'])]
news_data.shape

(6987566, 35)

In [7]:
## Removing late report of news
news_data['news_delay'] = news_data['time'] - news_data['sourceTimestamp']
news_data = news_data[news_data.news_delay < datetime.timedelta(days=1)]
news_data.shape

(6987562, 36)

### Preprocess Market Data 

In [8]:
df1 = pd.read_csv("train_market/market_train1.csv")
df2 = pd.read_csv("train_market/market_train2.csv")
df3 = pd.read_csv("train_market/market_train3.csv")
df4 = pd.read_csv("train_market/market_train4.csv")
df5 = pd.read_csv("train_market/market_train5.csv")
market_data = pd.concat([df1,df2,df3,df4,df5])

In [9]:
market_data.dtypes

Unnamed: 0                    int64
time                         object
assetCode                    object
assetName                    object
volume                      float64
close                       float64
open                        float64
returnsClosePrevRaw1        float64
returnsOpenPrevRaw1         float64
returnsClosePrevMktres1     float64
returnsOpenPrevMktres1      float64
returnsClosePrevRaw10       float64
returnsOpenPrevRaw10        float64
returnsClosePrevMktres10    float64
returnsOpenPrevMktres10     float64
returnsOpenNextMktres10     float64
universe                    float64
dtype: object

In [10]:
market_data = market_data.drop(market_data.columns[0], axis=1)
market_data['time'] = pd.to_datetime(market_data['time'])
market_data['assetName'] = market_data['assetName'].astype('category')

In [11]:
print("Min timestamp : {}, Max timestamp : {}, Market data shape : {}".format(market_data['time'].min(), 
                                                                              market_data['time'].max(),
                                                                              market_data.shape))

Min timestamp : 2007-02-01 22:00:00, Max timestamp : 2016-12-30 22:00:00, Market data shape : (4072955, 16)


In [12]:
market_data = market_data[market_data['time'].dt.year > 2009]
market_data.shape

(2946738, 16)

## Merging two dataframes

In [13]:
def data_prep(market_df,news_df):
    market_df['time'] = market_df.time.dt.date
    market_df['returnsOpenPrevRaw1_to_volume'] = market_df['returnsOpenPrevRaw1'] / market_df['volume']
    market_df['close_to_open'] = market_df['close'] / market_df['open']
    market_df['volume_to_mean'] = market_df['volume'] / market_df['volume'].mean()
    
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp']= news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date
    news_df['headlineLen'] = news_df['headline'].apply(lambda x: len(x))
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')
    
    #news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(eval(x)))
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: list(eval(x))[0])

    #news_df['assetCodesLen'] = news_df['assetCodes'].apply(lambda x: len(x))
    
    
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean()

    market_df = pd.merge(market_df, news_df, how='left', left_on=['time', 'assetCode'], 
                            right_on=['firstCreated', 'assetCodes'])

    lbl = {k: v for v, k in enumerate(market_df['assetCode'].unique())}
    market_df['assetCodeT'] = market_df['assetCode'].map(lbl)
    
    market_df = market_df.dropna(axis=0)
    
    return market_df


In [14]:
market_news = data_prep(market_data, news_data)

In [15]:
market_news.shape

(608595, 52)

In [16]:
market_news['time_x'] = pd.to_datetime(market_news['time_x'])

In [17]:
#market_news = market_news[market_news["assetName"].str.contains("aceboo")]

In [18]:
market_train = market_news[market_news['time_x'].dt.year < 2016]
market_test = market_news[market_news['time_x'].dt.year >= 2016]

In [19]:
market_train = market_train.sort_values(by=['assetName', 'time_x']).reset_index(drop=True)
market_test = market_test.sort_values(by=['assetName', 'time_x']).reset_index(drop=True)

In [20]:
feature_columns = [c for c in market_train if c not in ['assetCode', 'assetCodes', 'assetCodesLen', 'assetCodeT', 'volume_to_mean', 'sentence_word_count',
                'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'provider', 'returnsOpenPrevRaw1_to_volume',
                'sourceId', 'subjects', 'time', 'universe','sourceTimestamp','returnsOpenNextMktres10',
                                                        
                'close_to_open', 'time_y',
                'assetCodesLen', 'sentimentClass', 'volumeCounts24H','volumeCounts3D', 'volumeCounts5D', 'volumeCounts7D',
                'noveltyCount24H','noveltyCount3D','noveltyCount5D', 'noveltyCount7D', 'companyCount', 'asset_sentiment_count',
                'asset_sentence_mean','headlineTagT', 'time_x', 'assetName']]

# 'volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1', 
#                 'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10', 
#                 'returnsOpenPrevMktres10', 
print("SELECTED FEATURES: ", feature_columns)

SELECTED FEATURES:  ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1', 'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10', 'returnsOpenPrevMktres10', 'urgency', 'takeSequence', 'bodySize', 'sentenceCount', 'wordCount', 'firstMentionSentence', 'relevance', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive', 'sentimentWordCount', 'noveltyCount12H', 'volumeCounts12H', 'headlineLen']


In [None]:
## Approach 2
## Combining columns of previous days for data aggregation
# def prepare_data(train_df, num_samples):
#     res = pd.DataFrame()
#     for k in range(num_samples):
#         f1 = {}
#         for i in feature_columns:
#             f1.update({i : i+'_' + str(k)}) 
#         df1 = train_df.rename(columns=f1)[k:].reset_index(drop=True)
#         res = pd.concat([res, df1], axis=1)
#     return res
# tmp = prepare_data(train_df, 3)

In [138]:
## Approach 1:
X_train = market_train[feature_columns].values
X_test = market_test[feature_columns].values
## Using only those stocks for which we have more than 500 data points

#market_train = market_train.groupby("assetName").filter(lambda x: len(x) > 500)
up_train = market_train['returnsOpenNextMktres10'].map(lambda x: 0 if x<0 else 1).values
#r = market_train['returnsOpenNextMktres10'].values
Y_train = up_train
up_test = market_test['returnsOpenNextMktres10'].map(lambda x: 0 if x<0 else 1).values
Y_test = up_test

print('Number of training samples: {} , test samples : {}'.format(Y_train.shape[0], Y_test.shape[0]))

Number of training samples: 520610 , test samples : 87985


In [139]:
X_train_norm = (X_train - X_train.min(axis=0)) /X_train.ptp(axis=0)
X_test_norm = (X_test - X_test.min(axis=0)) /X_test.ptp(axis=0)

In [140]:
X_train_norm.shape

(520610, 25)

In [141]:
X_test_norm.shape

(87985, 25)

## RNN Model

In [26]:
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [27]:
import tensorflow as tf

In [129]:
X_train_norm.shape

(104122, 5, 25)

In [142]:
X_train_norm = np.reshape(X_train_norm, (int(X_train_norm.shape[0]/5), 5, X_train_norm.shape[1]))
X_test_norm = np.reshape(X_test_norm, (int(X_test_norm.shape[0]/5), 5, X_test_norm.shape[1]))

In [143]:
Y_train = np.reshape(Y_train, (int(Y_train.shape[0]/5), 5))
Y_test = np.reshape(Y_test, (int(Y_test.shape[0]/5), 5))

In [144]:
Y_train.shape

(104122, 5)

In [145]:

model = Sequential()
model.add(GRU(units=20,
              return_sequences=True,
              input_shape=(X_train_norm.shape[1], X_train_norm.shape[2])))
model.add(GRU(units=5))
#model.add(GRU(units=5))
model.add(Dense(10, activation='relu'))
# model.add(Dense(5, activation='relu'))
# model.add(Dense(2, activation='relu'))
model.add(Dense(5, activation='sigmoid'))
optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [133]:
## Neural Network

model = Sequential()
model.add(Dense(20, activation='relu', input_dim= X_test_norm.shape[1]))
#model.add(GRU(units=5))
model.add(Dense(20, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [146]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_35 (GRU)                 (None, 5, 20)             2760      
_________________________________________________________________
gru_36 (GRU)                 (None, 5)                 390       
_________________________________________________________________
dense_95 (Dense)             (None, 10)                60        
_________________________________________________________________
dense_96 (Dense)             (None, 5)                 55        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [147]:
%%time
model.fit(X_train_norm, Y_train,
          validation_split=0.1, epochs=30, batch_size=256, shuffle=False)

Train on 93709 samples, validate on 10413 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Wall time: 2min 17s


<tensorflow.python.keras.callbacks.History at 0x1bdbd123208>

In [148]:
result = model.evaluate(X_test_norm, Y_test)



In [149]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 58.22%


In [165]:
result

[0.6896394010415814, 0.5821901573751124]

In [150]:
pred= model.predict(X_test_norm)

In [151]:
pred = 

array([[0.20065825, 0.16693737, 0.2765075 , 0.47080803, 0.43391868],
       [0.82050043, 0.80740416, 0.6318036 , 0.52233875, 0.58121395],
       [0.64143467, 0.58411163, 0.7064582 , 0.69228345, 0.5411065 ],
       ...,
       [0.7205214 , 0.6916066 , 0.7137251 , 0.6609484 , 0.5569162 ],
       [0.83436376, 0.79181266, 0.7579857 , 0.6816145 , 0.5969599 ],
       [0.8226792 , 0.7525606 , 0.6384215 , 0.57671684, 0.5915044 ]],
      dtype=float32)

In [174]:
val = (pred > 0.5).astype(int)

In [175]:
np.sum((val == Y_test).astype(int))

51224

In [176]:
Y_test.shape

(17597, 5)

In [177]:
51224/87985

0.5821901460476218

In [181]:
one = np.ones((17597, 5))

In [187]:
np.sum(((val == Y_test) & (val == one)).astype(int))

40593

In [188]:
np.sum(((Y_test == one)).astype(int))

45665

In [192]:
40593/45665

0.888930252928939

In [189]:
zero = np.zeros((17597, 5))

In [190]:
np.sum(((val == Y_test) & (val == zero)).astype(int))

10631

In [191]:
np.sum(((Y_test == zero)).astype(int))

42320

In [193]:
10631/42320

0.25120510396975426

In [170]:
pred_train = model.predict(X_train_norm)
val_train = (pred_train > 0.5).astype(int)
np.sum((val_train == Y_train).astype(int))

317637

In [171]:
Y_train.shape

(104122, 5)

In [172]:
317637/520610

0.6101246614548318

In [None]:
df1 = pd.DataFrame({
    'A': [1,2,3,4,5],
    'B': [1,2,3,4,5]
})

df2 = pd.DataFrame({
    'C': [1,2,3,4,5],
    'D': [1,2,3,4,5]
})

df_concat = pd.concat([df1, df2], axis=1)

print(df_concat)

In [None]:
X = market_train[feature_columns].values
## Using only those stocks for which we have more than 500 data points

market_train = market_train.groupby("assetName").filter(lambda x: len(x) > 500)
up = market_train['returnsOpenNextMktres10'].map(lambda x: 0 if x<0 else 1).values
r = market_train['returnsOpenNextMktres10'].values
Y = up
num_sample = Y.shape[0]
print('Number of samples: {}'.format(num_sample))

In [None]:
X[0]

In [None]:
X.shape

In [None]:
market_train.head()

In [None]:
market_train['assetName'].unique()

In [None]:
market_train.groupby('assetName')

In [None]:
type(news_data['assetCodes'])

In [None]:
print("Minimum values : SourceTimestamp : {}, firstCreated : {}, time : {}".format(news_data['sourceTimestamp'].min(), 
                                                                                   news_data['firstCreated'].min(), 
                                                                                   news_data['time'].min()))

In [None]:
print("Maximum values : SourceTimestamp : {}, firstCreated : {}, time : {}".format(news_data['sourceTimestamp'].max(), 
                                                                                   news_data['firstCreated'].max(), 
                                                                                   news_data['time'].max()))

In [None]:
news_data.groupby(['assetName', 'sentimentClass'])['time'].transform('mean')

In [None]:
news_data.head()

## Facebook Stock analysis

In [None]:
fb_market = market_data[market_data["assetName"].str.contains("aceboo")]

In [None]:
fb_market.nunique()

In [None]:
news_data.columns

In [None]:
fb_news = df1[df1["assetName"].str.contains("aceboo")]

In [None]:
news_data.shape

In [None]:
fb_news = pd.concat([df1[df1["assetName"].str.contains("aceboo")],
                     df2[df2["assetName"].str.contains("aceboo")],
                     df3[df3["assetName"].str.contains("aceboo")],
                     df4[df4["assetName"].str.contains("aceboo")],
                     df5[df5["assetName"].str.contains("aceboo")],
                     df6[df6["assetName"].str.contains("aceboo")],
                     df7[df7["assetName"].str.contains("aceboo")],
                     df8[df8["assetName"].str.contains("aceboo")],
                     df9[df9["assetName"].str.contains("aceboo")],
                    df10[df10["assetName"].str.contains("aceboo")]])

In [None]:
fb_news.shape

In [None]:
fb_market.shape

In [None]:
fb_market = fb_market.sort_values(by="time", ascending=True)
fb_news = fb_news.sort_values(by="time", ascending=True)

In [None]:
fb_market['returnsOpenNextMktres10'].plot(grid=True)

In [None]:
fb_market.loc[fb_market['returnsOpenNextMktres10'] > 0.0,'x'] = 1

In [None]:
fb_market.loc[fb_market['returnsOpenNextMktres10'] < 0.0,'x'] = 0

In [None]:
def get_datetime(x):
#     dt0 =  x.split('+')[0]
#     dt1 = datetime.strptime(dt0, "%Y-%m-%d %H:%M:%S")
    dt1 = pd.to_datetime(x)
    return time.mktime(dt1.timetuple())

In [None]:
fb_market['timestamp'] = fb_market['time'].apply(get_datetime)

In [None]:
fb_market.iloc[:50].plot.scatter(x = 'timestamp', y = 'returnsOpenNextMktres10')

In [None]:
fb_news['timestamp'] = fb_news['time'].apply(get_datetime)

In [None]:
fb_news[:50].plot.scatter(x = 'timestamp', y = 'sentimentClass')

## Rough Work

In [None]:
asset_name = [col for col in tmp.columns if 'assetNam' in col]

In [None]:
tmp1 = tmp[(tmp['assetName_0'] == tmp['assetName_1']) & (tmp['assetName_1'] == tmp['assetName_2']) ]

In [None]:
tmp.shape

In [None]:
for v, k in enumerate(market_data['assetCode'].unique()):
    print ("v : {} , k : {}".format(v,k))

In [None]:
market_data.drop()

In [None]:
df = pd.DataFrame

In [None]:
df = market_train.groupby(by='assetName')

In [None]:
df.head()

In [None]:
market_train[market_train['assetName'].str.contains('Apple Inc')]

In [None]:
market_train.groupby(['assetName']).transform('count')

In [None]:
src = market_train[feature_columns]


In [None]:
src1 = src.groupby("assetName").filter(lambda x: len(x) > 500)

In [None]:
src1['assetName'].value_counts().sort_values(ascending=False)