**Two Sigma: Prediction of Stock Prices with Market and News Data**

In [None]:
from kaggle.competitions import twosigmanews
# Create the competition environment; it is the handle used below to fetch the training data.
env = twosigmanews.make_env()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Fetch the training data from the environment as two frames: market data and news data.
(market_train, news_train) = env.get_training_data()

In [None]:
# Show the (rows, columns) shape of each training frame.
market_train.shape, news_train.shape

In [None]:
# Preview the first rows of the news frame.
news_train.head()

In [None]:
## Figure 1: Volume scatter plot for the first 50 market rows.
# `first50` is also reused by the figures that follow, so it stays a top-level name.
first50 = market_train[:50]
asset_codes = first50.assetCode
volumes = first50.volume

plt.figure(figsize=(15, 4))
# Marker area scales with the traded volume.
marker_sizes = volumes / 5e4
plt.scatter(asset_codes, volumes, s=marker_sizes, color='black')
plt.xlabel('Asset Codes')
plt.ylabel('Volume(in million)')
plt.xticks(rotation=90)
plt.ylim(0, 3e7)
plt.show()

In [None]:
## Figure 2: Returns
# One dashed line per return column: (column name, legend label, line colour).
return_series = (
    ('returnsClosePrevRaw1', 'Closing Returns for last 1 day', 'blue'),
    ('returnsClosePrevRaw10', 'Closing Returns for previous 10 days', 'orange'),
)
plt.figure(figsize=(15, 4))
for column, legend_text, line_color in return_series:
    plt.plot(
        first50.assetCode,
        first50[column],
        linestyle='--',
        marker='.',
        linewidth=0.75,
        label=legend_text,
        color=line_color,
    )
plt.xlabel('Asset Codes')
plt.ylabel('Closing Return Rate')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
## Figure 3: Open and Close
# Ratio of opening to closing price for the first 50 market rows.
plt.figure(figsize=(15,4))
# The artists are labelled so that plt.legend() below has entries to show
# (without labels the legend is empty and matplotlib emits a warning).
plt.plot(first50.assetCode, first50.open/first50.close, ls='--', marker='.', lw=0.75,
         label='Open/Close ratio')
plt.xticks(rotation=90)
# Reference line at 1.0, where the opening price equals the closing price.
plt.axhline(1.0, ls='--', lw=1, color='r', label='Open = Close')
plt.ylabel('Opening/Closing prices')
plt.xlabel('Asset Codes')
plt.legend()
plt.grid(False)
plt.show()

In [None]:
# Remove the news columns that are not used later and normalise the remaining ones.
def improvement_data(news_train):
    """Clean the news frame in place and return it.

    Steps, all applied to the frame that was passed in:
      1. drop the columns listed in ``dropped_columns``;
      2. replace ``headlineTag`` and ``sourceId`` with integer codes from
         ``pd.factorize``;
      3. strip the first and last character and every single quote from each
         ``assetCodes`` string.

    Returns the same (mutated) DataFrame object.
    """
    dropped_columns = [
        'sourceTimestamp', 'firstCreated', 'headline', 'takeSequence',
        'firstMentionSentence', 'sentenceCount', 'bodySize', 'marketCommentary',
        'subjects', 'audiences', 'assetName', 'urgency', 'wordCount',
        'sentimentWordCount',
    ]
    news_train.drop(columns=dropped_columns, inplace=True)

    # Factorize categorical columns; only the integer codes are kept.
    for categorical in ('headlineTag', 'sourceId'):
        integer_codes, _ = pd.factorize(news_train[categorical])
        news_train[categorical] = integer_codes

    def strip_set_syntax(raw_codes):
        # Remove the enclosing {} and the '' around each code.
        return raw_codes[1:-1].replace("'", "")

    news_train['assetCodes'] = news_train['assetCodes'].apply(strip_set_syntax)
    return news_train

# Clean the news frame; improvement_data edits it in place and returns the same object.
news_train = improvement_data(news_train)


In [None]:
#creating index for each asset code in order to later merge with market data  
import gc
def seperating_asset_codes(news_train):
    """Expand the ``assetCodes`` column into one row per (news row, asset code).

    Each ``assetCodes`` value is expected to be a string of codes separated
    by ``", "``. For every code the index label of the originating news row
    is recorded (converted with ``int``), so the result can be joined back
    onto the news frame.

    Args:
        news_train: DataFrame with a string column ``assetCodes``.

    Returns:
        DataFrame with columns ``news_index`` (int index label of the news
        row) and ``assetCode`` (a single code), in the original row order.
    """
    codes = []
    indexes = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for i, values in news_train['assetCodes'].items():
        seperated = values.split(", ")
        codes.extend(seperated)
        # Repeat the row label once per code so both lists stay aligned.
        indexes.extend([int(i)] * len(seperated))
    # The temporary lists are local and are released when the function returns.
    return pd.DataFrame({'news_index': indexes, 'assetCode': codes})

# Build the (news_index, assetCode) lookup table and preview it.
index_df = seperating_asset_codes(news_train)
index_df.head()

In [None]:
# Combining news data with the per-asset index table
def combine_news(news_train, index_df):
    """Attach the news columns to every (news_index, assetCode) pair.

    Args:
        news_train: news DataFrame; its index labels are the values stored in
            ``index_df['news_index']`` and it contains an ``assetCodes`` column.
        index_df: DataFrame with columns ``news_index`` and ``assetCode``.

    Returns:
        A new DataFrame with one row per row of ``index_df`` holding
        ``assetCode`` plus the news columns; the helper columns
        ``news_index`` and ``assetCodes`` are removed.
    """
    # Join the lookup column directly against the news index. This avoids
    # writing a temporary 'news_index' column into the caller's frame.
    news_arranged = index_df.merge(
        news_train, how='left', left_on='news_index', right_index=True
    )
    news_arranged.drop(['news_index', 'assetCodes'], axis=1, inplace=True)
    return news_arranged

# One row per (news item, asset code), carrying the news columns.
news_arranged = combine_news(news_train, index_df)
# The inputs are not used again: drop the references and collect to free memory before the next step.
del news_train, index_df
gc.collect()
news_arranged.head()

In [None]:
# Using the mean to group the news for one asset code and date together.
def groupnews(news_df):
    """Average the numeric news columns per (assetCode, date).

    Args:
        news_df: DataFrame with an ``assetCode`` column and a datetime
            ``time`` column; it is not modified.

    Returns:
        DataFrame with columns ``assetCode``, ``date`` (the ``.dt.date`` of
        ``time``) and one ``<column>_mean`` column, cast to float32, for
        every numeric column of ``news_df``.
    """
    # Group on a standalone date Series so no 'date' column is written into
    # the caller's frame.
    news_date = news_df.time.dt.date.rename('date')

    # numeric_only=True keeps only numeric columns. Averaging string columns
    # (or the 'time' timestamps) either raises on current pandas or yields a
    # column that cannot be cast to float32 below.
    group = news_df.groupby(['assetCode', news_date]).mean(numeric_only=True)
    group.columns = ["{}_mean".format(column) for column in group.columns]
    group.reset_index(inplace=True)

    # Set datatype to float32 for everything except the two key columns.
    changetofloat = {c: 'float32' for c in group.columns if c not in ['assetCode', 'date']}
    return group.astype(changetofloat)

# Per-(assetCode, date) averages of the news columns.
news_aggregate = groupnews(news_arranged)
# The expanded news frame is not used again: drop the reference and collect to free memory.
del news_arranged
gc.collect()
news_aggregate.head()

In [None]:
# Combine the market dataset with the aggregated news dataset.
# Derive the calendar date from the market timestamp so it can be matched against the news 'date' key.
market_train['date'] = market_train.time.dt.date
# Left join: every market row is kept; rows without news for that assetCode/date get NaN in the news columns.
finaldf = market_train.merge(news_aggregate, how='left', on=['assetCode', 'date'])
# The inputs are not used again: drop the references and collect to free memory.
del market_train, news_aggregate
gc.collect()
finaldf.head()

In [None]:
# Score function: mean of the daily sums divided by their standard deviation.
def metric(date, pred_probability, num_target, universe):
    """Score predictions as mean(x_t) / std(x_t) over daily sums x_t.

    For every row, ``x = (2 * pred_probability - 1) * clip(num_target, -1, 1)
    * universe``; the ``x`` values are summed per ``date`` and the score is
    the mean of those daily sums divided by their (population) standard
    deviation.

    Args:
        date: per-row day label used for grouping.
        pred_probability: per-row predicted probability; it is mapped to a
            signed value with ``2 * p - 1`` (presumably p is in [0, 1] -
            confirm against the caller).
        num_target: per-row numeric target; must support ``.clip``.
        universe: per-row multiplier (a 0/1 flag in this notebook).

    Returns:
        The score as a float. Returns 0.0 when there are no days or when the
        daily sums have zero standard deviation, instead of the nan/inf (and
        RuntimeWarning) the plain division would produce.
        NOTE(review): 0.0 is believed to match the competition's convention
        for a zero standard deviation - confirm against the evaluation page.
    """
    y = pred_probability * 2 - 1
    r = num_target.clip(-1, 1)  # getting rid of outliers
    x = y * r * universe
    result = pd.DataFrame({'day': date, 'x': x})
    x_t = result.groupby('day')['x'].sum().values
    if x_t.size == 0:
        return 0.0
    spread = np.std(x_t)
    if spread == 0:
        return 0.0
    return np.mean(x_t) / spread

In [None]:
# Train/test split.
# shuffle=False keeps the row order, so the test set is the trailing 10% of rows.
# The split is done on row positions (0..n-1) rather than index labels because
# the model code selects rows with .iloc, which is positional.
train_index, test_index = train_test_split(np.arange(len(finaldf)), test_size=0.1, shuffle=False)

In [None]:
# Shapes of the train and test index arrays (number of rows in each split).
train_index.shape, test_index.shape

In [None]:
# Hold-out log loss of a LightGBM classifier (lower is better); the helper keeps the name `accuracy`.
from lightgbm import LGBMClassifier as lgb
def accuracy(finaldf, target, train_index, test_index, parameters):
    """Fit an LGBMClassifier on the training rows and return its test log loss.

    Despite the name, the returned value is a log loss, so lower is better.

    Args:
        finaldf: feature DataFrame.
        target: label Series aligned row-for-row with ``finaldf``.
        train_index: row positions (used with ``.iloc``) to fit on.
        test_index: row positions (used with ``.iloc``) to evaluate on.
        parameters: keyword arguments for the classifier; not modified.

    Returns:
        ``log_loss`` of the predicted probabilities on the test rows.
    """
    # Build a new dict so forcing the worker count does not leak an
    # 'n_jobs' key into the caller's parameter dict.
    model_parameters = dict(parameters, n_jobs=2)
    model = lgb(**model_parameters)
    model.fit(finaldf.iloc[train_index], target.iloc[train_index])
    predicted = model.predict_proba(finaldf.iloc[test_index])
    return log_loss(target.iloc[test_index], predicted)

In [None]:
# Hyperparameter search space: for each classifier argument, the list of
# candidate values to sample from.
parameters_grid = dict(
    learning_rate=[0.15, 0.1, 0.05, 0.02, 0.01],
    num_leaves=list(range(12, 90, 6)),
    n_estimators=[50, 200, 400, 600, 800],
    min_child_samples=list(range(10, 100, 10)),
    colsample_bytree=[0.8, 0.9, 0.95, 1],
    subsample=[0.8, 0.9, 0.95, 1],
    reg_alpha=[0.1, 0.2, 0.4, 0.6, 0.8],
    reg_lambda=[0.1, 0.2, 0.4, 0.6, 0.8],
)

In [None]:
# Split the merged frame into targets, the universe flag and a float32 feature matrix.
# Numeric target: the returnsOpenNextMktres10 column as float32.
num_target = finaldf.returnsOpenNextMktres10.astype('float32')
# Binary target: 1 where that return is >= 0, otherwise 0.
bin_target = (finaldf.returnsOpenNextMktres10 >= 0).astype('int8')
# universe column as int8 (presumably the competition's in-universe flag - confirm).
universe = finaldf.universe.astype('int8')
# Remove the target, the universe flag and the key/categorical columns so that only feature columns remain.
finaldf.drop(['returnsOpenNextMktres10', 'date', 'universe', 'assetCode', 'assetName', 'time'], 
        axis=1, inplace=True)  # dropping the categorical data
# Cast every remaining column to float32.
finaldf = finaldf.astype('float32')

In [None]:
# Random search: draw one value per hyperparameter from parameters_grid, fit,
# and keep the draw with the lowest score. The score returned by accuracy()
# is a log loss, so lower is better.
accuracyscore = float('inf')  # start at +inf so the first run always becomes the best so far
best_parameters = None  # defined up front so the name exists even if no run improves on +inf
for i in range(100):  # Hundred runs
    parameters = {k: np.random.choice(v) for k, v in parameters_grid.items()}
    score = accuracy(finaldf, bin_target, train_index, test_index, parameters)
    # The previous `accuracyscore == 0` sentinel would have discarded a
    # genuine best score of exactly 0 on the next iteration.
    if score < accuracyscore:
        accuracyscore = score
        best_parameters = parameters
print("Accuracy Score", accuracyscore)