In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.impute import SimpleImputer

# Deep Learning
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Create the Two Sigma competition environment. The resulting `env` object is
# used by later cells to fetch the training data, iterate over the prediction
# days, submit per-day predictions and write the submission file.
# NOTE(review): presumably make_env() may only be called once per kernel
# session - confirm before re-running this cell.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
# Pull the two raw training frames (market data and news data) from the
# competition environment and report their row/column counts.
(market_train_df, news_train_df) = env.get_training_data()
print(f'Market Data: {market_train_df.shape[0]} training examples, {market_train_df.shape[1]} cols.')
print(f'News Data: {news_train_df.shape[0]} training examples, {news_train_df.shape[1]} cols.')

In [None]:
# Preview the first five rows of the raw market data.
market_train_df.head(5)

In [None]:
# Preview a hand-picked subset of news columns: the rows at positions 1-3,
# displayed in reverse order (3, 2, 1).
news_train_df[["headline","provider","subjects","bodySize","sentenceCount","assetCodes","sentimentClass","sentimentNegative","sentimentNeutral", "sentimentPositive","sentimentWordCount"]][1:4][::-1]

## Modelling

In [None]:
def prep_market_df(market_df):
    """Add derived columns to the market frame and return that same frame.

    The frame is modified in place:

    * ``time`` is reduced from a timestamp to its calendar date.
    * ``close_to_open`` is the ratio ``close / open``.
    * ``volume_to_mean`` is ``volume`` divided by the mean volume of the
      frame that was passed in.

    Because ``time`` is overwritten, calling this twice on the same frame
    fails (the column no longer has a ``.dt`` accessor).
    """
    volume = market_df['volume']
    market_df['time'] = market_df['time'].dt.date
    market_df['close_to_open'] = market_df['close'].div(market_df['open'])
    market_df['volume_to_mean'] = volume.div(volume.mean())
    return market_df
    

In [None]:
def prep_news_df(news_df):
    """Engineer news features and aggregate them per (date, asset code).

    The input frame is modified in place (several columns are overwritten or
    added); the returned frame is a new one holding the mean of every numeric
    column for each ``(firstCreated, assetCodes)`` pair.

    Parameters
    ----------
    news_df : pandas.DataFrame
        News frame with at least the columns ``time``, ``sourceTimestamp``,
        ``firstCreated`` (datetime-like), ``wordCount``, ``sentenceCount``,
        ``assetCodes``, ``headline``, ``assetName``, ``sentimentClass`` and
        ``headlineTag``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(firstCreated, assetCodes)`` with averaged numeric
        features, ready to be joined onto the market frame.
    """
    import ast  # function-scope import: not provided by the notebook's import cell

    def _codes(raw):
        # `assetCodes` holds a string that evaluates to a collection of codes
        # (presumably a set literal such as "{'A.N', 'B.N'}" - confirm).
        # literal_eval parses it without executing arbitrary code.
        return ast.literal_eval(raw)

    # Use the frame that was passed in, not a notebook-level global, so the
    # ratio is also correct for the per-day frames of the prediction loop.
    news_df['sentence_word_count'] = news_df['wordCount'] / news_df['sentenceCount']
    news_df['time'] = news_df.time.dt.hour
    news_df['sourceTimestamp'] = news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date

    # Number of asset codes attached to the news item. This must be computed
    # before `assetCodes` is collapsed to a single code below.
    news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(_codes(x)))
    # Keep one representative code per item. `min` makes the choice
    # deterministic; taking "the first element" of a set depends on string
    # hashing and can change from one kernel session to the next.
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: min(_codes(x)))

    news_df['headlineLen'] = news_df['headline'].apply(len)
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    news_df['asset_sentence_mean'] = news_df.groupby(['assetName', 'sentenceCount'])['time'].transform('mean')

    # NOTE(review): the ids come from the order of unique tags in *this*
    # frame, so the same tag can receive different ids in different calls
    # (e.g. training data vs. a single prediction day).
    lbl = {k: v for v, k in enumerate(news_df['headlineTag'].unique())}
    news_df['headlineTagT'] = news_df['headlineTag'].map(lbl)

    kcol = ['firstCreated', 'assetCodes']
    # numeric_only=True states explicitly that text columns are dropped from
    # the aggregate; newer pandas raises on them instead of skipping them.
    news_df = news_df.groupby(kcol, as_index=False).mean(numeric_only=True)
    return news_df

In [None]:
def merge_news_and_market_data(market_df,news_df):
    """Left-join aggregated news onto the market rows and drop incomplete rows.

    Market rows are matched to news rows on ``(time, assetCode)`` ==
    ``(firstCreated, assetCodes)``. An integer id per asset code is added as
    ``assetCodeT`` (ids follow the order of first appearance in the joined
    frame). Finally every row containing a missing value is removed, which
    includes all market rows for which no news row matched.
    """
    joined = market_df.merge(
        news_df,
        how='left',
        left_on=['time', 'assetCode'],
        right_on=['firstCreated', 'assetCodes'],
    )
    code_ids = {code: idx for idx, code in enumerate(joined['assetCode'].unique())}
    joined['assetCodeT'] = joined['assetCode'].map(code_ids)
    return joined.dropna(axis=0)

In [None]:
# Code mostly taken from this kernel: https://www.kaggle.com/ashishpatel26/bird-eye-view-of-two-sigma-xgb
def data_prep(market_df,news_df):
    """Run the full preparation pipeline and return the merged modelling frame.

    The market frame is prepared first, then the news frame, and the two
    results are joined by ``merge_news_and_market_data``.
    """
    return merge_news_and_market_data(
        prep_market_df(market_df),
        prep_news_df(news_df),
    )

In [None]:
def get_feature_cols(data_df):
    """Return the column names of ``data_df`` that are used as model features.

    Identifier, text, date and target columns are filtered out; every other
    column is kept, in its original order.
    """
    excluded = frozenset((
        'assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'assetCodeT',
        'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'provider',
        'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time', 'time_x',
        'universe', 'sourceTimestamp',
    ))
    features = []
    for column in data_df.columns:
        if column in excluded:
            continue
        features.append(column)
    return features


In [None]:
def scale_data(X):
    """Min-max scale every column of ``X`` and return the scaling constants.

    Parameters
    ----------
    X : array-like
        Feature matrix; minima and maxima are taken along axis 0.

    Returns
    -------
    tuple
        ``(scaled, maxs, mins, rng)`` where ``scaled = 1 - (maxs - X) / rng``.
        ``rng`` is ``maxs - mins`` with zero ranges replaced by 1, so it can
        be reused safely to scale new data with the same formula.
    """
    mins = np.min(X, axis=0)
    maxs = np.max(X, axis=0)
    rng = maxs - mins
    # A constant column has a zero range; dividing by it would fill the
    # column with NaN. Using 1 instead maps such a column to a constant 1.0.
    rng = np.where(rng == 0, 1, rng)
    return 1 - ((maxs - X) / rng),maxs,mins,rng

In [None]:
# Build the modelling table from the raw training frames and pick the features.
market_train = data_prep(market_train_df, news_train_df)
fcol = get_feature_cols(market_train)

# `r` is the raw target return; `class_labels` is True where it is non-negative.
r = market_train.returnsOpenNextMktres10.values
class_labels = (market_train.returnsOpenNextMktres10 >= 0).values

# Scale the features; maxs/mins/rng are kept to scale the live data the same way.
X, maxs, mins, rng = scale_data(market_train[fcol].values)

In [None]:
# List the columns that are fed to the models as features.
print(fcol)

In [None]:
# Sanity check: the first raw returns next to the binary labels derived from them.
print(r[:6])
print(class_labels[:6])

In [None]:
# Hold out 10% of the rows as a test set; features, binary labels and raw
# returns are split consistently, with a fixed random_state for repeatability.
# NOTE(review): this is a shuffled split of time-ordered data - confirm that a
# chronological hold-out is not required for an honest estimate.
X_train, X_test, class_labels_train, class_labels_test, r_train, r_test = model_selection.train_test_split(X, class_labels, r, test_size=0.1, random_state=99)

In [None]:
def build_lgb_model(X_train, X_test, class_labels_train, class_labels_test):
    """Train a LightGBM binary classifier and return the fitted booster.

    AUC is reported every 100 rounds on both the training data and the
    held-out data; training runs for at most 2000 rounds and uses early
    stopping with a patience of 100 rounds.
    """
    params = dict(
        learning_rate=0.05,
        max_depth=12,
        boosting='gbdt',
        objective='binary',
        metric='auc',
        is_training_metric=True,
        seed=42,
    )
    fit_set = lgb.Dataset(X_train, label=class_labels_train)
    # The monitored sets are the training data (as a separate Dataset) and the hold-out.
    monitor_sets = [
        lgb.Dataset(X_train, label=class_labels_train),
        lgb.Dataset(X_test, label=class_labels_test),
    ]
    return lgb.train(
        params,
        train_set=fit_set,
        num_boost_round=2000,
        valid_sets=monitor_sets,
        verbose_eval=100,
        early_stopping_rounds=100,
    )

In [None]:
def build_logreg_model(X_train, X_test, class_labels_train, class_labels_test, impute=True):
    """Fit a logistic-regression classifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    X_test, class_labels_test
        Not used for fitting; accepted so that all ``build_*`` helpers share
        the same call signature.
    class_labels_train : array-like
        Binary training labels.
    impute : bool, default True
        When True, missing values in ``X_train`` are replaced with a
        default-configured ``SimpleImputer`` before fitting.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    if impute:
        # Only the training matrix is imputed. The test matrix is never used
        # here, so fitting a second imputer on it would be wasted work.
        X_train = SimpleImputer().fit_transform(X_train)

    clf = LogisticRegression(random_state=0, solver='sag', multi_class='ovr', verbose=1)
    return clf.fit(X_train, class_labels_train)

In [None]:
def build_sgd_model(X_train, X_test, class_labels_train, class_labels_test, impute=True):
    """Fit an SGD-trained linear classifier with logistic loss.

    Parameters
    ----------
    X_train : array-like
        Training features.
    X_test, class_labels_test
        Not used for fitting; accepted so that all ``build_*`` helpers share
        the same call signature.
    class_labels_train : array-like
        Binary training labels.
    impute : bool, default True
        When True, missing values in ``X_train`` are replaced with a
        default-configured ``SimpleImputer`` before fitting.

    Returns
    -------
    SGDClassifier
        The fitted classifier.
    """
    if impute:
        # Only the training matrix is imputed. The test matrix is never used
        # here, so fitting a second imputer on it would be wasted work.
        X_train = SimpleImputer().fit_transform(X_train)

    # random_state pins the sample shuffling so repeated runs give the same
    # model, in line with the seeded logistic-regression builder.
    clf = SGDClassifier(loss='log', penalty='l2', alpha=0.05, max_iter=1000, verbose=1, random_state=0)
    return clf.fit(X_train, class_labels_train)

In [None]:
def create_baseline():
    """Build and compile a network with a single hidden layer.

    The hidden layer is as wide as the input; the width is read from the
    notebook-level ``X_train``, which therefore has to exist before calling.
    The output is one sigmoid unit trained with binary cross-entropy.
    """
    n_features = X_train.shape[1]
    model = Sequential([
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def create_two_layer_baseline():
    """Build and compile a network with two hidden layers (input width, then 20).

    The input width is read from the notebook-level ``X_train``, which
    therefore has to exist before calling. The output is one sigmoid unit
    trained with binary cross-entropy.
    """
    n_features = X_train.shape[1]
    relu_layer = dict(kernel_initializer='normal', activation='relu')
    model = Sequential([
        Dense(n_features, input_dim=n_features, **relu_layer),
        Dense(20, **relu_layer),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def create_three_layer_NN():
    """Build and compile a three-hidden-layer network with batch normalisation.

    Hidden widths are the input width, 20 and 10; a BatchNormalization layer
    follows each of the first two hidden layers. The input width is read from
    the notebook-level ``X_train``, which therefore has to exist before
    calling. The output is one sigmoid unit trained with binary cross-entropy.
    """
    n_features = X_train.shape[1]
    relu_layer = dict(kernel_initializer='normal', activation='relu')
    model = Sequential([
        Dense(n_features, input_dim=n_features, **relu_layer),
        BatchNormalization(),
        Dense(20, **relu_layer),
        BatchNormalization(),
        Dense(10, **relu_layer),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
from keras.callbacks import EarlyStopping
def build_shallow_NN_model(X_train, X_test, class_labels_train, class_labels_test, impute=True, num_hidd_layers=1):
    """Build one of the small Keras networks and train it for 10 epochs.

    Parameters
    ----------
    X_train : array-like
        Training features.
    X_test, class_labels_test
        Not used for fitting; accepted so that all ``build_*`` helpers share
        the same call signature.
    class_labels_train : numpy.ndarray
        Boolean (or 0/1) training labels.
    impute : bool, default True
        When True, missing values in ``X_train`` are replaced with a
        default-configured ``SimpleImputer`` before fitting.
    num_hidd_layers : {1, 2, 3}, default 1
        Which architecture to build: ``create_baseline``,
        ``create_two_layer_baseline`` or ``create_three_layer_NN``.

    Returns
    -------
    The fitted Keras model.

    Raises
    ------
    ValueError
        If ``num_hidd_layers`` is not 1, 2 or 3.
    """
    builders = {
        1: create_baseline,
        2: create_two_layer_baseline,
        3: create_three_layer_NN,
    }
    # Fail early with a clear message instead of an AttributeError on None
    # once .fit() is reached.
    if num_hidd_layers not in builders:
        raise ValueError(
            "num_hidd_layers must be 1, 2 or 3, got %r" % (num_hidd_layers,)
        )

    if impute:
        my_imputer = SimpleImputer()
        print(X_train.shape)
        X_train = my_imputer.fit_transform(X_train)
        print(X_train.shape)

    shallow_net_model = builders[num_hidd_layers]()
    max_epochs = 10
    class_int_labels_train = (class_labels_train == True).astype(int)
    # Compare the two lengths. The parentheses must close around X_train:
    # len(X_train == n) would only test that X_train is non-empty.
    assert len(X_train) == len(class_int_labels_train), \
        "X_train and class_labels_train must have the same number of rows"
    shallow_net_model.fit(X_train, class_int_labels_train, batch_size=32, epochs=max_epochs, verbose=1)
    return shallow_net_model

In [None]:
def test_NN(model, X_test,Y_test, impute=True):
    print(X_test.shape,Y_test.shape)
    merged = np.concatenate((X_test, Y_test.reshape((Y_test.shape[0], 1))), axis=1)
    print(merged.shape)
    if impute:
        my_imputer = SimpleImputer()
        merged = my_imputer.fit_transform(merged)
        print(merged.shape)
        X_test = merged[:,:-1]
        Y_test = merged[:,-1]
        print(X_test.shape, Y_test.shape, Y_test[:4])
        #X_test = my_imputer.fit_transform(X_train)
        #Y_test = my_imputer.fit_transform(Y_test)
    eval_results = model.evaluate(X_test, Y_test,verbose=1)
    e = model.predict(X_test).squeeze()
    print(e)
    print("\nLoss, accuracy on test data: ")
    print("%0.4f %0.2f%%" % (eval_results[0], eval_results[1]*100))
    return e
    

In [None]:
def test_non_NN_model(model, X_test,Y_test, impute=True,logreg=False):
    merged = np.concatenate((X_test, Y_test.reshape((Y_test.shape[0], 1))), axis=1)
    if impute:
        my_imputer = SimpleImputer()
        merged = my_imputer.fit_transform(merged)
        print(merged.shape)
        X_test = merged[:,:-1]
        Y_test = merged[:,-1]
        print(X_test.shape, Y_test.shape, Y_test[:4])
        #X_test = my_imputer.fit_transform(X_train)
        #Y_test = my_imputer.fit_transform(Y_test)
    eval_results = None
    if logreg:
        eval_results = model.predict_proba(X_test)[:,1]
    else:
        eval_results = model.predict(X_test)#, Y_test,verbose=1)
    print(max(eval_results))
    thresholded = [1  if ex >= 0.5 else 0 for ex in eval_results]
    acc = np.mean(thresholded == Y_test)
    print("\nAccuracy on test data: ")
    print("%0.2f%%" % (acc*100))
    print(eval_results)
    return eval_results
    

In [None]:
#lgb_model = build_lgb_model(X_train, X_test, class_labels_train, class_labels_test)

In [None]:
# Train the logistic-regression member of the ensemble.
logreg_model = build_logreg_model(X_train, X_test, class_labels_train, class_labels_test,impute=True)

In [None]:
#sgd_model = build_sgd_model(X_train, X_test, class_labels_train, class_labels_test, impute=True)

In [None]:
# NOTE(review): this value is not referenced by any cell visible in this
# notebook; the training cells pass num_hidd_layers as literals. Confirm
# whether it is still needed.
hidden_layers = 3

In [None]:
# Train the three-hidden-layer network (the variant with batch normalisation).
batchnorm_model = build_shallow_NN_model(X_train, X_test, class_labels_train, class_labels_test, impute=True,num_hidd_layers=3)

In [None]:
# Train the two-hidden-layer network.
shallow_net_model = build_shallow_NN_model(X_train, X_test, class_labels_train, class_labels_test, impute=True,num_hidd_layers=2)

In [None]:
#test_non_NN_model(lgb_model,X_test,class_labels_test,impute=True)

In [None]:
#test_NN(dropout_model,X_test,class_labels_test,impute=True)

In [None]:
#test_NN(shallow_net_model,X_test,class_labels_test,impute=True)

In [None]:
def ensemble(predictions, method="mean", weights=None):
    """Combine the prediction vectors of several models into one vector.

    Parameters
    ----------
    predictions : sequence of array-like
        One equally long prediction vector per model.
    method : str, default "mean"
        ``"median"`` takes the per-example median, ``"weighted"`` a weighted
        average; any other value falls back to the plain per-example mean.
    weights : sequence of float, optional
        Model weights for ``method="weighted"``, one per model. When omitted,
        the first ``len(predictions)`` entries of ``[0.7, 0.1, 0.1, 0.1]``
        are used. Weights are normalised by their sum, so the result stays on
        the same scale as the inputs.

    Returns
    -------
    numpy.ndarray
        One combined value per example.

    Raises
    ------
    ValueError
        If ``predictions`` is empty or the number of weights does not fit the
        number of models.
    """
    num_models = len(predictions)
    if num_models == 0:
        raise ValueError("ensemble() needs at least one prediction vector")

    # Rows are models, columns are examples.
    stacked = np.asarray(predictions, dtype=float)

    if method == "median":
        return np.median(stacked, axis=0)

    if method == "weighted":
        if weights is None:
            default_weights = [0.7, 0.1, 0.1, 0.1]
            if num_models > len(default_weights):
                raise ValueError(
                    "default weights cover at most %d models; pass `weights` "
                    "explicitly for %d models" % (len(default_weights), num_models)
                )
            weights = default_weights[:num_models]
        elif len(weights) != num_models:
            raise ValueError(
                "expected %d weights, got %d" % (num_models, len(weights))
            )
        # np.average divides by the sum of the weights.
        return np.average(stacked, axis=0, weights=np.asarray(weights, dtype=float))

    return np.mean(stacked, axis=0)

In [None]:
#lgb_results = test_non_NN_model(lgb_model,X_test,class_labels_test,impute=True)

In [None]:
# Hold-out scores of the logistic regression (positive-class probabilities,
# because logreg=True selects predict_proba).
logreg_results = test_non_NN_model(logreg_model,X_test,class_labels_test,impute=True,logreg=True)

In [None]:
# Hold-out predictions of the two-hidden-layer network.
shallow_results = test_NN(shallow_net_model,X_test,class_labels_test,impute=True)

In [None]:
# Hold-out predictions of the three-hidden-layer (batch-normalised) network.
batchnorm_results = test_NN(batchnorm_model,X_test,class_labels_test,impute=True)

In [None]:
# Combine the three models' hold-out scores with the "mean" ensembling method
# and show the combined scores.
predictions = [logreg_results,shallow_results,batchnorm_results]
#for i in range(4):
#    print(predictions[i][0])
#print(predictions[0])
ensemble_predictions = ensemble(predictions,method="mean")
print(ensemble_predictions)
#x = 0.5397711618234922+0.5166342344290076+0.53550625+0.50618184
#print(x/4.0)

In [None]:
def test_ensemble(predictions,Y_test):
    thresholded = np.array([1  if ex >= 0.5 else 0 for ex in predictions])
    print(thresholded[:10],Y_test[0:10])
    Y_test = np.array([int(x) for x in Y_test])
    #acc = np.mean(thresholded == Y_test)
    same = 0.0
    for i in range(len(predictions)):
        if thresholded[i] == Y_test[i]:
            same += 1
    acc = same/float(len(predictions))
    print("%0.2f%%" % (acc*100))

In [None]:
# Report the hold-out accuracy of the averaged ensemble.
test_ensemble(ensemble_predictions,class_labels_test)

In [None]:
# Iterable of prediction days; the submission loop unpacks each item into
# (market observations, news observations, predictions template).
days = env.get_prediction_days()

In [None]:
# Submission loop: for every prediction day, prepare the features exactly as
# for training, average the three models' probabilities per asset, convert
# them to a confidence in [-1, 1] and hand the filled template back to `env`.
n_days = 0
impute = True  # kept for compatibility; not read anywhere in this loop
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    if n_days % 50 == 0:
        print(n_days, end=' ')

    market_obs_df = data_prep(market_obs_df, news_obs_df)
    market_obs_df = market_obs_df[market_obs_df.assetCode.isin(predictions_template_df.assetCode)]

    if len(market_obs_df) > 0:
        X_live = market_obs_df[fcol].values
        # Same scaling formula and constants as used for the training matrix.
        X_live = 1 - ((maxs - X_live) / rng)

        # reshape(-1) keeps a 1-D vector even when the day has a single row
        # (squeeze() would collapse that case to a 0-d value).
        model_predictions = [
            logreg_model.predict_proba(X_live)[:, 1],
            shallow_net_model.predict(X_live).reshape(-1),
            batchnorm_model.predict(X_live).reshape(-1),
        ]
        # axis=0 averages across the models and keeps one value per asset;
        # without it np.mean collapses all models and all assets into a
        # single scalar, giving every asset the same confidence.
        lp = np.mean(model_predictions, axis=0)
        # Map a probability in [0, 1] to a confidence in [-1, 1].
        confidence = np.clip(2 * lp - 1, -1, 1)
    else:
        # No row survived preparation for this day: nothing to predict, the
        # template is completed with zeros by the fillna(0) below.
        confidence = np.array([])

    preds = pd.DataFrame({'assetCode': market_obs_df['assetCode'], 'confidence': confidence})
    # Assets without a prediction get a neutral confidence of 0.
    predictions_template_df = predictions_template_df.merge(preds, how='left').drop('confidenceValue', axis=1).fillna(0).rename(columns={'confidence': 'confidenceValue'})
    env.predict(predictions_template_df)
env.write_submission_file()