In [None]:
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import matplotlib.cm as cm
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import pandas as pd
import tensorflow as tf
import seaborn as sns
from copy import deepcopy
from scipy import stats
import keras as k
from keras.layers import Dense, Conv1D, Flatten, Dropout, LSTM, GlobalAveragePooling1D, MaxPooling1D
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler
from sklearn.mixture import  BayesianGaussianMixture, GaussianMixture
from sklearn.impute import SimpleImputer
import lightgbm as lgb
pd.options.mode.chained_assignment = None  # default='warn'
%config IPCompleter.greedy=True
%load_ext line_profiler
from sklearn import model_selection

# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [20, 10]
# You can only call make_env() once, so don't lose it!
# `env` is used further down to fetch the training data, iterate over the
# prediction days, submit predictions and write the submission file.
env = twosigmanews.make_env()

In [None]:
# Fetch the market and news training frames from the competition environment.
# Only market_train_df is used by the cells shown in this notebook.
(market_train_df, news_train_df) = env.get_training_data()
# Preview the first rows of the market frame.
market_train_df.head()

In [None]:
# One Gaussian mixture is trained per feature family: "Raw" and "Mktres".
# All prevClose and prevOpen columns, for both the 1-day and 10-day windows, are pooled
# into a single 1-D sample per family, because they are expected to follow the same
# distribution despite the different window lengths (1 and 10).
# Five clusters are fitted per family.

keys=['returnsOpenPrevRaw1','returnsClosePrevRaw1','returnsOpenPrevMktres1', 'returnsClosePrevMktres1', 'returnsClosePrevRaw10','returnsOpenPrevRaw10','returnsClosePrevMktres10','returnsOpenPrevMktres10','returnsOpenNextMktres10']

# Split the columns by family. Every column without 'Raw' in its name, including the
# target 'returnsOpenNextMktres10' (last entry of `keys`), lands in the Mktres family.
keysRaw = [key for key in keys if 'Raw' in key]
keysMktres = [key for key in keys if 'Raw' not in key]

# NOTE(review): in the cells shown here `temp_df` is rebound before it is ever read,
# so this full copy of the frame looks unnecessary -- confirm before removing.
temp_df = deepcopy(market_train_df)
market_train_df[keys] = market_train_df[keys].fillna(10000) # fill NaNs with very large number, this will indicate that something wasn't right

# A fixed random_state makes the fitted clusters (and therefore the anomaly masks
# derived from them) reproducible across Restart & Run All.
gmmRaw = GaussianMixture(5, reg_covar=1e-6, covariance_type='diag', random_state=0)
gmmMktres = GaussianMixture(5, reg_covar=1e-3, covariance_type='diag', random_state=0)

# Flatten every selected column into one long (n_samples, 1) array per family.
valuesRaw = np.reshape(market_train_df[keysRaw].values, (-1,1))
valuesMktres = np.reshape(market_train_df[keysMktres].values, (-1,1))

# fit the mixtures and predict the cluster of each sample
predsRaw = gmmRaw.fit_predict(valuesRaw)
predsMktres = gmmMktres.fit_predict(valuesMktres)


In [None]:
# These are the properties of the five clusters (weights, means, covariances, etc.) from a
# previous fit, kept so that training does not have to be rerun each time.
# The assignments below sit inside a string literal, so they do NOT execute as written;
# remove the surrounding triple quotes to apply them to existing gmmRaw / gmmMktres objects.

'''
gmmRaw.weights_ =np.array( [6.83865690e-01, 1.16851905e-02, 1.22760963e-07, 3.04427095e-01, 2.19020444e-05]).reshape(-1)
gmmRaw.means_ = np.array([1.54566863e-03,3.72870021e-02,9.29550000e+03,4.39326020e-03,1.56379609e+02]).reshape(-1,1)
gmmRaw.covariances_ = np.array([5.49162779e-04, 8.51465015e-02, 7.48225000e+03, 8.13615109e-03, 6.43230001e+05]).reshape(-1,1)
gmmRaw.precisions_ = np.array( [1.82095371e+03,1.17444637e+01,1.33649637e-04,1.22908239e+02, 1.55465386e-06]).reshape(-1,1)
gmmRaw.precisions_cholesky_ = np.array([4.26726342e+01, 3.42701966e+00,1.15606936e-02,1.10863988e+01, 1.24685759e-03]).reshape(-1,1)
gmmRaw.converged_ = True
gmmRaw.n_iter_ = 28
gmmRaw.lower_bound_ = 1.6266292098825175
gmmRaw.n_components = 5

gmmMktres.weights_ = np.array([9.64128858e-01, 1.07063273e-02, 8.69301996e-06, 2.50708018e-02, 8.53200247e-05]).reshape(-1)
gmmMktres.means_ = np.array([3.86348381e-04,1.00000000e+04, 6.57471356e+02, 2.92204453e-02, 1.28427948e+01]).reshape(-1,1)
gmmMktres.covariances_ = np.array([2.86978130e-03, 1.00078976e-03, 3.05273209e+06, 4.92966900e-02, 2.67858820e+03]).reshape(-1,1)
gmmMktres.precisions_ = np.array([3.48458609e+02, 9.99210862e+02, 3.27575421e-07, 2.02853376e+01, 3.73330995e-04]).reshape(-1,1)
gmmMktres.precisions_cholesky_ = np.array([1.86670461e+01, 3.16102968e+01,  5.72342049e-04,  4.50392469e+00, 1.93217752e-02]).reshape(-1,1)
gmmMktres.converged_ = True
gmmMktres.n_iter_ = 9
gmmMktres.lower_bound_ = 1.5250902820890222
gmmMktres.n_components = 5
'''

In [None]:
# Perform outlier removal with the trained GMMs: every value that was assigned to an
# anomalous cluster is overwritten with the mean of the dominant cluster.

# Rebuild the per-family column lists from `keys` (same rule as in the fitting cell).
keysRaw = [key for key in keys if 'Raw' in key]
keysMktres = [key for key in keys if 'Raw' not in key]

max_val = 1 # cluster is judged as anomalous if either mean or covariances exceed 1

# If the mixtures are restored from stored parameters instead of being refitted,
# the flattened values and cluster assignments have to be recomputed first:
#valuesRaw = np.reshape(market_train_df[keysRaw].values, (-1,1))
#valuesMktres = np.reshape(market_train_df[keysMktres].values, (-1,1))

#predsRaw = gmmRaw.predict(valuesRaw)
#predsMktres = gmmMktres.predict(valuesMktres)

# Mask to tell which clusters are anomalous (True = anomalous), one entry per component
maskRaw = [not ((abs(gmmRaw.means_[i]) < max_val) and (gmmRaw.covariances_[i] < max_val))[0] for i in range(len(gmmRaw.weights_))]
maskMktres = [not ((abs(gmmMktres.means_[i]) < max_val) and (gmmMktres.covariances_[i] < max_val))[0] for i in range(len(gmmMktres.weights_))]

# Per-sample boolean flags, looked up with one vectorised index operation instead of a
# Python-level loop over every flattened value.
replaceIndicesMktres = np.asarray(maskMktres, dtype=bool)[predsMktres]
replaceIndicesRaw = np.asarray(maskRaw, dtype=bool)[predsRaw]

# Replace anomalous values with the mean of the cluster with the largest weight, where the most data resides
valuesRaw[replaceIndicesRaw] = gmmRaw.means_[np.argmax(gmmRaw.weights_)][0]
valuesMktres[replaceIndicesMktres] = gmmMktres.means_[np.argmax(gmmMktres.weights_)][0]

# Un-flatten back to one column per key and write the cleaned values into the frame.
market_train_df[keysRaw] = np.reshape(valuesRaw, (-1, len(keysRaw)))
market_train_df[keysMktres] = np.reshape(valuesMktres, (-1, len(keysMktres)))

market_train_df.head()

In [None]:
# Lag features: the two 1-day raw returns, shifted by 1, 3, 5 and 10 rows within each asset.
LAG_DAYS = (1, 3, 5, 10)
LAG_SOURCES = ('returnsClosePrevRaw1', 'returnsOpenPrevRaw1')

lag_keys = []
for lag in LAG_DAYS:
    for source in LAG_SOURCES:
        lag_key = '{}_{}day_lag'.format(source, lag)
        market_train_df[lag_key] = market_train_df.groupby('assetCode', as_index=False)[source].shift(lag).reset_index(drop=True)
        lag_keys.append(lag_key)

# Lag columns go first; the target stays the last entry of `keys`, which later cells
# rely on (keys[:-1] are the model inputs, keys[-1] the target).
# Filtering out names that are already present keeps this cell idempotent: re-running
# it no longer prepends the lag columns a second time.
keys = lag_keys + [key for key in keys if key not in lag_keys]

In [None]:
# Training window: rows with st_date <= time < end_date.
st_date = pd.Timestamp(year=2014, month=1, day=15, hour=22, tz='UTC')
end_date = pd.Timestamp(year=2018, month=1, day=14, hour=22, tz='UTC')

# Build the row mask and the filtered frame once instead of once per extracted column.
in_window = (market_train_df['time'] >= st_date) & (market_train_df['time'] < (end_date))
window_df = market_train_df[in_window]

time = window_df['time'].values
data = window_df[keys[:-1]].values   # model inputs: every key except the last
r = window_df[keys[-1]].values       # raw target values (last key), used when scoring

# Custom labels for regression: a step function of the target that adds
# 0.1 beyond |0.001|, a further 0.25 beyond |0.01| and a further 0.6 beyond |0.1|,
# signed like the target (so the values are 0, +-0.1, +-0.35 or +-0.95).
labels = ((r > 0.001).astype(int) - (r < -0.001).astype(int))*0.1 + 0.25*((r > 0.01).astype(int) - (r < -0.01).astype(int)) + 0.6*((r > 0.1).astype(int) - (r < -0.1).astype(int))

universe = window_df['universe'].values

# Drop the temporaries so the filtered copy of the frame does not stay in kernel memory.
del in_window, window_df

In [None]:
# Split every per-row array with the same 50/50 partition (fixed seed).
# NOTE(review): train_test_split shuffles by default, so rows from the same dates end up
# on both sides of the split -- confirm that is intended for this time-ordered data.
(train_data, test_data,
 train_labels, test_labels,
 train_time, test_time,
 train_universe, test_universe,
 train_r, test_r) = model_selection.train_test_split(
    data, labels, time, universe, r,
    test_size=0.50,
    random_state=1,
)

In [None]:
# Scaled copies of the features and labels.
# NOTE: the LightGBM cells below are fed the unscaled train_data / test_data arrays.
data_scaler = RobustScaler()
labels_scaler = MinMaxScaler()

# Fit on the training split only: fitting on `data` (which contains the test rows)
# would leak test-set statistics into the transform applied to the test split.
data_scaler.fit(train_data)
data_scaled = data_scaler.transform(data)
test_data_scaled = data_scaler.transform(test_data)

# The scalers expect 2-D input, hence the reshape of the 1-D label arrays.
labels_scaler.fit(train_labels.reshape(-1, 1))
labels_scaled = labels_scaler.transform(labels.reshape(-1, 1))
test_labels_scaled = labels_scaler.transform(test_labels.reshape(-1, 1))

print(data_scaled.shape)
print(test_data_scaled.shape)

In [None]:
# Wrap the train / hold-out arrays for LightGBM (features first, labels second).
lgb_train = lgb.Dataset(data=train_data, label=train_labels)
lgb_test = lgb.Dataset(data=test_data, label=test_labels)


In [None]:
# LightGBM training configuration.
# 'n_estimators' is an alias of 'num_iterations'; the original dict set both (200 vs 500),
# leaving it to LightGBM's alias resolution which one wins. Only the canonical
# 'num_iterations' is kept so the number of boosting rounds is unambiguous.
params = {
        'objective':'mae',
        'num_iterations':500,
        'learning_rate':0.01,
        'early_stopping_rounds':20,
        'num_leaves':2000
    }

# Early stopping is monitored on lgb_test, the same split that is scored below.
lgbm_model = lgb.train(params, train_set = lgb_train, valid_sets = [lgb_test])

In [None]:
# Predictions for the hold-out half of the split; scored in the next cell.
pred = lgbm_model.predict(test_data)

In [None]:
# Hold-out score: per-timestamp sum of prediction * universe flag * target value,
# then the mean of those sums divided by their standard deviation.
conf_data = pd.DataFrame({
    'time': test_time,
    'conf_pred': pred*test_universe*test_r,
})
confidence_value = conf_data.groupby('time')['conf_pred'].sum()

conf = np.mean(confidence_value)/np.std(confidence_value)
print(conf)

In [None]:
# Same score as for the hold-out split, computed on the training half for comparison.
pred = lgbm_model.predict(train_data)
conf_data = pd.DataFrame({
    'time': train_time,
    'conf_pred': pred*train_universe*train_r,
})
confidence_value = conf_data.groupby('time')['conf_pred'].sum()

conf = np.mean(confidence_value)/np.std(confidence_value)
print(conf)

In [None]:
import warnings
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pair every importance value with its feature name. The last entry of `keys` is the
# target, not a model input, so it is excluded explicitly rather than relying on zip()
# silently truncating to the shorter sequence.
feature_imp = pd.DataFrame(sorted(zip(lgbm_model.feature_importance(), keys[:-1])), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# Save BEFORE show(): once the figure has been shown, pyplot no longer has it as the
# current figure, so a savefig() placed afterwards writes an empty image.
plt.savefig('lgbm_importances-01.png')
plt.show()

In [None]:
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
# `days` is consumed by the prediction loop further down, which unpacks each item as
# (market_obs_df, news_obs_df, predictions_template_df).
days = env.get_prediction_days()

In [None]:
# Check and replace anomalous values
def _replace_anomalous_values(df, columns, gmm, anomalous_mask, fill_value):
    """Return ``df[columns]`` as an (n_rows, n_columns) array with anomalous values replaced."""
    # np.array() copies, so the replacement below never writes through a view of the frame.
    flat = np.array(df[columns].fillna(fill_value).values).reshape(-1, 1)
    clusters = gmm.predict(flat)
    # Vectorised lookup: one boolean flag per flattened value.
    is_anomalous = np.asarray(anomalous_mask, dtype=bool)[clusters]
    # Anomalies take the mean of the heaviest mixture component.
    flat[is_anomalous] = gmm.means_[np.argmax(gmm.weights_)][0]
    return flat.reshape(-1, len(columns))


def fix_anomalies(df, keysRaw, keysMktres, maskRaw, maskMktres,  gmmMktres, gmmRaw, fill_value=10000):
    """Replace missing and anomalous return values in ``df`` using fitted mixtures.

    For each family (raw / market-residualised) the selected columns are NaN-filled
    with ``fill_value``, flattened to one long column, assigned to a mixture component
    with ``gmm.predict`` and, where that component is flagged in the family's mask,
    overwritten with the mean of the component that has the largest weight.

    Parameters
    ----------
    df : DataFrame holding the columns named in ``keysRaw`` and ``keysMktres``.
        It is modified in place and also returned.
    keysRaw, keysMktres : lists of column names for each family.
    maskRaw, maskMktres : per-component booleans, True where the component is anomalous.
    gmmMktres, gmmRaw : fitted mixtures exposing ``predict``, ``means_`` and ``weights_``.
        Note the order: the Mktres mixture comes first.
    fill_value : sentinel written over NaNs before prediction (default 10000).

    Returns
    -------
    The same ``df`` object with the selected columns cleaned.
    """
    # Nothing to predict on; a mixture's predict() may reject an empty sample array.
    if len(df) == 0:
        return df

    families = (
        (keysRaw, gmmRaw, maskRaw),
        (keysMktres, gmmMktres, maskMktres),
    )
    for columns, gmm, mask in families:
        if len(columns) == 0:
            continue
        df[columns] = _replace_anomalous_values(df, columns, gmm, mask, fill_value)

    return df

In [None]:
# State carried into the prediction loop.
new_df = pd.DataFrame()  # collects every day's observations together with their predictions
offset = 20              # how many of the latest distinct timestamps are kept as history

# Trimmed df for fast lag computation, because we don't need all values, just few last ones
temp_dates = market_train_df['time'].unique()[-offset:]
st_date, end_date = temp_dates[0], temp_dates[-1]

temp_df = market_train_df.loc[
    (market_train_df['time'] >= st_date) & (market_train_df['time'] <= (end_date))
]

In [None]:
# Prediction loop: for every prediction day, clean the observations, rebuild the lag
# features from the rolling history in `temp_df`, predict, and submit.
LAG_DAYS = (1, 3, 5, 10)
LAG_SOURCES = ('returnsClosePrevRaw1', 'returnsOpenPrevRaw1')
lag_columns = ['{}_{}day_lag'.format(source, lag) for lag in LAG_DAYS for source in LAG_SOURCES]
feature_keys = keys[:-1]   # model inputs; the last key is the target
daily_frames = []          # one frame per day, concatenated once after the loop

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    market_obs_df = fix_anomalies(market_obs_df, keysRaw, keysMktres[:-1], maskRaw, maskMktres,  gmmMktres, gmmRaw)
    # Float placeholder (zeros_like on the time column would yield a datetime-typed column).
    market_obs_df['pred'] = 0.0

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call.
    temp_df = pd.concat([temp_df, market_obs_df], sort=False, ignore_index=True)

    # Make lagged features over the rolling history (shifted within each asset)
    for lag in LAG_DAYS:
        for source in LAG_SOURCES:
            temp_df['{}_{}day_lag'.format(source, lag)] = temp_df.groupby('assetCode')[source].shift(lag)

    # Put lags to market_obs_df: today's rows are exactly the rows just appended, i.e.
    # the tail of temp_df in the same order, so they are copied over positionally.
    todays_rows = temp_df.iloc[len(temp_df) - len(market_obs_df):]
    for lag_column in lag_columns:
        market_obs_df[lag_column] = todays_rows[lag_column].values

    market_obs_df[feature_keys] = market_obs_df[feature_keys].fillna(0)

    # Predict only for the assets requested by the template.
    to_predict = market_obs_df['assetCode'].isin(predictions_template_df.assetCode)
    obs_data = market_obs_df.loc[to_predict, feature_keys]
    if len(obs_data) > 0:
        pred = np.clip(lgbm_model.predict(obs_data), -1, 1)
    else:
        pred = np.zeros(0)
    # Write through the mask so a template that covers only some of the observed
    # assets does not cause a length mismatch.
    market_obs_df.loc[to_predict, 'pred'] = pred
    daily_frames.append(market_obs_df)

    # Trim df to last #offset values
    temp_dates = temp_df['time'].unique()[-offset:]

    st_date = temp_dates[0]
    end_date = temp_dates[-1]

    temp_df = temp_df[(temp_df['time'] >= st_date) & (temp_df['time'] <= (end_date))]

    # Match predictions to the template by assetCode rather than by row position;
    # template assets without an observation get a neutral 0.
    pred_by_asset = dict(zip(market_obs_df.loc[to_predict, 'assetCode'].values, pred))
    predictions_template_df['confidenceValue'] = predictions_template_df['assetCode'].map(pred_by_asset).astype(float).fillna(0.0).values
    env.predict(predictions_template_df)

# Single concatenation instead of growing new_df inside the loop (which is quadratic).
# Keeping the existing new_df in the list preserves the column-sorted result.
new_df = pd.concat([new_df] + daily_frames, sort=True)
print('Done!')

In [None]:
# Offline score over the rows collected during the prediction loop.
# This cell overwrites returnsOpenPrevMktres10 in place, so running it twice shifts the
# column twice; the two commented lines are a manual backup / restore for that case.
#stored_new_df = deepcopy(new_df)
#new_df = deepcopy(stored_new_df)

# Within each asset, pull the value found 11 rows later onto the current row;
# rows without such a later value get 0.
shifted_returns = new_df.groupby('assetCode')['returnsOpenPrevMktres10'].shift(-11)
new_df['returnsOpenPrevMktres10'] = shifted_returns
new_df['returnsOpenPrevMktres10'] = new_df['returnsOpenPrevMktres10'].fillna(0)

new_df['conf_pred'] = new_df['pred'] * new_df['returnsOpenPrevMktres10']
confidence_value = new_df.groupby('time')['conf_pred'].sum()

conf = np.mean(confidence_value)/np.std(confidence_value)
print(conf)

In [None]:
# Ask the competition environment to write the submission file; the predictions were
# handed to it day by day through env.predict() in the loop above.
env.write_submission_file()

In [None]:
# We've got a submission file!
# List every entry of the working directory whose name contains '.csv'.
import os
csv_names = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_names)