In [None]:
# Use LGBM with probs

# %load_ext line_profiler
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import gc
import xgboost as xgb
import lightgbm as lgb
from xgboost import plot_importance
import warnings
%pylab inline
pylab.rcParams['figure.figsize'] = (20, 20)
from kaggle.competitions import twosigmanews
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
fold = StratifiedKFold(n_splits=3, shuffle=True)
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
from sklearn.model_selection import train_test_split, TimeSeriesSplit
# Going to use these 5 base models for the stacking
# from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
#                               GradientBoostingClassifier, ExtraTreesClassifier)
pylab.rcParams['figure.figsize'] = (20, 20)

# Silence DeprecationWarning output from the libraries imported above.
# (The __main__ guard is presumably always true when run as a notebook cell.)
if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
gc.collect()

# Competition environment object: later cells read the market data from it,
# iterate env.get_prediction_days() and call env.write_submission_file().
env = twosigmanews.make_env()

In [None]:
def format_time(t, n=365):
    """Encode a periodic quantity as a (sin, cos) pair.

    `t` (scalar or array-like) is mapped to the angle ``t / n * 2 * pi``, so values that
    differ by a multiple of `n` receive the same encoding.

    Returns a 2-tuple ``(sin(angle), cos(angle))``.
    """
    fraction = t / n
    angle = fraction * 2 * np.pi
    sine = np.sin(angle)
    cosine = np.cos(angle)
    return sine, cosine

# NOTE(review): `_var07` is a private attribute of the env object; presumably it holds the
# market training frame — confirm against the competition API.
mt_df = env._var07
mt_df['time'] = pd.to_datetime(mt_df['time'])
# mt_df['date'] = pd.to_datetime(mt_df['time'].apply(lambda x: x.date))
# Index by (assetCode, time) first so that `target` and `universe` below carry that MultiIndex.
mt_df.set_index(['assetCode', 'time'], inplace=True)
target = mt_df['returnsOpenNextMktres10']
universe = mt_df['universe']
# Remove the label and the universe column from the frame that is turned into features.
mt_df.drop(['returnsOpenNextMktres10', 'universe'], axis=1, inplace=True)
# Back to a flat frame: get_x() sets the (assetCode, time) index itself.
mt_df.reset_index(inplace=True)
# mt_df.sort_index(inplace=True)
gc.collect()

In [None]:
# `env` is deliberately NOT deleted here: the submission cells at the end of the notebook
# still call env.get_prediction_days() and env.write_submission_file(), and would fail with
# a NameError if the object were dropped at this point.
gc.collect()

In [None]:
def make_features(df):
    """Build lag / difference features from one block of rows.

    ``get_x`` applies this per asset through ``groupby('assetCode').apply`` so that the
    shifts and differences below operate within a single asset's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``returnsClosePrevMktres1``. If any column name contains
        "volume", a column named ``volume`` must exist.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with, in this order:
        * ``returnsClosePrevMktres1_ewm_64`` – EWM standard deviation (``com=32``);
        * for every column whose name contains "returns": ``<col>_lag1``,
          ``<col>_lag1_diff``, ``<col>_lag1_absdiff`` and ``<col>_lag_1_sumlag``;
        * ``v_pct_d`` – percentage change of ``volume`` (when a volume column exists),
          placed where the first "volume" column occurs in ``df``.

    Raises
    ------
    KeyError
        If ``returnsClosePrevMktres1`` is missing.
    """
    f = pd.DataFrame(index=df.index)

    # Loop-invariant, so computed once instead of once per column.
    # `.vol()` was an alias of `.std()` that was removed in pandas 1.0.
    f['returnsClosePrevMktres1_ewm_64'] = df['returnsClosePrevMktres1'].ewm(32).std()

    t = 1  # lag used in the feature names below
    for column in df.columns:
        if "volume" in column:
            f['v_pct_d'] = df['volume'].pct_change()
        if "returns" in column:
            s = df[column]
            name = str(s.name)
            delta = s.diff()  # reused for the signed and absolute variants
            f["%s_lag%i" % (name, t)] = s.shift(1)
            f['%s_lag%i_diff' % (name, t)] = delta
            f['%s_lag%i_absdiff' % (name, t)] = delta.abs()
            f['%s_lag_%i_sumlag' % (name, t)] = s + s.shift(1)

    return f

def mean_features(df):
    """Add per-timestamp mean and de-meaned versions of the return / volume columns.

    For every column whose name contains "returns" or "volume":
    * ``<col>_amean``  – mean of the column over all rows sharing the same ``time``;
    * ``<col>_demean`` – the column minus that mean.

    `df` must have an index level named ``time``. A new frame is returned holding the
    original columns followed by all ``_amean`` columns and then all ``_demean`` columns.
    """
    selected = [col for col in df.columns if "returns" in col or "volume" in col]

    # One row per timestamp, one column per selected input column.
    per_time = pd.DataFrame()
    for col in selected:
        per_time['%s_amean' % col] = df[col].groupby('time').mean()

    # Broadcast the per-timestamp means back onto every row.
    joined = df.join(per_time, how='left')
    for col in selected:
        joined["%s_demean" % col] = joined[col] - joined['%s_amean' % col]
    return joined



In [None]:
def get_x(mt_df):
    """Assemble the feature matrix from the flat market frame.

    Parameters
    ----------
    mt_df : pandas.DataFrame
        Flat frame with at least ``assetCode`` and ``time`` columns plus whatever
        ``mean_features`` and ``make_features`` read. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (assetCode, time). Holds the output of ``mean_features`` (input columns
        plus ``_amean`` / ``_demean`` columns), the per-asset columns from
        ``make_features`` and six sin/cos calendar encodings: ``t_s``/``t_c`` (day of
        year), ``m_s``/``m_c`` (day of month) and ``w_s``/``w_c`` (day of week).
    """
    # .assign returns a new frame, so the datetime conversion does not leak into the caller.
    indexed = (
        mt_df
        .assign(time=pd.to_datetime(mt_df['time']))
        .set_index(['assetCode', 'time'])
    )

    # group_keys=False keeps the original (assetCode, time) index on the result. With the
    # pandas >= 2.0 default the group key is prepended as an extra level, which would stop
    # the column-wise concat below from aligning. Older pandas gives the same result.
    per_asset = indexed.groupby('assetCode', group_keys=False).apply(make_features)

    m = pd.concat([mean_features(indexed), per_asset], axis=1)

    when = m.index.get_level_values(1)  # the 'time' level
    m['t_s'], m['t_c'] = format_time(when.dayofyear)
    # NOTE(review): day-of-month spans 1-31 and dayofweek spans 0-6, so periods of 31 and 7
    # would be the usual choice; 30 and 6 are kept to leave the feature values unchanged.
    m['m_s'], m['m_c'] = format_time(when.day, n=30)
    m['w_s'], m['w_c'] = format_time(when.dayofweek, n=6)
    return m

In [None]:
# Build the full feature matrix (indexed by assetCode, time) from the market frame.
gc.collect()
X = get_x(mt_df)

In [None]:
# Binary label: True where `target` (the returnsOpenNextMktres10 column) is >= 0.
y = target >=0
# def get_last_days(stack, n=1, column='time'):
#     return stack.set_index('time').last(str(n)+'D').reset_index()

# m_stack = get_last_days(mt_df, 3)

In [None]:
# Drop the raw market frame now that X, target and universe have been derived from it.
# NOTE(review): a later cell still calls get_last_days(mt_df, n=300) — confirm that cell is obsolete.
del(mt_df)
# del(env)
gc.collect()

In [None]:
# Remove the assetName column and move assetCode out of the index, leaving 'time' as the
# only index level so the next cell can slice rows by year with .loc.
# Both calls mutate X in place, so this cell cannot be re-run without rebuilding X.
X.drop('assetName', axis=1, inplace=True)
X.reset_index('assetCode', inplace=True)

In [None]:
def _by_period(frame, period):
    """Select the rows of a time-indexed frame for `period`, re-keyed by (assetCode, time)."""
    return frame.loc[period].reset_index().set_index(['assetCode', 'time'])


# Labels as a one-column frame indexed by time only, mirroring the layout of X.
y_by_time = y.to_frame().reset_index('assetCode')

# Chronological split: 2009-2015 -> train, 2016 -> test, 2017 onwards -> valid.
train_period = slice('2009', '2015')
test_period = '2016'
valid_period = slice('2017', None)

X_train = _by_period(X, train_period)
y_train = _by_period(y_by_time, train_period)
X_test = _by_period(X, test_period)
y_test = _by_period(y_by_time, test_period)
X_valid = _by_period(X, valid_period)
y_valid = _by_period(y_by_time, valid_period)

In [None]:
# Continuous target values for exactly the rows that make up y_train.
t_train = target.loc[y_train.index]

In [None]:
# Absolute correlation of every training feature with the continuous target,
# sorted ascending (strongest features last).
corr = pd.Series(
    {feature: t_train.corr(X_train[feature]) for feature in X_train.columns}
).abs().sort_values()

In [None]:
from scipy.stats import zscore

# Keep only the features whose |correlation| lies above the 80th percentile.
corr_cutoff = corr.quantile(0.8)
train_cols = corr[corr > corr_cutoff].index

# Restrict to those columns and discard rows with any missing value.
X_train = X_train[train_cols].dropna()
y_train = y_train.loc[X_train.index]

# Discard rows where any feature lies 3 or more standard deviations from its column mean.
within_three_sigma = (np.abs(zscore(X_train)) < 3).all(axis=1)
X_train = X_train[within_three_sigma]
y_train = y_train.loc[X_train.index]

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet, RidgeClassifier, SGDClassifier
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier, LGBMModel
from sklearn.decomposition import PCA

# Candidate pipelines. Each imputes missing values and standardises the features first;
# `et` and `en` additionally project onto 4 principal components.
et = make_pipeline(SimpleImputer(), StandardScaler(), PCA(4), ExtraTreesClassifier(n_estimators=100, max_depth=20, min_samples_split=2, n_jobs=-1))
# rf = make_pipeline(SimpleImputer(), StandardScaler(), PCA(4), RandomForestClassifier())
# ridge = make_pipeline(SimpleImputer(), StandardScaler(), PCA(4), RidgeClassifier())
# NOTE(review): newer scikit-learn releases spell this loss "log_loss"; "log" is kept for
# the library version this notebook was written against.
en = make_pipeline(SimpleImputer(), StandardScaler(), PCA(4), SGDClassifier(loss="log", penalty="elasticnet"))
lg = make_pipeline(SimpleImputer(), StandardScaler(), LGBMClassifier(max_depth=4))

# Kept for the (currently disabled) voting ensemble below.
est = [('et', et), ('en', en), ('lg', lg)]
model = lg
# model = VotingClassifier(estimators=est, voting='hard')

gc.collect()
# Fit exactly once. The cell previously called model.fit twice in a row, which refits the
# pipeline from scratch and only doubled the training time.
model.fit(X_train, y_train)

In [None]:
# Turn the boolean class predictions on the validation window into a +1 / -1 signal.
# np.where is used instead of `.apply(reformat)` because `reformat` is only defined in a
# later cell, so this cell raised NameError when the notebook was run top to bottom.
valid_pred = model.predict(X_valid[train_cols])
z = pd.Series(np.where(valid_pred == True, 1, -1), index=y_valid.index)


In [None]:
def s(r):
    """Return the mean of `r` divided by its standard deviation."""
    average = r.mean()
    spread = r.std()
    return average / spread

# Element-wise map of a prediction to +1 (equal to True) or -1 (anything else).
reformat = np.vectorize(lambda x: 1 if x == True else -1)

# Signal times the previous 1-day return, multiplied by `universe`, on the rows of z.
norm = z * (X_valid['returnsClosePrevMktres1'] * universe).loc[z.index]
norm_returns = norm.groupby('time').sum()  # per-timestamp total; not used again in this cell
# EWM std of `norm` (com=5) divided by its own mean, so the series averages to 1.
# NOTE(review): the EWM runs over the stacked rows of `norm`, not over `norm_returns` — confirm intended.
norm_r_std = norm.ewm(5).std()/norm.ewm(5).std().mean()

# Shrink the signal where norm_r_std is large (2.3 is not explained in this notebook),
# clip to [-1, 1] and weight by target * universe; then total per timestamp.
a = (z / (norm_r_std*2.3)).clip(-1,1) * (target * universe).loc[z.index]
b = a.groupby('time').sum()

# mean/std ratio of the per-timestamp totals
s(b)

In [None]:
# Histogram of the scaled and clipped signal used in the cell above.
(z/(norm_r_std*2.3)).clip(-1,1).plot.hist()

In [None]:
# Sanity check: norm_r_std was divided by its own mean, so this is expected to be ~1.
norm_r_std.mean()

In [None]:
# EWM std (com=5) of the per-timestamp totals, divided by 6.76 (constant not explained in this notebook).
b.ewm(5).std()/6.76

In [None]:
# Per-timestamp sum of returnsOpenPrevMktres10 over the validation rows, then EWM std (com=5),
# divided by 14 (constant not explained in this notebook).
j = X_valid['returnsOpenPrevMktres10'].groupby('time').sum().ewm(5).std()/14

In [None]:
# Mean/std ratio of the per-timestamp totals (the same quantity s(b) returns).
b.mean()/b.std()

In [None]:
# Removed the stray fragment `returnsOpenPrevMktres10.group`: no variable of that name is
# defined anywhere in the notebook, so the cell raised NameError on every run. The grouped
# version of that column is already computed as `j` in an earlier cell.

In [None]:
# NOTE(review): `j` is built from X_valid, which is sliced from '2017' onwards, so a 2008
# lookup looks out of range — confirm the intended year.
j['2008']

In [None]:
# Mean/std ratio of the EWM-smoothed (com=5) per-timestamp totals.
s(b.ewm(5).mean())

In [None]:
# Show the feature names that survived the correlation filter.
train_cols

In [None]:
def sharpe(model):
    """Print a mean/std score of `model` on the train, test and valid splits, in that order.

    For each split the predictions are mapped to +1 / -1, multiplied by
    ``target * universe`` on the same rows, summed per ``time`` and reduced to
    ``mean / std`` of those per-time totals. Reads the notebook-level ``target``,
    ``universe``, ``train_cols`` and the ``X_*`` / ``y_*`` splits. Returns None.
    """
    to_sign = np.vectorize(lambda pred: 1 if pred == True else -1)

    def evaluate(X, y):
        signal = pd.Series(model.predict(X), y.index).apply(to_sign)
        weighted = signal * (target * universe).loc[signal.index]
        per_time = weighted.groupby('time').sum()
        return (per_time.mean()/per_time.std())

    splits = ((X_train, y_train), (X_test, y_test), (X_valid, y_valid))
    for features, labels in splits:
        print(evaluate(features[train_cols], labels))

sharpe(model)

In [None]:
# Class probabilities on the 2016 split. The model was fitted on X_train[train_cols], so the
# same column subset must be passed here; the full X_test has a different set of features.
z = model.predict_proba(X_test[train_cols])

In [None]:
# Spread of the second-column probability around 0.5 on the test rows.
pd.Series(z[:,1] - .5, index=X_test.index).std()

In [None]:
# Peek at the last few labels.
y.tail()

In [None]:
pylab.rcParams['figure.figsize'] = (20, 20)
# `model` is a scikit-learn Pipeline, which does not expose feature_importances_ itself;
# read them from its final step. A bare estimator is used as-is.
final_estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
# The model was fitted on `train_cols`, so those are the names matching the importances
# (X_test.columns still holds every column and has a different length).
pd.Series(final_estimator.feature_importances_, index=train_cols).sort_values().plot.bar()

In [None]:

def reformat_proba(pp, k):
    """Convert two-column class probabilities into a clipped confidence column.

    `pp` is a 2-D array whose columns 0 and 1 are read. The result is
    ``(pp[:, 1] - pp[:, 0]) * k`` as an (n, 1) column, clipped to [-1, 1].
    """
    margin = pp[:, 1] - pp[:, 0]
    scaled = margin.reshape((-1, 1)) * k
    return np.clip(scaled, -1, 1)


In [None]:
# m = m_stack.copy()
# m['date'] = m['time'].apply(lambda x: x.date)
# m = m.set_index(['assetCode', 'date'], drop = True)

In [None]:
# %lprun -f group_features format_market_obs(mt_df)
# %lprun -f format_news format_news(n_df, mt_df)

In [None]:
# %lprun -f group_features m.groupby('assetCode').apply(group_features)

In [None]:
# X, y = get_xy(mt_df, n_df)
# i = int(len(X)/2)
# X_train, y_train = X.head(i), y.head(i)
# X_test, y_test = X.tail(i), y.tail(i)
# del(X, y)
# gc.collect()

In [None]:
# m_stack = get_last_days(mt_df,n=300)

In [None]:
# %%time
# X, y = get_xy(mt_df, n_df)

In [None]:
# %%time
# X, y = get_xy(mt_df)

# # del(mt_df)
# # gc.collect()
# model = xgb.XGBClassifier(n_jobs=4, learning_rate=0.01, n_estimators=250, max_depth=2)
# # params = {'alpha': 7.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.125, 'gamma': 0.9, 'lambda': 1.7000000000000002, 'max_depth': 13, 'min_child_weight': 10.0, 'n_estimators': 393, 'nthread': -1, 'objective': 'binary:logistic', 'seed': 0, 'subsample': 0.9500000000000001}
# params= {}
# # model = xgb.XGBClassifier(**params)
# model.fit(X,y)
# pylab.rcParams['figure.figsize'] = (20, 20)
# plot_importance(model)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.33, random_state=42)

In [None]:
# Put in our parameters for said classifiers
# LightGBM parameters
# (the key used to be spelled "n_jobs=", which is not a parameter name and so never
# configured the thread count)
lgbm_params = {"n_jobs": -1,
               'max_depth': -1}

# Extra Trees Parameters
et_params = {
#     'n_jobs': -1,
    'n_estimators': 15,
}


In [None]:
# Reclaim memory before building the prediction-time stack.
gc.collect()

In [None]:
# Seed the rolling market stack used by the submission loop.
# NOTE(review): `get_last_days` exists only as commented-out code earlier in this notebook,
# and `mt_df` is deleted by an earlier cell, so this cell looks unable to run top to bottom — confirm.
m_stack = get_last_days(mt_df,n=300)

del(mt_df)
# del(env)
# del(X)
gc.collect()

In [None]:
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
# LightGBM classifier configured from lgbm_params (LGBMClassifier is imported in an earlier cell).
lgbm_model = LGBMClassifier(**lgbm_params)
# Impute missing values, then a ridge classifier with built-in cross-validation.
ridge_pipe = make_pipeline(SimpleImputer(), RidgeClassifierCV())
# rf_pipe = make_pipeline(SimpleImputer(), RandomForestClassifier(**rf_params))
# et_pipe = make_pipeline(SimpleImputer(), ExtraTreesClassifier(**et_params))

In [None]:
# scores = cross_val_score(lgbm_model, X, y)

In [None]:
# scores.mean()

In [None]:
# %%time

# def score_c(clf, X, y):
#     z = clf.predict_proba(X)
#     z = z[:, 1] - 0.5
#     zs = pd.Series(np.clip(z*k,-1,1), index=y.index)
#     result = (zs*target).mean()/(zs*target).std()
#     print("Fold sharpe", result)
#     return result

# model = LGBMClassifier()
# model.fit(X, y)
# # scores = cross_val_score(model, X, y)
# # print(scores.mean(), scores.std())
# z = model.predict_proba(X)
# z = z[:,1] - 0.5
# k = .5/z.std()
# print("k", k)

# # scores = cross_val_score(model, X, y, scoring=score_c, cv=fold)
# # print(scores.mean(), scores.std())

In [None]:
# from sklearn.feature_selection import RFE, RFECV
# selector = RFECV(lgbm_model, verbose=1, cv=fold)
# X_i = SimpleImputer().fit_transform(X)
# selector.fit(X_i, y)

In [None]:
# def loglikelihood(preds, train_data):
#     labels = train_data.get_label()
#     preds = 1. / (1. + np.exp(-preds))
#     grad = preds - labels
#     hess = preds * (1. - preds)
#     return grad, hess


In [None]:
# pd.Series(selector.support_, index=X.columns).sort_values()

In [None]:
# list(zip(X.columns, selector.support_))

In [None]:
%%time
# def test(m_stack,X):
days = env.get_prediction_days()
n_stack = pd.DataFrame()
# X_stack = X
i=0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    start = time.time()
    i+=1
    m_stack = m_stack.append(market_obs_df)  
    X_today, y = get_xy(m_stack.set_index(['time']).sort_index().last('300D').reset_index(), last=True)
#     predictions = pd.Series(model.predict(X_today), index=X_today.index.get_level_values(0))
    proba = reformat_proba(model.predict_proba(X_today), k)
    predictions = pd.Series(proba.flatten(), index=X_today.index.get_level_values(0))
    predictions_template_df.set_index('assetCode', inplace=True)
#     predictions_template_df.confidenceValue = predictions.apply(reformat)
    predictions_template_df.confidenceValue = predictions
    predictions_template_df.confidenceValue = predictions_template_df.confidenceValue.fillna(0)
    predictions_template_df.reset_index(inplace=True)
    env.predict(predictions_template_df)
    stop = time.time()
    duration = stop-start
    print(i, duration, sum(predictions)/len(predictions))

In [None]:
# x, y = get_xy(market_obs_df, news_obs_df)
# make_predictions(predictions_template_df, market_obs_df, news_obs_df)

In [None]:
# x, y = get_xy(market_obs_df, news_obs_df)

In [None]:
# Ask the competition environment to write the submission file from the predictions made above.
env.write_submission_file()