In [None]:
import gc
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from kaggle.competitions import twosigmanews

In [None]:
# Create the competition environment and unpack the two training frames it
# returns (market data first, news data second).
env = twosigmanews.make_env()
market_train, news_train = env.get_training_data()

In [None]:
# Keep only rows whose 'time' falls on or after 2013-01-01 in both frames,
# renumbering each index from zero afterwards.
start = datetime(2013, 1, 1).date()
market_train = market_train[market_train['time'].dt.date >= start].reset_index(drop=True)
news_train = news_train[news_train['time'].dt.date >= start].reset_index(drop=True)

In [None]:
# Preview the first five market rows.
market_train.head(5)

In [None]:
# Preview the first five news rows.
news_train.head(5)

In [None]:
def preprocess_news(news_train):
    """Prepare the raw news frame for unstacking and aggregation.

    Steps:
      1. Drop the text/timestamp columns listed in ``drop_list`` (the list
         was previously built but never applied). Columns that are absent
         are ignored.
      2. Replace 'headlineTag', 'provider' and 'sourceId' with integer codes
         from ``pd.factorize``.
      3. Strip the first and last character of each 'assetCodes' string and
         remove single quotes, leaving a ", "-separated list of codes
         (assumes values look like "{'A', 'B'}" -- TODO confirm).

    Args:
        news_train: news DataFrame containing at least 'headlineTag',
            'provider', 'sourceId' and 'assetCodes'.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    drop_list = [
        'headline', 'sourceTimestamp', 'firstCreated', 'subjects', 'audiences', 'assetName'
    ]
    # drop() returns a new frame, so the assignments below do not touch the
    # caller's object.
    news_train = news_train.drop(columns=drop_list, errors='ignore')
    for col in ['headlineTag', 'provider', 'sourceId']:
        # Only the integer codes are needed; the uniques are discarded.
        news_train[col] = pd.factorize(news_train[col])[0]
    news_train['assetCodes'] = news_train['assetCodes'].apply(lambda x: x[1:-1].replace("'", ""))
    return news_train
# Rebind news_train to the preprocessed frame.
news_train = preprocess_news(news_train)

In [None]:
def unstack_asset_codes(news_train):
    """Build a lookup with one row per (news row, asset code) pair.

    Each 'assetCodes' value is split on ", " and every resulting code is
    paired with the index label of the news row it came from.

    Args:
        news_train: DataFrame with a string 'assetCodes' column whose index
            labels can be converted with ``int()``.

    Returns:
        DataFrame with columns 'news_index' (int) and 'assetCode' (str).
    """
    codes = []
    indexes = []
    # Series.items() is the supported spelling; Series.iteritems() was
    # deprecated and later removed from pandas.
    for i, values in news_train['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        # Repeat the row label once per code so both lists stay aligned.
        indexes.extend([int(i)] * len(explode))
    index_df = pd.DataFrame({'news_index': indexes, 'assetCode': codes})
    # Release the intermediate lists eagerly to limit peak memory.
    del codes, indexes
    gc.collect()
    return index_df

# Build the (news_index, assetCode) lookup and preview it.
index_df = unstack_asset_codes(news_train)
index_df.head()

In [None]:
# NOTE(review): this cell re-defines unstack_asset_codes, which is already
# defined in the previous cell; this later definition is the one in effect.
def unstack_asset_codes(news_train):
    """Build a lookup with one row per (news row, asset code) pair.

    Each 'assetCodes' value is split on ", " and every resulting code is
    paired with the index label of the news row it came from.

    Args:
        news_train: DataFrame with a string 'assetCodes' column whose index
            labels can be converted with ``int()``.

    Returns:
        DataFrame with columns 'news_index' (int) and 'assetCode' (str).
    """
    codes = []
    indexes = []
    # Series.items() is the supported spelling; Series.iteritems() was
    # deprecated and later removed from pandas.
    for i, values in news_train['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        # Repeat the row label once per code so both lists stay aligned.
        indexes.extend([int(i)] * len(explode))
    index_df = pd.DataFrame({'news_index': indexes, 'assetCode': codes})
    # Release the intermediate lists eagerly to limit peak memory.
    del codes, indexes
    gc.collect()
    return index_df

# Rebuild the lookup using the re-defined function above and preview it.
index_df = unstack_asset_codes(news_train)
index_df.head()

In [None]:
def merge_news_on_index(news_train, index_df):
    """Expand the news frame to one row per asset code.

    A 'news_index' column holding the frame's own index is added to
    ``news_train`` (the caller's frame is modified), and ``index_df`` is
    left-joined to it on that key.

    Args:
        news_train: news DataFrame with an 'assetCodes' column.
        index_df: lookup with 'news_index' and 'assetCode' columns.

    Returns:
        The joined frame without the 'news_index' and 'assetCodes' columns.
    """
    # Expose the row labels as a regular column so they can serve as join key.
    news_train['news_index'] = news_train.index.copy()

    # Left join keeps every (news_index, assetCode) pair from the lookup.
    joined = index_df.merge(news_train, how='left', on='news_index')
    helper_columns = ['news_index', 'assetCodes']
    return joined.drop(columns=helper_columns)

# Expand news to one row per asset code, then release the two inputs, which
# are not referenced again below.
news_unstack = merge_news_on_index(news_train, index_df)
del news_train, index_df
gc.collect()
news_unstack.head(5)

In [None]:
def group_news(news_frame):
    """Average the numeric news features per asset code and calendar date.

    Args:
        news_frame: DataFrame with an 'assetCode' column, a datetime 'time'
            column and the feature columns to aggregate.

    Returns:
        DataFrame with 'assetCode', 'date' and one float32 '<column>_mean'
        column per numeric/bool input column. The input frame is not
        modified.
    """
    # Group on the calendar date derived from 'time' without adding a column
    # to the caller's frame.
    date_key = news_frame['time'].dt.date.rename('date')

    # Aggregate only numeric/bool columns explicitly. Older pandas silently
    # discarded columns it could not average; that fallback was removed in
    # pandas 2.0, so text/datetime columns must not reach mean().
    value_cols = [
        c for c in news_frame.select_dtypes(include=['number', 'bool']).columns
        if c not in ('assetCode', 'date')
    ]

    aggregations = ['mean']
    gp = news_frame.groupby(['assetCode', date_key])[value_cols].agg(aggregations)
    # Flatten the (column, statistic) MultiIndex into 'column_statistic'.
    gp.columns = pd.Index(["{}_{}".format(e[0], e[1]) for e in gp.columns.tolist()])
    gp = gp.reset_index()
    # Set datatype to float32
    float_cols = {c: 'float32' for c in gp.columns if c not in ['assetCode', 'date']}
    return gp.astype(float_cols)

# Aggregate news per (assetCode, date), then release the unstacked frame.
news_agg = group_news(news_unstack)
del news_unstack; gc.collect()
news_agg.head(5)

In [None]:
# Left-join the daily news aggregates onto the market rows by asset and date;
# market rows without a match keep NaN in the news columns.
market_train['date'] = market_train.time.dt.date
df = market_train.merge(news_agg, how='left', on=['assetCode', 'date'])
# The merged frame replaces both inputs from here on.
del market_train, news_agg
gc.collect()
df.head(5)

In [None]:
def custom_metric(date, pred_proba, num_target, universe):
    """Mean divided by standard deviation of the daily summed signal.

    For each row, ``x = (2 * pred_proba - 1) * clip(num_target, -1, 1) *
    universe``. The ``x`` values are summed per ``date`` and the function
    returns ``mean(daily sums) / std(daily sums)`` using NumPy's default
    (population) standard deviation.

    Args:
        date: per-row grouping key (one value per prediction).
        pred_proba: per-row predictions; presumably probabilities in [0, 1],
            so that ``2 * p - 1`` lands in [-1, 1] -- TODO confirm.
        num_target: per-row numeric target; must support ``.clip``.
        universe: per-row multiplier (a 0/1 flag in this notebook).

    Returns:
        The ratio as a float, or 0.0 when the daily sums have zero standard
        deviation (e.g. a single day), instead of dividing by zero.
    """
    y = pred_proba * 2 - 1
    r = num_target.clip(-1, 1)  # get rid of outliers
    x = y * r * universe
    result = pd.DataFrame({'day': date, 'x': x})
    x_t = result.groupby('day').sum().values
    spread = np.std(x_t)
    # Guard the division: a zero spread would otherwise yield inf or nan.
    if spread == 0:
        return 0.0
    return np.mean(x_t) / spread

In [None]:
# Set aside the grouping key, numeric target, binary label and universe flag
# before they are removed from the feature matrix.
# NOTE: this cell is not re-runnable as-is; the in-place drop below removes
# the very columns read here.
date = df.date
num_target = df.returnsOpenNextMktres10.astype('float32')
bin_target = (df.returnsOpenNextMktres10 >= 0).astype('int8')  # 1 when the target is non-negative, else 0
universe = df.universe.astype('int8')
# Drop columns that are not features
df.drop(['returnsOpenNextMktres10', 'date', 'universe', 'assetCode', 'assetName', 'time'], 
        axis=1, inplace=True)
df = df.astype('float32')  # Set all remaining columns to float32 datatype
gc.collect()

In [None]:
# 70/30 random split of the row labels. A fixed random_state makes the split
# (and every score derived from it) reproducible across kernel restarts.
# NOTE(review): rows are shuffled regardless of date -- confirm that a random
# rather than chronological hold-out is intended.
train_index, test_index = train_test_split(df.index.values, test_size=0.3, random_state=42)

In [None]:
def evaluate_model(df, target, train_index, test_index, params):
    """Fit an LGBMClassifier on the training rows and score the held-out rows.

    Args:
        df: feature frame, indexed positionally via ``iloc``.
        target: label Series aligned with ``df``.
        train_index: row positions used for fitting.
        test_index: row positions used for scoring.
        params: keyword arguments forwarded to ``LGBMClassifier``.

    Returns:
        Log loss of the predicted probabilities on the test rows.
    """
    # XGBClassifier(**params) was the earlier alternative estimator here.
    classifier = LGBMClassifier(**params)
    train_features = df.iloc[train_index]
    train_labels = target.iloc[train_index]
    classifier.fit(train_features, train_labels)

    test_labels = target.iloc[test_index]
    test_proba = classifier.predict_proba(df.iloc[test_index])
    return log_loss(test_labels, test_proba)

In [None]:
# Disabled random search: 20 random draws from param_grid, each scored with
# evaluate_model, keeping the draw with the lowest log loss. A
# best_eval_score of 0 serves as the "no score yet" sentinel.
# param_grid = {
#     'learning_rate': [0.1, 0.5, 0.02, 0.01],
#     'num_leaves': [15, 30, 40, 65],
#     'n_estimators': [20, 30, 50, 100, 200]
# }
# best_eval_score = 0
# for i in range(20):
#     params = {k: np.random.choice(v) for k, v in param_grid.items()}
#     score = evaluate_model(df, bin_target, train_index, test_index, params)
#     if score < best_eval_score or best_eval_score == 0:
#         best_eval_score = score
#         best_params = params
# print("Best evaluation logloss", best_eval_score)

In [None]:
# Preview the float32 feature matrix.
df.head(5)

In [None]:
# Count missing values per feature column.
print(df.isnull().sum(axis=0))

In [None]:
# Checking feature correlations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Pairwise correlations of all features plus the binary target, which is
# appended as the last column.
corr = pd.concat([df, bin_target], axis=1).corr()
plt.figure(figsize=(14, 8))
# Title corrected: this notebook analyses market/news features, not house prices.
plt.title('Overall Correlation of Features and Target', fontsize=18)
# annot_kws was dropped because it has no effect while annot=False.
sns.heatmap(corr, annot=False, cmap='RdYlGn', linewidths=0.2)
plt.show()

In [None]:
# List the feature names; the '*_mean' names are the ones hard-coded in dropfeatureTooNa.
print(df.columns)

In [None]:
# Drop 'sourceId_mean' and the other aggregated news features, replacing them
# with a single flag: 1 if 'sourceId_mean' is not NaN for the row, otherwise 0.
def dropfeatureTooNa(df):
    """Replace the aggregated news features with one indicator column.

    Args:
        df: feature frame containing every column named in ``columns``.

    Returns:
        A new DataFrame without the listed news columns and with an int
        'have_new_influence' column appended (1 where 'sourceId_mean' is
        present, 0 where it is NaN). The input frame is not modified.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    columns = ['sourceId_mean', 'urgency_mean',
       'takeSequence_mean', 'provider_mean', 'bodySize_mean',
       'companyCount_mean', 'headlineTag_mean', 'marketCommentary_mean',
       'sentenceCount_mean', 'wordCount_mean', 'firstMentionSentence_mean',
       'relevance_mean', 'sentimentClass_mean', 'sentimentNegative_mean',
       'sentimentNeutral_mean', 'sentimentPositive_mean',
       'sentimentWordCount_mean', 'noveltyCount12H_mean',
       'noveltyCount24H_mean', 'noveltyCount3D_mean', 'noveltyCount5D_mean',
       'noveltyCount7D_mean', 'volumeCounts12H_mean', 'volumeCounts24H_mean',
       'volumeCounts3D_mean', 'volumeCounts5D_mean', 'volumeCounts7D_mean']
    # Derive the flag from the input before 'sourceId_mean' is dropped.
    has_news = df['sourceId_mean'].notna()
    # drop() returns a new frame, so the caller's df no longer gains a
    # 'have_new_influence' column as a side effect.
    new_df = df.drop(columns, axis=1)
    new_df['have_new_influence'] = has_news.astype('int64')
    return new_df
    
# Reduced feature matrix: market features plus the 'have_new_influence' flag.
new_df = dropfeatureTooNa(df)

In [None]:
# Preview the reduced feature matrix.
new_df.head(5)

In [None]:
# Checking feature correlations
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations of the reduced feature set plus the binary target,
# which is appended as the last column.
corr = pd.concat([new_df, bin_target], axis=1).corr()
plt.figure(figsize=(14, 8))
# Title corrected: this notebook analyses market/news features, not house prices.
plt.title('Overall Correlation of Reduced Features and Target', fontsize=18)
# annot_kws was dropped because it has no effect while annot=False.
sns.heatmap(corr, annot=False, cmap='RdYlGn', linewidths=0.2)
plt.show()

In [None]:
# Count missing values per column of the reduced feature matrix.
print(new_df.isnull().sum(axis=0))

In [None]:
# Fill the missing values of the four market-residual return columns with
# each column's own median, then report which columns still contain NaN.
for residual_col in ('returnsClosePrevMktres1',
                     'returnsOpenPrevMktres1',
                     'returnsClosePrevMktres10',
                     'returnsOpenPrevMktres10'):
    column_median = new_df[residual_col].dropna().median()
    new_df[residual_col] = new_df[residual_col].fillna(column_median)
print(new_df.isnull().sum(axis=0) > 0)

Now that the data has no missing values, I will train boosting models on this dataframe.

In [None]:
# Define the cross-validated RMSE helper.
# cross_val_score is imported from sklearn.model_selection (already used at
# the top of this notebook); the old sklearn.cross_validation module has been
# removed from scikit-learn.
from sklearn.model_selection import cross_val_score
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under 10-fold cross-validation.

    The model is scored on the notebook-level ``new_df`` features against
    ``bin_target`` using 'neg_mean_squared_error'; the scores are negated
    and square-rooted.

    Args:
        model: an unfitted scikit-learn compatible estimator.

    Returns:
        Array with one RMSE value per fold.
    """
    neg_mse = cross_val_score(model, new_df, bin_target, scoring='neg_mean_squared_error', cv=10)
    rmse = np.sqrt(-neg_mse)
    return rmse

In [None]:
from sklearn.linear_model import Ridge
# Disabled alpha sweep: mean rmse_cv score of Ridge for each candidate alpha,
# plotted against alpha.
# alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 70, 90, 100, 500, 1000, 2000]
# cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]

# cv_ridge = pd.Series(cv_ridge, index = alphas)
# cv_ridge.plot(title = "Validation")
# plt.xlabel("Alpha")
# plt.ylabel("Rmse")

For Ridge, the best alpha is 500.

In [None]:
# 500 looks like the optimal alpha level, so configure the Ridge model with
# this value. The estimator is only instantiated here, not fitted.
model_ridge = Ridge(alpha=500)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Initiating Gradient Boosting Regressor
# Instantiated only; it is fitted inside rmse_cv's cross-validation further down.
# NOTE(review): rmse_cv scores against bin_target, a 0/1 label, so this
# regressor is evaluated on a binary target -- confirm that is intended.
model_gbr = GradientBoostingRegressor(n_estimators=1200, 
                                      learning_rate=0.05,
                                      max_depth=4, 
                                      max_features='sqrt',
                                      min_samples_leaf=15, 
                                      min_samples_split=10, 
                                      loss='huber',
                                      random_state=5)  # fixed seed for repeatable fits

In [None]:
# Initiating XGBRegressor
# Instantiated only; the rmse_cv call for this model is commented out in the
# final cell.
import xgboost as xgb
model_xgb = xgb.XGBRegressor(colsample_bytree=0.2,
                             learning_rate=0.06,
                             max_depth=3,
                             n_estimators=1150)

In [None]:
import lightgbm as lgb
# Initiating LGBMRegressor model
# Instantiated only; the rmse_cv call for this model is commented out in the
# final cell. The bagging and feature-fraction seeds are fixed at 9.
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=4,
                              learning_rate=0.05, 
                              n_estimators=1080,
                              max_bin=75, 
                              bagging_fraction=0.80,
                              bagging_freq=5, 
                              feature_fraction=0.232,
                              feature_fraction_seed=9, 
                              bagging_seed=9,
                              min_data_in_leaf=6, 
                              min_sum_hessian_in_leaf=11)

In [None]:
from catboost import CatBoostRegressor

# Initiating CatBoost Regressor model
# Instantiated only; the fit call for this model is commented out in the
# final cell.
model_cat = CatBoostRegressor(iterations=2000,
                              learning_rate=0.10,
                              depth=3,
                              l2_leaf_reg=4,
                              border_count=15,
                              loss_function='RMSE',
                              verbose=200)

In [None]:
# Mean 10-fold RMSE per model. Only the gradient boosting regressor is
# evaluated; the other models are disabled.
# cv_ridge = rmse_cv(model_ridge).mean()
cv_gbr = rmse_cv(model_gbr).mean()
# cv_xgb = rmse_cv(model_xgb).mean()
# cv_lgb = rmse_cv(model_lgb).mean()
# cv_cat = model_cat.fit(new_df, bin_target)