In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')
RS = sum(list(map(ord, 'Dale Boca')))

import pandas as pd
pd.option_context('display.max_rows', None, 'display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

import numpy as np
import os;

### Preparación de datasets

In [2]:
from sklearn.preprocessing import LabelEncoder

def dataset_prep(window=3, neutral=True):
    
    train = pd.read_csv('data/trainset_va.csv', index_col=0)
    test  = pd.read_csv('data/testset_va.csv', index_col=0)
    
    cols1 = ['pos', 'neg', 'neu'] if neutral==True else ['pos', 'neg']
    
    for col in cols1:
        train[col+'w'] = train[col].rolling(window=window).sum()
        test[col+'w'] = test[col].rolling(window=window).sum()
    
    train['exc_ret'] = train['exc_ret'].shift(periods=-1)
    test['exc_ret']  = test['exc_ret'].shift(periods=-1)
    
    train.dropna(axis=0, inplace=True)
    test.dropna(axis=0, inplace=True)
    
    cols2 = [x+'w' for x in cols1]
    
    X = train[cols2]
    y = train.exc_ret
    Xt = test[cols2]
    yt = test.exc_ret
    
    le = LabelEncoder()
    y  = le.fit_transform(y)
    yt = le.fit_transform(yt)
    
    return X, Xt, y, yt    

### Entrenamiento de Clasificadores

In [12]:
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from scikitplot.metrics import plot_roc

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
#import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
#from catboost import CatBoostClassifier
from sklearn.svm import SVC

In [27]:
def train_classifier(X, Xt, y, yt):
    
    #model_name = ['Random Forest', 'XGBoost', 'LightGBM', 'Logistic Regression',
    #              'Catboost', 'Support Vector']
    model_name = ['Random Forest', 'XGBoost', 'Logistic Regression', 'Support Vector']
   
    model_init = [RandomForestClassifier(),
                  XGBClassifier(),
                  #lgb.LGBMClassifier(),
                  LogisticRegression(),
                  #CatBoostClassifier(),
                  SVC(probability=True)
                 ]
    params1 = {  
        "n_estimators": st.randint(10,100),    # Number of boosted trees to fit.
        "max_depth"   : st.randint(2, 25),     # Maximum tree depth for base learners.
    }
    
    params2 = {  
        "n_estimators": st.randint(10,100),    # Number of boosted trees to fit.
        "max_depth": st.randint(2, 25),         # Maximum tree depth for base learners.
        "learning_rate": st.uniform(0.01, 0.5), # Boosting learning rate (xgb’s “eta”)
        "colsample_bytree": st.beta(10, 1),     # Subsample ratio of columns when constructing each tree.
        "subsample": st.beta(10, 1),            # Subsample ratio of the training instance.
        "gamma": st.uniform(0, 10),             # Minimum loss reduction required to make a further partition on a leaf node of the tree.
        'reg_alpha': st.uniform(0.05,10),       # L1 regularization term on weights
        "min_child_weight": st.uniform(1,20),   # Minimum sum of instance weight(hessian) needed in a child.
    }
    
    params3 = {  
        "penalty"     : ['l1', 'l2'],
        "C"           : st.uniform(0.1, 10.),
    }
    
    params4 = {}
    
    params5 = {
        "C"           : st.uniform(0.1, 10.),
        #"kernel"      : ['lbf', 'linear']
    }
    
    #model_params = [params1, params2, params2, params3, params4, params5]
    model_params = [params1, params2, params3, params5]
    
    train_scores    = []
    test_scores     = []
    best_estimators = []
    best_parameters = []
    
    for name, mod, params in zip(model_name, model_init, model_params):
        model = mod
        kf    = KFold(n_splits=5, shuffle=True, random_state=RS)
        rgrid = RandomizedSearchCV(estimator=model, param_distributions=params, cv=kf,
                                   scoring='roc_auc', n_iter=50, verbose=1, n_jobs=-1)
        rgrid.fit(X, y)
        best_estimators.append(rgrid.best_estimator_)
        best_parameters.append(rgrid.best_params_)
        train_scores.append(rgrid.best_score_)
        ppred = rgrid.predict_proba(Xt)
        score = roc_auc_score(y_score=ppred[:, 1], y_true=yt)
        test_scores.append(score)
    
    results = pd.DataFrame(
        {'Model'      : model_name,
         'Train Score': train_scores,
         'Test Score' : test_scores,
         'Params'     : best_parameters,
         'Estimator'  : best_estimators
        })
    
    return results

In [28]:
import itertools
windows = np.arange(1, 6)
neutral = [1, 0]

resultado = pd.DataFrame()

for win, neu in itertools.product(windows, neutral):
    
    X, X_test, y, y_test = dataset_prep(window=win, neutral=neu)
    
    clfs = train_classifier(X, X_test, y, y_test)
  
    res = pd.DataFrame(
    {'Model'      : clfs['Model'],
     'Window'     : list(str(win)*clfs.shape[0]),
     'Neutral'    : list(str(neu)*clfs.shape[0]),
     'Train Score': clfs['Train Score'],
     'Test Score' : clfs['Test Score'],
     'Params'     : clfs['Params'],
     'Estimator'  : clfs['Estimator']
    })
    
    resultado = pd.concat([resultado, res], axis=0)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.0s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    1.2s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.3s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.4s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.9s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.5s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.9s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.3s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.8s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.1s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.4s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    3.4s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.6s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.1s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    2.6s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.4s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.3s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.4s


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.2s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.5s finished


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.7s finished


In [29]:
resultado.sort_values(by='Train Score', ascending=False).iloc[:, :-1]

Unnamed: 0,Model,Window,Neutral,Train Score,Test Score,Params
0,Random Forest,4,1,0.634764,0.446633,"{'max_depth': 21, 'n_estimators': 91}"
3,Support Vector,2,1,0.620285,0.525146,{'C': 1.021127977480073}
0,Random Forest,2,1,0.620146,0.429916,"{'max_depth': 21, 'n_estimators': 17}"
1,XGBoost,4,1,0.608756,0.50202,"{'colsample_bytree': 0.9558007758002638, 'gamma': 0.3592271437338024, 'learning_rate': 0.2347214582414079, 'max_depth': 13, 'min_child_weight': 1.5094203289648025, 'n_estimators': 45, 'reg_alpha': 1.4168590285168114, 'subsample': 0.8460337904893644}"
0,Random Forest,4,0,0.606338,0.431145,"{'max_depth': 7, 'n_estimators': 89}"
0,Random Forest,2,0,0.605955,0.484263,"{'max_depth': 2, 'n_estimators': 15}"
0,Random Forest,5,1,0.591777,0.403581,"{'max_depth': 8, 'n_estimators': 72}"
0,Random Forest,5,0,0.587454,0.422865,"{'max_depth': 15, 'n_estimators': 12}"
2,Logistic Regression,4,1,0.58548,0.419192,"{'C': 0.9361762042067422, 'penalty': 'l2'}"
1,XGBoost,2,1,0.583696,0.474367,"{'colsample_bytree': 0.9306406209261033, 'gamma': 2.002491473029553, 'learning_rate': 0.49513991006872526, 'max_depth': 11, 'min_child_weight': 5.159915326988246, 'n_estimators': 46, 'reg_alpha': 2.8400398934563573, 'subsample': 0.9487864294110588}"
