# 01 - Holdout Simples

É a maneira mais simples de validar um modelo: usamos a parte inicial dos dados (as primeiras eras) para treino e a parte final (as últimas eras) para validação.

## Importação

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

## Carga dos Dados

In [2]:
# Load the raw Numerai files. Only the rows flagged as 'validation' in the
# tournament file are kept and used as the test set.
train = pd.read_csv("data-raw/numerai_training_data.csv")
test = pd.read_csv("data-raw/numerai_tournament_data.csv")
# .copy() detaches the filtered rows from the original frame, so the 'era'
# assignment below writes to an independent DataFrame rather than to a slice
# (which can raise SettingWithCopyWarning).
test = test[test['data_type'] == 'validation'].copy()

# Turn era labels matching 'era<digits>' into plain integers.
# expand=False makes str.extract return a Series instead of a one-column frame.
train['era'] = train['era'].str.extract(r'era(\d+)', expand=False).astype(int)
test['era'] = test['era'].str.extract(r'era(\d+)', expand=False).astype(int)

# reset_index returns a new frame; the result must be assigned to take effect.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Make sure the output folder exists so the notebook runs on a fresh checkout.
Path('data-processed').mkdir(parents=True, exist_ok=True)
train.to_csv('data-processed/train.csv', index=False)
test.to_csv('data-processed/test.csv', index=False)

In [3]:
# Reload both processed files from disk, so the cells below work from the
# saved CSVs rather than from the in-memory frames built above.
processed_paths = ('data-processed/train.csv', 'data-processed/test.csv')
train, test = map(pd.read_csv, processed_paths)

In [4]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


In [5]:
# Preview the last two rows of the training frame.
train.tail(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
501806,nffaab4e1cacc4b1,120,train,0.25,0.25,0.25,0.5,0.0,1.0,1.0,...,0.75,0.75,0.75,0.75,0.75,0.5,0.5,0.25,0.75,0.5
501807,nffba5460b572cfa,120,train,0.75,0.5,0.5,0.75,0.75,0.0,0.0,...,0.5,0.5,0.25,0.5,0.75,1.0,0.25,0.75,0.5,0.5


In [6]:
# Preview the first two rows of the test frame.
test.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n0003aa52cab36c2,121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,...,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.25
1,n000920ed083903f,121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,...,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.5


In [7]:
# Preview the last two rows of the test frame.
test.tail(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
137777,nffc376c3127112d,212,validation,0.0,0.75,0.75,0.0,0.75,0.5,0.75,...,0.5,0.5,0.75,0.75,0.25,0.25,0.75,0.5,0.5,1.0
137778,nffe5d84fb971b26,212,validation,0.0,0.5,0.5,0.25,0.5,0.25,0.25,...,0.25,0.0,0.75,0.5,0.0,0.0,0.75,0.25,0.5,0.5


In [8]:
# (rows, columns) of the training and test frames, in that order.
train.shape, test.shape

((501808, 314), (137779, 314))

## Holdout 50/50

Separamos os dados por era: as eras até 60 ficam para treino e as eras acima de 60 ficam para validação.

In [9]:
# Era-based holdout: rows with era <= 60 are used for training and rows with
# era > 60 for validation. Splitting on era keeps whole eras on one side.
data = pd.read_csv('data-processed/train.csv')

last_train_era = 60
era = data['era']
train = data[era <= last_train_era]
val = data[era > last_train_era]

# Feature matrices keep every column whose name contains 'feature';
# the label is the 'target' column.
X_train, y_train = train.filter(regex=r'feature'), train['target']
X_val, y_val = val.filter(regex=r'feature'), val['target']

In [10]:
%%time
from lightgbm import LGBMRegressor

mdl = LGBMRegressor(max_depth=5, num_leaves=2**5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, random_state=0)

mdl.fit(X_train, y_train)

Wall time: 1min 4s


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [11]:
# Score the validation split: rank the raw predictions into percentiles
# (ties broken by order of appearance), then take the Pearson correlation
# between the targets and those ranks.
predictions = pd.Series(mdl.predict(X_val))
ranked_predictions = predictions.rank(method="first", pct=True)

corr_matrix = np.corrcoef(y_val, ranked_predictions)
correlation = corr_matrix[0, 1]  # off-diagonal entry = target vs. ranked predictions

print(correlation)

0.04070674069038811


In [12]:
# Score the same model on the processed test file, using the same metric as
# on the validation split (correlation of targets with percentile-ranked
# predictions).
test = pd.read_csv("data-processed/test.csv")

X_test, y_test = test.filter(regex=r'feature'), test['target']

predictions = pd.Series(mdl.predict(X_test))
ranked_predictions = predictions.rank(method="first", pct=True)

corr_matrix = np.corrcoef(y_test, ranked_predictions)
correlation = corr_matrix[0, 1]  # off-diagonal entry = target vs. ranked predictions

print(correlation)

0.02124717034088588


## Holdout 70/30

In [13]:
# Era-based holdout with a later cut-off: rows with era <= 84 are used for
# training and rows with era > 84 for validation.
data = pd.read_csv('data-processed/train.csv')

last_train_era = 84
era = data['era']
train = data[era <= last_train_era]
val = data[era > last_train_era]

# Feature matrices keep every column whose name contains 'feature';
# the label is the 'target' column.
X_train, y_train = train.filter(regex=r'feature'), train['target']
X_val, y_val = val.filter(regex=r'feature'), val['target']

In [14]:
%%time
from lightgbm import LGBMRegressor

mdl = LGBMRegressor(max_depth=5, num_leaves=2**5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, random_state=0)
mdl.fit(X_train, y_train)

Wall time: 1min 14s


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [15]:
# Validation score for the 70/30 model: correlation between the targets and
# the percentile-ranked predictions (ties broken by order of appearance).
predictions = pd.Series(mdl.predict(X_val))
ranked_predictions = predictions.rank(method="first", pct=True)

corr_matrix = np.corrcoef(y_val, ranked_predictions)
correlation = corr_matrix[0, 1]  # off-diagonal entry = target vs. ranked predictions

print(correlation)

0.042735723025595196


In [16]:
# Test score for the 70/30 model, computed with the same metric as the
# validation score.
test = pd.read_csv("data-processed/test.csv")

X_test, y_test = test.filter(regex=r'feature'), test['target']

predictions = pd.Series(mdl.predict(X_test))
ranked_predictions = predictions.rank(method="first", pct=True)

corr_matrix = np.corrcoef(y_test, ranked_predictions)
correlation = corr_matrix[0, 1]  # off-diagonal entry = target vs. ranked predictions

print(correlation)

0.022500048079699582


## Retrain Test Score

In [17]:
# No holdout here: every row of the processed training file is used for
# fitting, so the only evaluation left is the test file.
data = pd.read_csv('data-processed/train.csv')

X_train, y_train = data.filter(regex=r'feature'), data['target']

In [18]:
%%time
from lightgbm import LGBMRegressor

mdl = LGBMRegressor(max_depth=5, num_leaves=2**5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, random_state=0)

mdl.fit(X_train, y_train)

Wall time: 2min


LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32, random_state=0)

In [19]:
# Test score for the model trained on all training rows, computed with the
# same metric as the earlier sections so the numbers are comparable.
test = pd.read_csv("data-processed/test.csv")

X_test, y_test = test.filter(regex=r'feature'), test['target']

predictions = pd.Series(mdl.predict(X_test))
ranked_predictions = predictions.rank(method="first", pct=True)

corr_matrix = np.corrcoef(y_test, ranked_predictions)
correlation = corr_matrix[0, 1]  # off-diagonal entry = target vs. ranked predictions

print(correlation)

0.024965507088286824


# Fim