# 05 - Combinatorial Cross-Validation

In [None]:
from itertools import permutations

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

In [None]:
# Load the training CSV; the cross-validation cells below all read from `data`.
TRAIN_CSV = 'data-processed/train.csv'
data = pd.read_csv(TRAIN_CSV)

## Block CV (1-block)

É muito similar à validação cruzada que já conhecemos. Por exemplo, separamos os dados em 10 folds (atentando para manter as linhas de uma mesma era no mesmo bloco), numerados de 1 a 10. Treinamos nos blocos 2 a 10 e validamos no bloco 1. Depois treinamos com os blocos 1 e 3 a 10 e validamos com o bloco 2. E por aí vai.

Se você notou, estamos usando dados do futuro para treinar o modelo e dados do passado para validar...

In [None]:
# Group eras into blocks of ten: block = trunc(era * 0.1), so eras 0-9 fall in
# block 0, eras 10-19 in block 1, and so on.
era_block = np.trunc(data['era'] * .1).astype(int)

# Rows that land in block 12 are folded into block 11
# (presumably block 12 is too small to stand alone; confirm with the counts below).
data['block'] = era_block.replace(12, 11)

# Number of rows per block, listed in block order.
data['block'].value_counts().sort_index()

In [None]:
# Leave-one-block-out CV: each block 0..11 is held out once for validation
# while the model is trained on every other block.
lgbm_params = dict(max_depth=5, num_leaves=2**5, learning_rate=0.01,
                   n_estimators=2000, colsample_bytree=0.1, random_state=0)
results_val = []  # one validation correlation per held-out block

for block in range(12):
    print("Validation Block {}".format(block))

    in_val_block = data['block'] == block
    train, val = data[~in_val_block], data[in_val_block]

    # Inputs are all columns whose name matches 'feature'; the label is 'target'.
    X_train, y_train = train.filter(regex=r'feature'), train['target']
    X_val, y_val = val.filter(regex=r'feature'), val['target']

    mdl = LGBMRegressor(**lgbm_params)
    mdl.fit(X_train, y_train)

    # Score: Pearson correlation between the target and the percentile rank of
    # the predictions (method="first" breaks ties by order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()

In [None]:
# Median of the validation correlations collected in results_val.
np.median(results_val)

In [None]:
# Worst (lowest) validation correlation in results_val.
np.min(results_val)

In [None]:
# Best (highest) validation correlation in results_val.
np.max(results_val)

In [None]:
# Mean of the validation correlations collected in results_val.
np.mean(results_val)

In [None]:
# Number of validation folds that were scored.
len(results_val)

## HV-Block CV (1-block)

O racional é o mesmo aqui. A diferença é que não usamos no treino o bloco imediatamente anterior nem o bloco imediatamente posterior ao bloco de validação. Assim criamos um gap entre os dados de treino (demais blocos) e o bloco de validação.

In [None]:
# hv-block CV: each block is held out once for validation, and the blocks
# immediately before and after it are removed from training so that there is
# a gap between the training data and the validation block.
N_BLOCKS = 12  # blocks are numbered 0..11
results_val = []  # one validation correlation per held-out block

# Validate on every block, including the edge blocks 0 and 1 that the previous
# range(2, 12) skipped.
for block in range(N_BLOCKS):
    # Neighbouring blocks that actually exist (edge blocks have only one).
    gap_blocks = [b for b in (block - 1, block + 1) if 0 <= b < N_BLOCKS]
    print("Validation Block {} - Gap Blocks {}".format(block, gap_blocks))

    # Train on every block more than one step away from the validation block.
    # A single distance mask covers the interior and both edge cases.
    train = data[(data['block'] - block).abs() > 1]
    val = data[data['block'] == block]

    # Inputs are all columns whose name matches 'feature'; the label is 'target'.
    X_train = train.filter(regex=r'feature')
    X_val = val.filter(regex=r'feature')

    y_train = train['target']
    y_val = val['target']

    mdl = LGBMRegressor(max_depth=5, num_leaves=2**5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, random_state=0)
    mdl.fit(X_train, y_train)

    # Score: Pearson correlation between the target and the percentile rank of
    # the predictions (method="first" breaks ties by order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()

In [None]:
# Median of the validation correlations collected in results_val.
np.median(results_val)

In [None]:
# Worst (lowest) validation correlation in results_val.
np.min(results_val)

In [None]:
# Best (highest) validation correlation in results_val.
np.max(results_val)

In [None]:
# Mean of the validation correlations collected in results_val.
np.mean(results_val)

In [None]:
# Number of validation folds that were scored.
len(results_val)

## Combinatorial Purged (Gap) (Block)-CV

Gera todas as combinações possíveis de pares de blocos (treino, validação) e descarta as combinações em que os blocos estão lado a lado (são vizinhos).

In [None]:
# Combinatorial purged CV: train on one block, validate on another, for every
# ordered pair of distinct blocks that are not neighbours.
block_combos = list(permutations(range(12), 2))  # r could be more than 2

# Purge pairs of adjacent blocks so there is always a gap between the training
# block and the validation block.
purged_pairs = [(train_block, val_block) for train_block, val_block in block_combos
                if abs(train_block - val_block) != 1]
total_purged_pairs = len(purged_pairs)
print("Total de pares com gap {}".format(total_purged_pairs))

results_val = []  # one validation correlation per (train block, validation block) pair
fitted_block = None  # train block the current model was fitted on
for train_block, val_block in purged_pairs:
    print("Train block {} - Validation Block {}".format(train_block, val_block))

    # The fitted model depends only on the training block (fixed hyperparameters
    # and random_state), so refit only when the training block changes instead
    # of once per pair. permutations() yields pairs grouped by their first
    # element, so this fits one model per training block.
    if train_block != fitted_block:
        train = data[data['block'] == train_block]
        X_train = train.filter(regex=r'feature')
        y_train = train['target']

        mdl = LGBMRegressor(max_depth=5, num_leaves=2**5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, random_state=0)
        mdl.fit(X_train, y_train)
        fitted_block = train_block

    val = data[data['block'] == val_block]
    X_val = val.filter(regex=r'feature')
    y_val = val['target']

    # Score: Pearson correlation between the target and the percentile rank of
    # the predictions (method="first" breaks ties by order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()

In [None]:
# Median of the validation correlations collected in results_val.
np.median(results_val)

In [None]:
# Worst (lowest) validation correlation in results_val.
np.min(results_val)

In [None]:
# Best (highest) validation correlation in results_val.
np.max(results_val)

In [None]:
# Mean of the validation correlations collected in results_val.
np.mean(results_val)

In [None]:
# Number of (train block, validation block) pairs that were scored.
len(results_val)

# Fim