In [1]:
import os
import zipfile

import joblib
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from sklearn.model_selection import KFold, StratifiedKFold

# Root folder under which the dataset CSV files are read and written.
data_path = '../../data/old_presets/data'
# Folder that receives the benchmark artifacts (data_info.pkl and the folds/ directory).
benchmark_path = '../../data/old_presets'

In [2]:
# Benchmark setup constants.
# NOTE(review): the original comment here read "not run setup!!" — its intent is unclear; TODO confirm.
NFOLDS = 5  # number of KFold splits used when assigning fold ids
NJOBS = 4  # not referenced in the visible cells; presumably a parallel worker count — TODO confirm
SEED = 42  # random_state of the shuffled KFold split

### Preprocess ASHRAE

In [None]:
%%time

# Build the flat ASHRAE training table: every meter reading from train.csv,
# enriched with its building's metadata and with the weather row recorded for
# the same (site_id, timestamp).
ashrae_dir = data_path + '/ashrae-energy-prediction'

data = pd.read_csv(ashrae_dir + '/train.csv')
bm = pd.read_csv(ashrae_dir + '/building_metadata.csv').set_index('building_id')

# Left join keeps every reading, in its original order. `validate` raises
# MergeError if a building_id occurs more than once in the metadata, instead of
# silently duplicating readings.
data = data.merge(
    bm,
    how='left',
    left_on='building_id',
    right_index=True,
    validate='many_to_one',
)

weather = pd.read_csv(ashrae_dir + '/weather_train.csv')

# Join on the real key columns rather than on Python hash() values of the key
# tuples: a hash collision would silently attach the wrong weather row.
# Readings without a matching weather row get NaN in the weather columns.
data = data.merge(
    weather,
    how='left',
    on=['site_id', 'timestamp'],
    validate='many_to_one',
)

data.to_csv(ashrae_dir + '/processed_train.csv', index=False)

### Preprocess IEEE FRAUD

In [None]:
# Attach the identity table to each transaction (transactions without an
# identity row are kept), then discard the join key before saving.
ieee_dir = data_path + '/ieee-fraud-detection'

data = pd.read_csv(ieee_dir + '/train_transaction.csv')
data_id = pd.read_csv(ieee_dir + '/train_identity.csv')

data = data.merge(data_id, on='TransactionID', how='left')
data = data.drop(columns='TransactionID')

data.to_csv(ieee_dir + '/processed_train.csv', index=False)


### Preprocess BNP

In [None]:
# Unpack the BNP archive in place using the standard library, so the cell does
# not depend on an external `unzip` binary and re-runs without an interactive
# overwrite prompt (existing files are overwritten).
bnp_dir = os.path.join(data_path, 'bnp-paribas-cardif-claims-management')
with zipfile.ZipFile(os.path.join(bnp_dir, 'train.csv.zip')) as archive:
    archive.extractall(bnp_dir)

### Preprocess SPRING

In [None]:
# Unpack the Springleaf archive in place using the standard library, so the
# cell does not depend on an external `unzip` binary and re-runs without an
# interactive overwrite prompt (existing files are overwritten).
springleaf_dir = os.path.join(data_path, 'springleaf-marketing-response')
with zipfile.ZipFile(os.path.join(springleaf_dir, 'train.csv.zip')) as archive:
    archive.extractall(springleaf_dir)

### Configure runs

In [3]:
def _openml_entry(name, target, task_type, class_map=None):
    """Build the registry record for one OpenML dataset stored at openml/<name>.csv.

    The record holds the relative CSV path, the target column, the task type,
    an optional ``class_map`` (raw label -> class index) and the ``read_csv``
    parameters, which mark '?' as a missing value.
    """
    entry = {
        'path': 'openml/' + name + '.csv',
        'target': target,
        'task_type': task_type,
    }
    # 'class_map' is only present for datasets whose labels need re-encoding.
    if class_map is not None:
        entry['class_map'] = class_map
    entry['read_csv_params'] = {'na_values': '?'}
    return entry


# OPENML
# (dataset name, target column, task type, optional class map)
_OPENML_DATASETS = [
    ('covertype', 'class', 'multiclass', {x: x - 1 for x in range(1, 8)}),
    ('albert', 'class', 'binary', None),
    ('higgs', 'class', 'binary', None),
    ('guillermo', 'class', 'binary', None),
    ('bank-marketing', 'Class', 'binary', {x: x - 1 for x in range(1, 3)}),
    ('numerai28.6', 'attribute_21', 'binary', None),
    ('volkert', 'class', 'multiclass', None),
    ('adult', 'class', 'binary', {' <=50K': 0, ' >50K': 1}),
    ('MiniBooNE', 'signal', 'binary', {False: 0, True: 1}),
    ('dilbert', 'class', 'multiclass', None),
    ('riccardo', 'class', 'binary', None),
    ('shuttle', 'class', 'multiclass', {x: x - 1 for x in range(1, 8)}),
    ('KDDCup09_appetency', 'APPETENCY', 'binary', {-1: 0, 1: 1}),
    ('Fashion-MNIST', 'class', 'multiclass', None),
    ('connect-4', 'class', 'multiclass', None),
    ('airlines', 'Delay', 'binary', None),
    ('jannis', 'class', 'multiclass', None),
    ('nomao', 'Class', 'binary', {x: x - 1 for x in range(1, 3)}),
    ('Amazon_employee_access', 'target', 'binary', None),
    ('robert', 'class', 'multiclass', None),
    ('aps_failure', 'class', 'binary', {'neg': 0, 'pos': 1}),
    ('jungle_chess_2pcs_raw_endgame_complete', 'class', 'multiclass',
     {'w': 0, 'b': 1, 'd': 2}),
]

data_info = {
    name: _openml_entry(name, target, task_type, class_map)
    for name, target, task_type, class_map in _OPENML_DATASETS
}

# KAGGLE ....
# These entries carry no read_csv parameters; 'drop' lists columns by name.
data_info.update({
    'ashrae-energy-prediction': {
        'path': 'ashrae-energy-prediction/processed_train.csv',
        'target': 'meter_reading',
        'task_type': 'reg',
    },
    'ieee-fraud-detection': {
        'path': 'ieee-fraud-detection/processed_train.csv',
        'target': 'isFraud',
        'task_type': 'binary',
    },
    'bnp-paribas-cardif-claims-management': {
        'path': 'bnp-paribas-cardif-claims-management/train.csv',
        'target': 'target',
        'task_type': 'binary',
        'drop': ['ID'],
    },
    'porto-seguro-safe-driver-prediction': {
        'path': 'porto-seguro-safe-driver-prediction/train.csv',
        'target': 'target',
        'task_type': 'binary',
        'drop': ['id'],
    },
    'springleaf-marketing-response': {
        'path': 'springleaf-marketing-response/train.csv',
        'target': 'target',
        'task_type': 'binary',
        'drop': ['ID'],
    },
    'talkingdata-adtracking-fraud-detection': {
        'path': 'talkingdata-adtracking-fraud-detection/train.csv',
        'target': 'is_attributed',
        'task_type': 'binary',
        'drop': ['attributed_time'],
    },
})


# # update data path
# for k in data_info:
#     data_info[k]['path'] = os.path.join(data_path, data_info[k]['path'])

# Persist the dataset registry under benchmark_path so it can be reloaded by file name.
registry_file = os.path.join(benchmark_path, 'data_info.pkl')
joblib.dump(data_info, registry_file)

['../../data/old_presets/data_info.pkl']

### CREATE TARGET MAPPING AND FOLDS

In [7]:
# Directory that receives one pickled fold-assignment array per dataset.
folds_dir = os.path.join(benchmark_path, 'folds')
# exist_ok=True makes re-running this cell a no-op when the directory is already there.
os.makedirs(folds_dir, exist_ok=True)

def get_target(name):
    """Return the target column of dataset ``name`` as a pandas Series.

    The dataset's record is looked up in the pickled registry
    (``data_info.pkl`` under ``benchmark_path``); only the target column is
    read from the CSV. If the record defines a ``class_map``, the raw labels
    are mapped through it.

    Raises:
        AssertionError: if any value of the (possibly mapped) target is null.
    """
    registry = joblib.load(os.path.join(benchmark_path, 'data_info.pkl'))
    info = registry[name]

    csv_file = os.path.join(data_path, info['path'])
    frame = pd.read_csv(csv_file, usecols=[info['target']])

    # Only one column was requested, so the first column is the target.
    target = frame[frame.columns[0]]

    class_map = info.get('class_map')
    if 'class_map' in info:
        target = target.map(class_map)

    assert target.notnull().all(), 'NaN target'

    return target


def create_folds(name):
    """Assign every row of dataset ``name`` to a fold and pickle the result.

    Rows are split by a shuffled ``KFold`` with ``NFOLDS`` splits and
    ``random_state=SEED``. The saved int64 array holds, for each row, the
    number of the split in which that row falls into the held-out part. It is
    written to ``<folds_dir>/<name>.pkl``.
    """
    labels = get_target(name).values
    splitter = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    fold_ids = np.zeros(labels.shape[0], dtype=np.int64)

    # Only the held-out indices of each split are needed to label rows.
    for fold_no, (_, holdout_idx) in enumerate(splitter.split(labels, labels)):
        fold_ids[holdout_idx] = fold_no

    out_file = os.path.join(folds_dir, name + '.pkl')
    joblib.dump(fold_ids, out_file)
        

# Generate and store the fold assignment for every dataset in the registry.
for dataset_name in data_info.keys():
    create_folds(dataset_name)