In [1]:
import os

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.preprocessing import OneHotEncoder

In [2]:
# read the train / validation / test splits that will be pre-processed
train_data, valid_data, test_data = map(
    pd.read_csv,
    (
        '../data/train_data.csv',
        '../data/valid_data.csv',
        '../data/test_data.csv',
    ),
)

## create functions for one-hot encoding categorical columns

In [3]:
def create_ohe(train_data, filename='joblib/ohe.joblib'):
    """
    Fit a one-hot encoder on the categorical columns of `train_data` and
    save the fitted encoder to disk as a joblib file.

    The columns to encode are every column of `train_data` (other than the
    target `y`) whose dtype is `object`.

    Parameters
    ----------
    train_data: pandas.core.frame.DataFrame
        Training set; must contain a `y` column
    filename: str
        Filename to use to save encoder. Missing parent directories are
        created.

    Returns
    -------
    None
        The encoder is written to `filename`, not returned.
    """

    # identify columns to one-hot encode
    features = train_data.drop(columns=['y'])
    to_encode = features.columns[features.dtypes == object].tolist()

    # create encoder
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed
    # the old name, so try the new keyword first and fall back for old versions
    encoder_kwargs = {'handle_unknown': 'ignore', 'dtype': int}
    try:
        ohe = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        ohe = OneHotEncoder(sparse=False, **encoder_kwargs)

    # fit the encoder on training data
    ohe.fit(features[to_encode])

    # check number of columns to encode
    assert len(ohe.categories_) == len(to_encode)

    # make sure the target directory exists, otherwise `dump` cannot write
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save the OHE
    dump(ohe, filename)

    return None

In [4]:
def apply_ohe(data, filename='ohe.joblib'):
    """
    Load a fitted one-hot encoder from disk and return `data` with its
    categorical columns replaced by their one-hot encoded columns.

    Encoded columns are named `<variable>_<category>`. The original
    categorical columns and the target `y` (when present) are removed from
    the result; all other columns and the row index are kept.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        `train_data`, `valid_data` or `test_data`
    filename: str
        Filename of fitted encoder. Note that the default here
        ('ohe.joblib') is not the default save location used by
        `create_ohe` ('joblib/ohe.joblib'), so pass the path explicitly.

    Returns
    -------
    pandas.core.frame.DataFrame
    """

    # load the encoder that has been fit on the training data
    # NOTE: joblib files are pickle-based; only load files you trust
    ohe = load(filename)

    # Encode exactly the columns the encoder was fitted on. Recent
    # scikit-learn versions record them in `feature_names_in_`; without that
    # attribute fall back to the dtype-based selection used when fitting.
    fitted_cols = getattr(ohe, 'feature_names_in_', None)
    if fitted_cols is not None:
        to_encode = list(fitted_cols)
    else:
        features = data.drop(columns=['y'], errors='ignore')
        to_encode = features.columns[features.dtypes == object].tolist()

    # apply the encoder
    ohe_array = ohe.transform(data.loc[:, to_encode])

    # get names of encoded columns and add variable name as prefix
    ohe_cols = [
        f'{col}_{cat}'
        for col, cats in zip(to_encode, ohe.categories_)
        for cat in cats
    ]

    # create dataframe containing encoded columns (preserve original row indices)
    ohe_df = pd.DataFrame(ohe_array, index=data.index, columns=ohe_cols)

    # concatenate with existing data frame
    data_ohe = pd.concat((data, ohe_df), axis=1)

    # drop the columns that were one-hot encoded
    data_ohe = data_ohe.drop(columns=to_encode)

    # check that the number of rows is unchanged
    assert data_ohe.shape[0] == data.shape[0]

    # check that none of the raw categorical columns survived
    assert not set(to_encode) & set(data_ohe.columns)

    # drop `y` from encoded data (absent for unlabeled data)
    data_ohe = data_ohe.drop(columns=['y'], errors='ignore')

    return data_ohe

In [5]:
# apply_ohe(train_data, filename='joblib/ohe.joblib').info()

## pre-process the training data

In [6]:
# create a function to pre-process the data

def process_train(train_data):
    """
    Processes training data set.
    Fits a one-hot encoder on it, saves the encoder to
    'joblib/ohe.joblib', and applies it.
    Drops the `duration` feature.
    Makes target variable numeric & binary ('no' -> 0, 'yes' -> 1).
    Returns a dataframe.

    Parameters
    ----------
    train_data : pandas.core.frame.DataFrame
       The training data set ('train_data.csv')

    Returns
    -------
    pandas.core.frame.DataFrame
        The processed train set with the target `y` as first column,
        followed by the features
    """

    # target
    y = train_data['y'].replace({'no': 0, 'yes': 1})

    # fit and save one-hot encoder
    create_ohe(train_data, filename='joblib/ohe.joblib')

    # transform data using one-hot encoder (result no longer contains `y`)
    data_ohe = apply_ohe(train_data, filename='joblib/ohe.joblib')

    # drop `duration`
    data_ohe = data_ohe.drop(columns=['duration'])

#     # drop highly correlated features
#     data_ohe = data_ohe.drop(columns = ['emp.var.rate', 'euribor3m'])

    # combine processed predictor data back with target
    # `sort` must be a real boolean: the string 'false' is truthy
    data = pd.concat([y, data_ohe], axis=1, sort=False)

    return data

In [7]:
# pre-process the training split; this also fits and saves the one-hot encoder
train_data_processed = process_train(train_data)
# bare expression so the notebook renders the resulting frame
train_data_processed

Unnamed: 0,y,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,59,4,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0
1,0,30,2,999,0,1.4,93.918,-42.7,4.958,5228.1,...,0,0,0,0,1,0,0,0,1,0
2,0,28,1,999,0,1.4,93.918,-42.7,4.955,5228.1,...,0,0,0,0,0,1,0,0,1,0
3,0,34,4,999,0,1.4,93.918,-42.7,4.961,5228.1,...,0,0,0,0,0,1,0,0,1,0
4,0,43,3,999,0,1.4,94.465,-41.8,4.961,5228.1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28819,0,50,11,999,0,-0.1,93.200,-42.0,4.021,5195.8,...,0,0,1,0,0,0,0,0,1,0
28820,0,43,1,999,0,1.1,93.994,-36.4,4.858,5191.0,...,0,0,0,0,0,0,1,0,1,0
28821,1,37,1,999,0,1.4,93.444,-36.1,4.966,5228.1,...,0,0,0,0,0,1,0,0,1,0
28822,0,25,3,999,0,1.1,93.994,-36.4,4.856,5191.0,...,0,0,0,0,0,1,0,0,1,0


In [8]:
# list every column of the processed training set with its non-null count and dtype
train_data_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28824 entries, 0 to 28823
Data columns (total 63 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   y                              28824 non-null  int64  
 1   age                            28824 non-null  int64  
 2   campaign                       28824 non-null  int64  
 3   pdays                          28824 non-null  int64  
 4   previous                       28824 non-null  int64  
 5   emp.var.rate                   28824 non-null  float64
 6   cons.price.idx                 28824 non-null  float64
 7   cons.conf.idx                  28824 non-null  float64
 8   euribor3m                      28824 non-null  float64
 9   nr.employed                    28824 non-null  float64
 10  job_admin.                     28824 non-null  int64  
 11  job_blue-collar                28824 non-null  int64  
 12  job_entrepreneur               28824 non-null 

## pre-process the validation and test data

In [9]:
# create a function to pre-process the data

def process_valid_test(valid_test_data):
    """
    Processes validation and/or test data sets.
    Applies the one-hot encoder stored at 'joblib/ohe.joblib', which must
    already have been fit on the training data and saved.
    Drops the `duration` feature.
    Makes target variable numeric & binary ('no' -> 0, 'yes' -> 1).
    Returns a dataframe.

    Parameters
    ----------
    valid_test_data : pandas.core.frame.DataFrame
       Either the validation or the test data set ('valid_data.csv' or 'test_data.csv')

    Returns
    -------
    pandas.core.frame.DataFrame
        Either the processed validation or test set, with the target `y`
        as first column followed by the features
    """

    # target
    y = valid_test_data['y'].replace({'no': 0, 'yes': 1})

    # predictors

    # transform data using saved one-hot encoder (result no longer contains `y`)
    data_ohe = apply_ohe(valid_test_data, filename='joblib/ohe.joblib')

    # drop `duration`
    data_ohe = data_ohe.drop(columns=['duration'])

#     # drop highly correlated features
#     data_ohe = data_ohe.drop(columns = ['emp.var.rate', 'euribor3m'])

    # combine processed predictor data back with target
    # `sort` must be a real boolean: the string 'false' is truthy
    data = pd.concat([y, data_ohe], axis=1, sort=False)

    return data

In [10]:
# pre-process the validation split with the encoder saved during training
valid_data_processed = process_valid_test(valid_data)
# bare expression so the notebook renders the resulting frame
valid_data_processed

Unnamed: 0,y,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,38,3,999,0,-0.1,93.200,-42.0,4.120,5195.8,...,0,0,0,0,0,0,1,0,1,0
1,0,36,2,999,1,-1.8,92.893,-46.2,1.327,5099.1,...,0,0,0,0,1,0,0,1,0,0
2,1,50,6,3,1,-1.8,92.893,-46.2,1.354,5099.1,...,0,0,0,1,0,0,0,0,0,1
3,0,25,3,999,0,1.4,93.918,-42.7,4.958,5228.1,...,0,0,0,0,1,0,0,0,1,0
4,0,33,1,999,1,-1.8,93.075,-47.1,1.405,5099.1,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6171,0,43,1,999,0,-0.1,93.200,-42.0,4.076,5195.8,...,0,0,0,0,1,0,0,0,1,0
6172,0,36,1,999,1,-1.8,93.075,-47.1,1.405,5099.1,...,0,0,0,1,0,0,0,1,0,0
6173,1,52,1,999,0,1.4,94.465,-41.8,4.864,5228.1,...,0,0,0,0,0,0,1,0,1,0
6174,1,46,3,999,0,-1.7,94.215,-40.3,0.822,4991.6,...,0,0,1,0,0,0,0,0,1,0


In [11]:
# pre-process the test split with the encoder saved during training
test_data_processed = process_valid_test(test_data)
# bare expression so the notebook renders the resulting frame
test_data_processed

Unnamed: 0,y,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,34,2,999,0,1.4,94.465,-41.8,4.865,5228.1,...,0,0,0,1,0,0,0,0,1,0
1,0,37,1,999,1,-1.8,92.893,-46.2,1.344,5099.1,...,0,0,0,0,0,1,0,1,0,0
2,0,33,1,6,2,-2.9,92.201,-31.4,0.859,5076.2,...,0,0,0,0,0,1,0,0,0,1
3,0,26,14,999,0,1.4,94.465,-41.8,4.959,5228.1,...,0,0,1,0,0,0,0,0,1,0
4,0,28,1,999,0,1.4,93.918,-42.7,4.960,5228.1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6171,0,23,2,999,0,1.4,94.465,-41.8,4.962,5228.1,...,0,0,0,0,0,0,1,0,1,0
6172,0,50,9,999,0,1.4,93.918,-42.7,4.962,5228.1,...,0,0,1,0,0,0,0,0,1,0
6173,0,33,1,999,0,-1.8,92.893,-46.2,1.266,5099.1,...,0,0,0,0,0,1,0,0,1,0
6174,0,53,4,999,0,1.1,93.994,-36.4,4.860,5191.0,...,0,0,0,0,1,0,0,0,1,0


## save as csv files

In [12]:
# write each processed split to disk (row index omitted) and report its shape
for frame, csv_path, shape_label in (
    (train_data_processed, '../data/train_data_processed.csv', 'shape of processed train data: '),
    (valid_data_processed, '../data/valid_data_processed.csv', 'shape of processed validation data: '),
    (test_data_processed, '../data/test_data_processed.csv', 'shape of processed test data: '),
):
    frame.to_csv(csv_path, index = False)
    print(shape_label, frame.shape)

shape of processed train data:  (28824, 63)
shape of processed validation data:  (6176, 63)
shape of processed test data:  (6176, 63)
