#### Preprocessing:

In [1]:
import numpy as np
import pandas as pd
import logging
import os
from dotenv import find_dotenv, load_dotenv
import datetime
import glob
from os.path import abspath
from pathlib import Path
from inspect import getsourcefile
from datetime import datetime
import math
import argparse
import sys
import tensorflow as tf

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

nb_dir = os.path.join(Path(os.getcwd()).parents[0], 'src', 'data')
if nb_dir not in sys.path:
    sys.path.insert(0, nb_dir)
import get_raw_data as grd
import data_classes
import Normalizer

# Numeric dtypes used when casting the processed features / boolean flags.
DT_FLOAT = np.float32 
DT_BOOL = np.uint8
RANDOM_SEED = 123
# Root logging configuration: DEBUG level, timestamped records. Because this
# configures the root logger, third-party DEBUG records are emitted as well.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# logger.propagate = False # it will not log to console.

# Input and output data directories, resolved relative to the PARENT of the
# current working directory (<parent of cwd>/data/raw and .../data/processed).
RAW_DIR = os.path.join(Path(os.getcwd()).parents[0], 'data', 'raw') 
PRO_DIR = os.path.join(Path(os.getcwd()).parents[0], 'data', 'processed')

print(RAW_DIR, PRO_DIR)

  from ._conv import register_converters as _register_converters
  """)
2018-12-18 16:13:24,356 - matplotlib - DEBUG - CACHEDIR=/home/ubuntu/.cache/matplotlib
2018-12-18 16:13:24,361 - matplotlib.font_manager - DEBUG - Using fontManager instance from /home/ubuntu/.cache/matplotlib/fontList.json
2018-12-18 16:13:24,497 - matplotlib.backends - DEBUG - backend module://ipykernel.pylab.backend_inline version unknown
2018-12-18 16:13:24,510 - matplotlib.backends - DEBUG - backend module://ipykernel.pylab.backend_inline version unknown


/home/ubuntu/MLMortgage/data/raw /home/ubuntu/MLMortgage/data/processed


In [2]:
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool(token)``, which is True for
    every non-empty string (including ``"False"``), so a flag could never be
    switched off from the CLI. This converter accepts the usual spellings.

    Args:
        value (bool or str): Token received from argparse (or a bool default).
    Returns:
        bool: The parsed value.
    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


def update_parser(parser):
    """Register the preprocessing options on `parser` and parse the CLI.

    Args:
        parser (argparse.ArgumentParser): Parser that receives the options.
    Returns:
        tuple: ``(namespace, unparsed)`` exactly as returned by
        ``parser.parse_known_args()``; unknown arguments (for example the ones
        injected by a notebook kernel) end up in ``unparsed`` instead of
        raising an error.
    """
    parser.add_argument(
        '--prepro_step',
        type=str,
        default='preprocessing',  # 'slicing', 'preprocessing'
        help='To execute a preprocessing method')
    # The three period options are used by allfeatures_preprocessing.
    parser.add_argument(
        '--train_period',
        type=int,
        nargs='*',
        default=[121, 323],  # [121,279], [156, 180], [121,143]
        help='Training Period')
    parser.add_argument(
        '--valid_period',
        type=int,
        nargs='*',
        default=[324, 329],  # [280,285], [181,185], [144,147]
        help='Validation Period')
    parser.add_argument(
        '--test_period',
        type=int,
        nargs='*',
        default=[330, 342],  # [286, 304], [186,191], [148, 155]
        help='Testing Period')
    parser.add_argument(
        '--prepro_dir',
        type=str,
        default='chuncks_random_c1mill',
        help='Directory with raw data inside data/raw/ and it will be the output directory inside data/processed/')
    parser.add_argument(
        '--prepro_chunksize',
        type=int,
        default=500000,
        help='Chunk size to put into the h5 file...')
    # Boolean options go through _str2bool so "--ref_norm False" really
    # yields False (type=bool would yield True for any non-empty string).
    parser.add_argument(
        '--prepro_with_index',
        type=_str2bool,
        default=True,
        help='To keep indexes for each record')
    parser.add_argument(
        '--ref_norm',
        type=_str2bool,
        default=True,
        help='To execute the normalization over the raw inputs')

    return parser.parse_known_args()

In [3]:

# Parse the options; arguments the parser does not know are returned in
# UNPARSED rather than raising.
FLAGS, UNPARSED = update_parser(argparse.ArgumentParser())    
# These are the most important parameters for preprocessing. They are assigned
# explicitly here, so they override whatever was parsed from the command line.
FLAGS.prepro_dir='chuncks_random_c1mill' #this directory must be the same inside 'raw' and processed directories.
FLAGS.prepro_chunksize=500000 
FLAGS.train_period=[121,323] #[121,279] #[121, 143] 
FLAGS.valid_period=[324,329] #[280,285] #[144, 147] 
FLAGS.test_period=[330,342] #[286,304] #[148, 155]                                                
FLAGS.prepro_with_index = True

print(FLAGS)    

Namespace(prepro_chunksize=500000, prepro_dir='chuncks_random_c1mill', prepro_step='preprocessing', prepro_with_index=True, ref_norm=True, test_period=[330, 342], train_period=[121, 323], valid_period=[324, 329])


In [4]:
# List the raw '*.txt' input files found in RAW_DIR/<prepro_dir>.
glob.glob(os.path.join(RAW_DIR, FLAGS.prepro_dir,"*.txt"))
# from IPython.core.debugger import Tracer; Tracer()()

['/home/ubuntu/MLMortgage/data/raw/chuncks_random_c1mill/temporalloandynmodifmrstaticitur_3Trans_0Lab_100th.txt',
 '/home/ubuntu/MLMortgage/data/raw/chuncks_random_c1mill/temporalloandynmodifmrstaticitur_CTrans_3Lab_100th.txt']

In [5]:
def allfeatures_extract_labels(data, columns='MBA_DELINQUENCY_STATUS_next'):
    '''Split the label columns off `data` and return them.

        Args:
            data (DataFrame): Input Dataset which is modified in place: the
                label columns are removed from it.
            columns (string or sequence of int): When a string, every column
                whose name CONTAINS this string is treated as a label column
                (substring match, in the order the columns appear). Otherwise
                it is taken as a sequence of positional column indices.
        Returns:
            DataFrame with the selected label columns, or None when no column
            is selected.
        Raises:
    '''
    if isinstance(columns, str):
        indices = [i for i, elem in enumerate(data.columns) if columns in elem]
    elif columns is None:
        indices = []
    else:
        # Materialise to a list so the emptiness test below is well defined
        # for numpy arrays / pandas Index objects too (their truth value is
        # ambiguous and raises for more than one element).
        indices = list(columns)

    if len(indices) == 0:
        return None

    label_columns = data.columns[indices]
    labels = data[label_columns]
    data.drop(label_columns, axis=1, inplace=True)
    # Use a dedicated named logger instead of renaming the shared module
    # logger, which would relabel every later record emitted through it.
    logging.getLogger('allfeatures_extract_labels').info('...Labels extracted from Dataset...')
    return labels


In [6]:
def tag_chunk(tag, label, chunk, chunk_periods, tag_period, log_file, with_index, tag_index, hdf=None, tfrec=None):
    '''Select the records of `chunk` whose period falls inside `tag_period`, split them into
        features and labels, and append both to the target file.
        Args: 
            tag (string): 'train', 'valid' or 'test'; used as the key prefix in the target file
                ('<tag>/features', '<tag>/labels') and in the log messages.
            label (string): passed to allfeatures_extract_labels as `columns` to select the label columns.
            chunk (DataFrame): Input Dataset. It is indexed with .loc[(slice(None), periods), :], so a
                two-level index whose second level is the period is expected.
            chunk_periods (set of int): all periods present in the chunk.
            tag_period (integer array): an array of form [init_period, end_period] (both inclusive) for the tag.
            log_file (file-like): object with a write() method receiving the log lines.
            with_index (boolean): If True the first index level ('DELINQUENCY_STATUS_NEXT') is kept as the
                index of the saved records; otherwise the index is discarded.
            tag_index (int): running count of the records already written for this tag.
            hdf or tfrec (HDFStore or TFRecords writer): the target file. Only one must be distinct from None;
                hdf takes precedence when both are given.
        Returns: 
            tag_index (int): tag_index increased by the number of records written. It is left unchanged
            when no record matches or when features and labels disagree in length.
        Raises:        
    '''    
    
    # Periods of this chunk that belong to the [init, end] range of the tag.
    inter_periods = list(chunk_periods.intersection(set(range(tag_period[0], tag_period[1]+1))))
    log_file.write('Periods corresponding to ' + tag +' period: %s\r\n' % str(inter_periods))
    p_chunk = chunk.loc[(slice(None), inter_periods), :]
    log_file.write('Records for ' + tag +  ' Set - Number of rows: %d\r\n' % (p_chunk.shape[0]))
    print('Records for ' + tag + ' Set - Number of rows:', p_chunk.shape[0])
    if (p_chunk.shape[0] > 0):
        if (with_index==True):
            # p_chunk.index = pd.MultiIndex.from_tuples([(i, x[1], x[2],x[3]) for x,i in zip(p_chunk.index, range(tag_index, tag_index + p_chunk.shape[0]))])                                
            # Move both index levels back to columns, drop PERIOD and keep only
            # DELINQUENCY_STATUS_NEXT as the index.
            p_chunk.reset_index(inplace=True)
            allfeatures_drop_cols(p_chunk, ['PERIOD'])      
            p_chunk.set_index('DELINQUENCY_STATUS_NEXT', inplace=True) #1 index                                      
        else:
            p_chunk.reset_index(drop=True, inplace=True)
            
        # Removes the label columns from p_chunk (in place) and returns them.
        # NOTE(review): allfeatures_extract_labels returns None when no column matches `label`;
        # the astype call on `labels` below would then fail - confirm this cannot happen.
        labels = allfeatures_extract_labels(p_chunk, columns=label)
        p_chunk = p_chunk.astype(DT_FLOAT)
        labels = labels.astype(np.int8)
        if (p_chunk.shape[0] != labels.shape[0]) : 
            # Mismatch is only reported on stdout; nothing is written for this chunk.
            print('Error in shapes:', p_chunk.shape, labels.shape)
        else :
            if (hdf!=None):
                hdf.put(tag + '/features', p_chunk, append=True, index=True) #data_columns=p_chunk.columns.values), index=False
                hdf.put(tag + '/labels', labels, append=True, index=True) #data_columns=labels.columns.values)                         
                hdf.flush()                      
            elif (tfrec!=None):
                # One serialized tf.train.Example per record.
                # NOTE(review): _int64_feature / _float_feature are not defined in the visible part
                # of this notebook - presumably defined elsewhere; verify before using tfrec output.
                for row, lab in zip(p_chunk.values, labels.values):
                    feature = {tag + '/labels': _int64_feature(lab),
                               tag + '/features': _float_feature(row)}
                    # Create an example protocol buffer
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    tfrec.write(example.SerializeToString())                            
                tfrec.flush()
            tag_index += p_chunk.shape[0]

    return tag_index

In [7]:
def allfeatures_drop_cols(data, columns):
    '''Remove the given columns from the dataset 'data'.
        Args: 
            data (DataFrame): Input Dataset which is modified in place.
            columns (label or list of labels): Columns to remove; a missing
                column raises KeyError (pandas default behaviour).
        Returns: 
            None
        Raises:        
    '''
    data.drop(columns, axis=1, inplace=True)
    # Log through a dedicated named logger rather than renaming the shared
    # module logger, which would relabel every later record emitted with it.
    logging.getLogger('allfeatures_drop_cols').info('...Columns Excluded from dataset...')
    return None

In [8]:
def oneHotDummies_column(column, categories):
    '''Convert a categorical column into dummy/indicator variables.
    
    Args: 
        column (Series): Input categorical column; it must have a string name,
            which is used as the prefix of the generated columns.
        categories (list of str): Allowed category values. One output column
            '<column.name>_<category>' is produced per entry, in this order.
    Returns: 
        DataFrame. Binary indicator matrix with the same index as `column`.
        A value that is not listed in `categories` yields an all-zero row.
    Raises:        
    '''    
    cat_values = pd.Categorical(column.astype('str'), categories=categories)
    dummies = pd.get_dummies(cat_values)   # columns follow the order of `categories`
    # get_dummies on a bare Categorical returns a fresh 0..n-1 index; restore
    # the caller's index so that assigning the result back aligns row by row.
    dummies.index = column.index
    dummies = dummies.add_prefix(column.name + '_')

    # Indicator matrices never contain NaN, so an unmatched value can only be
    # detected through its category code (-1).
    unmatched = int((cat_values.codes == -1).sum())
    if unmatched:
        logging.getLogger('oneHotDummies_column: ' + column.name).warning(
            '%d value(s) outside the declared categories were encoded as all-zero rows',
            unmatched)
    return dummies

In [9]:
def imputing_nan_values(nan_dict, distribution):        
    '''Resolve the fill value of every feature listed in nan_dict.
        Args: 
            nan_dict (Dictionary): feature name -> fill rule. The rule 'median' or 'mean' is replaced
                by the statistic stored in `distribution`; any other value is kept as a literal fill value.
            distribution (DataFrame or Series): statistics table holding '<feature>_MEDIAN' and
                '<feature>_MEAN' entries. For a DataFrame the first row is used.
        Returns: 
            new_dict (Dictionary): feature name -> concrete fill value.
        Raises:        
            KeyError: if a requested '<feature>_MEDIAN' / '<feature>_MEAN' entry is missing.
    '''    
    suffix_by_rule = {'median': '_MEDIAN', 'mean': '_MEAN'}
    new_dict = {}
    for feature, rule in nan_dict.items():
        suffix = suffix_by_rule.get(rule) if isinstance(rule, str) else None
        if suffix is None:
            # Literal fill value (e.g. 0) or an already resolved number.
            new_dict[feature] = rule
            continue
        stat = distribution[feature + suffix]
        # Take the first row explicitly: float(Series) is deprecated in recent
        # pandas and fails outright when the table has more than one row.
        if hasattr(stat, 'iloc'):
            stat = stat.iloc[0]
        new_dict[feature] = float(stat)

    return new_dict

In [10]:
def drop_invalid_delinquency_status(data, gflag, log_file):   
    '''Delete the records of a loan that follow an invalid or closing delinquency status.

       For each loan in `data` (rows are taken in their stored order):
         * from the first record whose MBA_DELINQUENCY_STATUS is S, T, X or Z onwards,
           every record is dropped (that record included);
         * after the first record whose status is 0 or R, every later record is dropped
           (that record itself is kept).
       A loan flagged in the previous chunk (`gflag`) has all its records in this chunk dropped.

        Args: 
            data (DataFrame): Input Dataset which is modified in place. Must contain the columns
                'LOAN_ID' and 'MBA_DELINQUENCY_STATUS' (status compared as strings).
            gflag: Loan_id of the last loan of the previous data when that loan was flagged,
                or '' when there is nothing to carry over.
            log_file (file-like): object with a write() method receiving the log lines.
        Returns: 
            gflag: Loan_id of the last loan in the current data when that loan is flagged
            (in this chunk or carried over from the previous one), otherwise ''.
        Raises:        
    '''        
    log = logging.getLogger('drop_invalid_delinquency_status')
    if data.shape[0] == 0:
        # Nothing to inspect; keep the carry-over flag for the next chunk.
        return gflag

    terminal_status = ['S', 'T', 'X', 'Z']   # drop from this record onwards
    closing_status = ['0', 'R']              # drop only what comes after it

    status = data['MBA_DELINQUENCY_STATUS']
    flagged_ids = set(data.loc[status.isin(terminal_status + closing_status), 'LOAN_ID'])
    flagged_rows = data[data['LOAN_ID'].isin(flagged_ids)]

    # Empty index of the same dtype as data.index, so unions keep the label type.
    iuw = data.index[:0]

    has_carry = not (isinstance(gflag, str) and gflag == '')
    if has_carry:
        # All records of the carried-over loan go, whether or not the loan shows
        # an invalid status again inside this chunk.
        iuw = iuw.union(data.index[(data['LOAN_ID'] == gflag).values])

    last_loan = data['LOAN_ID'].iloc[-1]
    if last_loan in flagged_ids or (has_carry and last_loan == gflag):
        next_gflag = last_loan
    else:
        next_gflag = ''

    for _, group in flagged_rows.groupby('LOAN_ID'):
        group_status = group['MBA_DELINQUENCY_STATUS']
        terminal_pos = np.flatnonzero(group_status.isin(terminal_status).values)
        if terminal_pos.size:
            iuw = iuw.union(group.index[terminal_pos[0]:])
        # In case of REO or Paid-Off, we need to exclude since the next record:
        closing_pos = np.flatnonzero(group_status.isin(closing_status).values)
        if closing_pos.size:
            iuw = iuw.union(group.index[closing_pos[0] + 1:])

    if len(iuw) > 0:
        log_file.write('drop_invalid_delinquency_status - Total rows: %d\r\n' % len(iuw))
        data.drop(iuw, inplace=True) 
        log.info('invalid_delinquency_status dropped')             

    return next_gflag

In [11]:
def zscore(x, mean, stdd):
    """Standardize `x`: subtract `mean` and divide by `stdd`.

    Works for scalars and, element-wise, for numpy arrays / pandas Series.
    The caller is responsible for not passing a zero `stdd`.
    """
    return (x - mean) / stdd


def zscore_apply(dist_file, data):
    """Standardize the columns of `data` using precomputed statistics.

    For every column ``C`` of `data`, the statistics are read from the first
    row of `dist_file` under the exact column names ``C_MEAN`` and ``C_STDD``.

    Args:
        dist_file (DataFrame): statistics table (first row is used).
        data (DataFrame): features to normalize; columns are overwritten in
            place and the same object is returned.
    Returns:
        tuple: ``(data, nnorm_cols)`` where `nnorm_cols` lists the columns
        that have no ``_MEAN``/``_STDD`` entry and were therefore left as is.
        Columns whose standard deviation is zero are also left untouched;
        they are printed but NOT included in `nnorm_cols`.
    """
    stddv_0 = []
    nnorm_cols = []
    # Exact name lookup: a substring/regex search (str.contains) could pick up
    # another feature's statistics, e.g. 'B_MEAN' is contained in 'AB_MEAN'.
    available = set(dist_file.columns)
    for col_name in data.columns.values:
        mean_key = col_name + '_MEAN'
        stdd_key = col_name + '_STDD'
        if mean_key in available and stdd_key in available:
            mean = np.float32(dist_file[mean_key].iloc[0])
            stddev = np.float32(dist_file[stdd_key].iloc[0])
            if stddev == 0:
                stddv_0.append(col_name)
            else:
                # Vectorized over the whole column instead of a per-element apply.
                data[col_name] = zscore(data[col_name], mean, stddev)
        else:
            nnorm_cols.append(col_name)
    print('STANDARD DEV zero: ', stddv_0)
    return data, nnorm_cols

In [12]:
def prepro_chunk(file_name, file_path, chunksize, label, log_file, nan_cols, categorical_cols, descriptive_cols, time_cols,
                 dist_file, with_index, refNorm, train_period, valid_period, test_period, robust_cols, 
                 minmax_cols=None, hdf=None, tfrec=None, filtering_cols=None):
    '''Read a raw CSV file chunk by chunk, clean and encode every chunk, and append the
       train/valid/test records to the target files.

        Per chunk, in this order: upper-case the column names; drop rows with a null label,
        rows with INVALID_TRANSITIONS==1 and rows with an empty MBA_DELINQUENCY_STATUS; drop
        the records following an invalid delinquency status; fill NaN values; drop duplicates;
        one-hot encode the categorical columns; drop descriptive, time and original categorical
        columns; index by (DELINQUENCY_STATUS_NEXT, PERIOD); optionally filter columns and
        z-score normalize; finally hand the chunk to tag_chunk for each of the three periods.

        Args:
            file_name (string): name of the file, used only in error messages.
            file_path (string): path of the comma-separated file to read.
            chunksize (int): number of rows per chunk passed to pd.read_csv.
            label (string): name of the label column (after upper-casing).
            log_file (file-like): open text file receiving the log lines.
            nan_cols (Dictionary): feature -> fill rule, resolved with imputing_nan_values.
            categorical_cols (Dictionary): column -> list of allowed categories.
            descriptive_cols (list): columns dropped after encoding.
            time_cols (list): columns dropped after encoding.
            dist_file (DataFrame): feature statistics used for imputing and normalizing.
            with_index (boolean): forwarded to tag_chunk.
            refNorm (boolean): if True, robust_cols are normalized with zscore_apply.
            train_period, valid_period, test_period (integer arrays): [init, end] periods.
            robust_cols (list): columns to normalize.
            minmax_cols: not referenced in this function.
            hdf, tfrec (sequence of 3 writers): train/valid/test targets; tfrec takes precedence.
            filtering_cols (list): if given, only these columns are kept.
        Returns:
            (train_index, valid_index, test_index): number of records written per set.
        Raises:
            ValueError: if a chunk still contains null values before or after normalization.
    '''
    gflag = ''    
    i = 1                  
    train_index = 0
    valid_index = 0
    test_index = 0
    for chunk in pd.read_csv(file_path, chunksize = chunksize, sep=',', low_memory=False):    
        print('chunk: ', i, ' chunk size: ', chunk.shape[0])
        log_file.write('chunk: %d, chunk size: %d \n' % (i, chunk.shape[0]))
        chunk.columns = chunk.columns.str.upper()                            
        
        log_df = chunk[chunk[label].isnull()]
        log_file.write('Dropping Rows with Null Labels - Number of rows: %d\r\n' % (log_df.shape[0]))
        chunk.drop(chunk.index[chunk[label].isnull()], axis=0, inplace=True)
        
        log_df = chunk[chunk['INVALID_TRANSITIONS']==1]
        log_file.write('Dropping Rows with Invalid Transitions - Number of rows: %d\r\n' % (log_df.shape[0]))                                
        chunk.drop(chunk.index[chunk['INVALID_TRANSITIONS']==1], axis=0, inplace=True)    
        #print('chunk with missing MBA_DELINQUENCY_STATUS', chunk[(chunk['MBA_DELINQUENCY_STATUS']=='') | (chunk['MBA_DELINQUENCY_STATUS'].isna())])
        chunk.drop(chunk.index[(chunk['MBA_DELINQUENCY_STATUS'].astype('str')=='')], axis=0, inplace=True) #| (chunk['MBA_DELINQUENCY_STATUS'].isna())        
        
        # gflag carries the last flagged loan over to the next chunk.
        gflag = drop_invalid_delinquency_status(chunk, gflag, log_file)               
                    
        null_columns=chunk.columns[chunk.isnull().any()]
        log_df = chunk[chunk.isnull().any(axis=1)][null_columns]
        log_file.write('Filling NULL values - (rows, cols) : %d, %d\r\n' % (log_df.shape[0], log_df.shape[1]))                    
        log_df = chunk[null_columns].isnull().sum().to_frame().reset_index()
        log_df.to_csv(log_file, index=False, mode='a')                                    
        # nan_cols is rebound to the resolved values, so from the second chunk on
        # this call receives plain fill values and passes them through.
        nan_cols = imputing_nan_values(nan_cols, dist_file)            
        chunk.fillna(value=nan_cols, inplace=True)   
        
        chunk.drop_duplicates(inplace=True) # Follow this instruction!!                        
        logger.info('dropping invalid transitions and delinquency status, fill nan values, drop duplicates')                  
        log_file.write('Drop duplicates - new size : %d\r\n' % (chunk.shape[0]))
                               
        chunk.reset_index(drop=True, inplace=True)  #don't remove this line! otherwise NaN values appears.
        #chunk['ORIGINATION_YEAR'][chunk['ORIGINATION_YEAR']<1995] = "B1995"
        #chunk['ORIGINATION_YEAR'][(chunk['ORIGINATION_YEAR']<>"B1995") & (chunk['ORIGINATION_YEAR']>2018)] = "nan"
        # Years before 1995 are bucketed as "B1995"; years after 2018 become ''.
        # NOTE(review): `x is None` is evaluated after `x<1995`, which would raise for a real None;
        # presumably missing years arrive as NaN instead - confirm.
        chunk['ORIGINATION_YEAR'] = chunk['ORIGINATION_YEAR'].apply(lambda x: "B1995" if x<1995 else '' if (x>2018 or x is None) else x) #.isna()
        for k,v in categorical_cols.items():
            # if (chunk[k].dtype=='O'):                
            chunk[k] = chunk[k].astype('str')
            chunk[k] = chunk[k].str.strip()
            # Strip a trailing '.0' (numbers read as floats) before matching categories.
            # NOTE(review): in-place replace on a selected column may not write back to `chunk`
            # under pandas copy-on-write - verify with the pandas version in use.
            chunk[k].replace(['\.0$'], [''], regex=True,  inplace=True)
            new_cols = oneHotDummies_column(chunk[k], v)
            # Sanity check: every value should have been mapped to exactly one dummy column.
            if (chunk[k].value_counts().sum()!=new_cols.sum().sum()):
                print('Error at categorization, different sizes', k)
                print(chunk[k].value_counts(), new_cols.sum())                
                log_file.write('Error at categorization, different sizes %s\r\n' % str(k))
                chunk[new_cols.columns] = new_cols
            else:
                chunk[new_cols.columns] = new_cols
                log_file.write('New columns added: %s\r\n' % str(new_cols.columns.values))
            
                    
        allfeatures_drop_cols(chunk, descriptive_cols)                    
        #np.savetxt(log_file, descriptive_cols, header='descriptive_cols dropped:', newline=" ")
        log_file.write('descriptive_cols dropped: %s\r\n' % str(descriptive_cols))
        allfeatures_drop_cols(chunk, time_cols)
        #np.savetxt(log_file, time_cols, header='time_cols dropped:', newline=" ")
        log_file.write('time_cols dropped: %s\r\n' % str(time_cols))
        # The original categorical columns are dropped, except DELINQUENCY_STATUS_NEXT,
        # which is kept to become an index level below.
        # NOTE(review): the name is hard-coded here rather than taken from `label`.
        cat_list = list(categorical_cols.keys())
        cat_list.remove('DELINQUENCY_STATUS_NEXT')
        #np.savetxt(log_file, cat_list, header='categorical_cols dropped:', newline=" ")
        log_file.write('categorical_cols dropped: %s\r\n' % str(cat_list))
        allfeatures_drop_cols(chunk, cat_list)

        chunk.reset_index(drop=True, inplace=True)  
        chunk.set_index(['DELINQUENCY_STATUS_NEXT', 'PERIOD'], append=False, inplace=True) #2 indexes
        # np.savetxt(log_file, str(chunk.index.names), header='Indexes created:', newline=" ")
        log_file.write('Indexes created: %s\r\n' % str(chunk.index.names))
         
        if (filtering_cols!=None):
            chunk = chunk[filtering_cols]
            robust_cols = list(set(robust_cols).intersection(filtering_cols))
            log_file.write('Columns Filtered: %s\r\n' % str(chunk.columns.values))
        
        if chunk.isnull().any().any(): 
            # from IPython.core.debugger import Tracer; Tracer()()
            raise ValueError('There are null values...File: ' + file_name)   
                        
        if (refNorm==True):            
            chunk[robust_cols], nnorm_cols =  zscore_apply(dist_file, chunk[robust_cols]) #robust_normalizer.transform(chunk[robust_cols])            
            log_file.write('Columns not normalized: %s\r\n' % str(nnorm_cols))            
            log_file.write('Columns normalized: %s\r\n' % str(set(robust_cols)-set(nnorm_cols)))
            
        
        if chunk.isnull().any().any(): raise ValueError('There are null values...File: ' + file_name)       
        
        chunk_periods = set(list(chunk.index.get_level_values('PERIOD')))
        #print(tfrec)
        # tfrec / hdf are indexed as [train, valid, test] writers.
        if (tfrec!=None):
            train_index = tag_chunk('train', label, chunk, chunk_periods, train_period, log_file, with_index, train_index, tfrec=tfrec[0])
            valid_index = tag_chunk('valid', label, chunk, chunk_periods, valid_period, log_file, with_index, valid_index, tfrec=tfrec[1])
            test_index = tag_chunk('test', label, chunk, chunk_periods, test_period, log_file, with_index, test_index, tfrec=tfrec[2])
            sys.stdout.flush()
        elif (hdf!=None):
            train_index = tag_chunk('train', label, chunk, chunk_periods, train_period, log_file, with_index, train_index, hdf=hdf[0])
            valid_index = tag_chunk('valid', label, chunk, chunk_periods, valid_period, log_file, with_index, valid_index, hdf=hdf[1])
            test_index = tag_chunk('test', label, chunk, chunk_periods, test_period, log_file, with_index, test_index, hdf=hdf[2])                
        
        # Records after the test period are only counted for the log, never written.
        # NOTE(review): 355 is a hard-coded upper bound on the period - confirm its origin.
        inter_periods = list(chunk_periods.intersection(set(range(test_period[1]+1,355))))    
        log_file.write('Periods greater than test_period: %s\r\n' % str(inter_periods))
        p_chunk = chunk.loc[(slice(None), inter_periods), :]
        log_file.write('Records greater than test_period - Number of rows: %d\r\n' % (p_chunk.shape[0]))
        
        del chunk        
        i +=  1   
    
    return train_index, valid_index, test_index

In [13]:
def custom_robust_normalizer(ncols, dist_file, normalizer_type='robust_scaler_sk', center_value='median'):            
    '''Build a quartile-based (robust) normalizer from precomputed statistics.

        For every feature ``X`` in `ncols`, the first row of `dist_file` is read under the
        exact column names ``X_Q1``, ``X_Q3`` and (for center_value='median') ``X_MEDIAN``.
        The scale is the inter-quartile range Q3 - Q1; features without both quartile
        columns, or with a zero range, are skipped (the zero-range ones are printed).

        Args:
            ncols (list of str): candidate feature names.
            dist_file (DataFrame): statistics table (first row is used).
            normalizer_type (string): 'robust_scaler_sk' returns a RobustScaler whose scale_ and
                center_ are set from the statistics; 'percentile_scaler' returns a
                Normalizer.Normalizer(scales, centers); anything else returns None.
            center_value (string): 'median' centers on X_MEDIAN, any other value on X_Q1.
        Returns:
            (norm_cols, normalizer): the features that can be normalized, in input order,
            and the normalizer object (or None).
        Raises:
            KeyError: if center_value is 'median' and X_MEDIAN is missing for a kept feature.
    '''
    norm_cols = []
    scales = []
    centers = []
    scales_0 = []
    # Exact name lookup; a substring search for X + '_Q' could match the
    # quartile columns of a different feature whose name merely contains X.
    available = set(dist_file.columns)

    def _numeric(name):
        # First row of the statistic; non-numeric text becomes NaN.
        return float(pd.to_numeric(dist_file[name].iloc[0], errors='coerce'))

    for x in ncols:
        q1_key = x + '_Q1'
        q3_key = x + '_Q3'
        if q1_key not in available or q3_key not in available:
            continue
        iqr = _numeric(q3_key) - _numeric(q1_key)
        if iqr == 0:
            scales_0.append(x)
            continue
        norm_cols.append(x)
        scales.append(iqr)
        if center_value == 'median':
            # Read from dist_file itself: the median column is not one of the
            # quartile columns, so it must not be looked up among them.
            centers.append(float(dist_file[x + '_MEDIAN'].iloc[0]))
        else:
            centers.append(float(dist_file[q1_key].iloc[0]))

    if (normalizer_type == 'robust_scaler_sk'):    
        normalizer = RobustScaler()
        normalizer.scale_ = scales
        normalizer.center_ = centers        
    elif (normalizer_type == 'percentile_scaler'):    
        normalizer = Normalizer.Normalizer(scales, centers)     
    else:
        normalizer = None

    print(scales_0)

    return norm_cols, normalizer

In [14]:
def custom_minmax_normalizer(ncols, scales, dist_file):    
    '''Build a min-max normalizer from precomputed statistics.

        For every feature ``X`` in `ncols`, the first row of `dist_file` is read under the
        exact column names ``X_MIN`` and ``X_MAX``. Features lacking either column are skipped.

        Args:
            ncols (list of str): candidate feature names.
            scales: not used by this function; kept for interface compatibility.
            dist_file (DataFrame): statistics table (first row is used).
        Returns:
            (norm_cols, normalizer): the features that can be normalized, in input order, and
            a Normalizer.Normalizer built with scales = max - min and centers = min.
        Raises:        
    '''
    norm_cols = []
    minmax_scales = []
    centers = []
    # Exact name lookup; a substring search for X + '_MIN' could return the
    # statistics of a different feature whose name merely ends with X.
    available = set(dist_file.columns)
    for x in ncols:
        min_key = x + '_MIN'
        max_key = x + '_MAX'
        if min_key in available and max_key in available:
            x_min = np.float32(dist_file[min_key].iloc[0])
            x_max = np.float32(dist_file[max_key].iloc[0])
            # NOTE(review): a feature with max == min gets a zero scale; how
            # Normalizer handles that is not visible here - confirm.
            minmax_scales.append(x_max - x_min)
            centers.append(x_min)
            norm_cols.append(x)

    normalizer = Normalizer.Normalizer(minmax_scales, centers)         

    return norm_cols, normalizer

In [15]:
def allfeatures_preprocessing(RAW_DIR, PRO_DIR, raw_dir, train_period, valid_period, test_period, dividing='percentage', 
                              chunksize=500000, refNorm=True, with_index=True, output_hdf=True, 
                              label='DELINQUENCY_STATUS_NEXT', filtering_cols=None):            

    descriptive_cols = [
    'LOAN_ID',
    'ASOFMONTH',        
    'PERIOD_NEXT',
    'MOD_PER_FROM',
    'MOD_PER_TO',
    'PROPERTY_ZIP',
    'INVALID_TRANSITIONS',
    'CONSECUTIVE'
    ]

    numeric_cols = ['MBA_DAYS_DELINQUENT', 'MBA_DAYS_DELINQUENT_NAN',
       'CURRENT_INTEREST_RATE', 'CURRENT_INTEREST_RATE_NAN', 'LOANAGE', 'LOANAGE_NAN',
       'CURRENT_BALANCE', 'CURRENT_BALANCE_NAN', 'SCHEDULED_PRINCIPAL',
       'SCHEDULED_PRINCIPAL_NAN', 'SCHEDULED_MONTHLY_PANDI',
       'SCHEDULED_MONTHLY_PANDI_NAN', 
       'LLMA2_CURRENT_INTEREST_SPREAD', 'LLMA2_CURRENT_INTEREST_SPREAD_NAN',  
       'LLMA2_C_IN_LAST_12_MONTHS',
       'LLMA2_30_IN_LAST_12_MONTHS', 'LLMA2_60_IN_LAST_12_MONTHS',
       'LLMA2_90_IN_LAST_12_MONTHS', 'LLMA2_FC_IN_LAST_12_MONTHS',
       'LLMA2_REO_IN_LAST_12_MONTHS', 'LLMA2_0_IN_LAST_12_MONTHS',       
       'NUM_MODIF', 'NUM_MODIF_NAN', 'P_RATE_TO_MOD', 'P_RATE_TO_MOD_NAN', 'MOD_RATE',
       'MOD_RATE_NAN', 'DIF_RATE', 'DIF_RATE_NAN', 'P_MONTHLY_PAY',
       'P_MONTHLY_PAY_NAN', 'MOD_MONTHLY_PAY', 'MOD_MONTHLY_PAY_NAN',
       'DIF_MONTHLY_PAY', 'DIF_MONTHLY_PAY_NAN', 'CAPITALIZATION_AMT',
       'CAPITALIZATION_AMT_NAN', 'MORTGAGE_RATE', 'MORTGAGE_RATE_NAN',
       'FICO_SCORE_ORIGINATION', 'INITIAL_INTEREST_RATE', 'ORIGINAL_LTV',
       'ORIGINAL_BALANCE', 'BACKEND_RATIO', 'BACKEND_RATIO_NAN',
       'ORIGINAL_TERM', 'ORIGINAL_TERM_NAN', 'SALE_PRICE', 'SALE_PRICE_NAN', 	   
       'PREPAY_PENALTY_TERM', 'PREPAY_PENALTY_TERM_NAN', 
        'NUMBER_OF_UNITS', 'NUMBER_OF_UNITS_NAN', 'MARGIN',
       'MARGIN_NAN', 'PERIODIC_RATE_CAP', 'PERIODIC_RATE_CAP_NAN',
       'PERIODIC_RATE_FLOOR', 'PERIODIC_RATE_FLOOR_NAN', 'LIFETIME_RATE_CAP',
       'LIFETIME_RATE_CAP_NAN', 'LIFETIME_RATE_FLOOR',
       'LIFETIME_RATE_FLOOR_NAN', 'RATE_RESET_FREQUENCY',
       'RATE_RESET_FREQUENCY_NAN', 'PAY_RESET_FREQUENCY',
       'PAY_RESET_FREQUENCY_NAN', 'FIRST_RATE_RESET_PERIOD',
       'FIRST_RATE_RESET_PERIOD_NAN', 	           
       'LLMA2_ORIG_RATE_SPREAD', 'LLMA2_ORIG_RATE_SPREAD_NAN', 
       'AGI', 'AGI_NAN', 'UR', 'UR_NAN', 'COUNT_INT_RATE_LESS', 'LLMA2_ORIG_RATE_ORIG_MR_SPREAD', 
       'LLMA2_ORIG_RATE_ORIG_MR_SPREAD_NAN', 'NUM_PRIME_ZIP', 'NUM_PRIME_ZIP_NAN'
       ]
    
    binary_cols = ['LLMA2_HIST_LAST_12_MONTHS_MIS', 'LLMA2_PRIME', 
                   'LLMA2_SUBPRIME', 'LLMA2_APPVAL_LT_SALEPRICE']

    '''
    nan_cols = {'MBA_DAYS_DELINQUENT': 'median', 'CURRENT_INTEREST_RATE': 'median', 'LOANAGE': 'median',
                'CURRENT_BALANCE' : 'median', 'SCHEDULED_PRINCIPAL': 'median', 'SCHEDULED_MONTHLY_PANDI': 'median',       
                'LLMA2_CURRENT_INTEREST_SPREAD': 'median', 'NUM_MODIF': 0, 'P_RATE_TO_MOD': 0, 'MOD_RATE': 0,
                'DIF_RATE': 0, 'P_MONTHLY_PAY': 0, 'MOD_MONTHLY_PAY': 0, 'DIF_MONTHLY_PAY': 0, 'CAPITALIZATION_AMT': 0,
                'MORTGAGE_RATE': 'median', 'FICO_SCORE_ORIGINATION': 'median', 'INITIAL_INTEREST_RATE': 'median', 'ORIGINAL_LTV': 'median',
                'ORIGINAL_BALANCE': 'median', 'BACKEND_RATIO': 'median', 'ORIGINAL_TERM': 'median', 'SALE_PRICE': 'median', 'PREPAY_PENALTY_TERM': 'median',
                'NUMBER_OF_UNITS': 'median', 'MARGIN': 'median', 'PERIODIC_RATE_CAP': 'median', 'PERIODIC_RATE_FLOOR': 'median', 'LIFETIME_RATE_CAP': 'median',
                'LIFETIME_RATE_FLOOR': 'median', 'RATE_RESET_FREQUENCY': 'median', 'PAY_RESET_FREQUENCY': 'median',
                'FIRST_RATE_RESET_PERIOD': 'median', 'LLMA2_ORIG_RATE_SPREAD': 'median', 'AGI': 'median', 'UR': 'median',
                'LLMA2_C_IN_LAST_12_MONTHS': 'median', 'LLMA2_30_IN_LAST_12_MONTHS': 'median', 'LLMA2_60_IN_LAST_12_MONTHS': 'median',
                'LLMA2_90_IN_LAST_12_MONTHS': 'median', 'LLMA2_FC_IN_LAST_12_MONTHS': 'median',
                'LLMA2_REO_IN_LAST_12_MONTHS': 'median', 'LLMA2_0_IN_LAST_12_MONTHS': 'median', 
                'LLMA2_ORIG_RATE_ORIG_MR_SPREAD':0, 'NUM_PRIME_ZIP':'median'
                }
    '''
    '''
    set(nan_cols) - set(nan_cols_nonan)
    Out[56]: 
    {'COUNT_INT_RATE_LESS', # never missed
     'FICO_SCORE_ORIGINATION', # never missed
     'INITIAL_INTEREST_RATE', # never missed
     'LLMA2_0_IN_LAST_12_MONTHS', #In average, 14% of missing data!
     'LLMA2_30_IN_LAST_12_MONTHS',
     'LLMA2_60_IN_LAST_12_MONTHS',
     'LLMA2_90_IN_LAST_12_MONTHS',
     'LLMA2_C_IN_LAST_12_MONTHS',
     'LLMA2_FC_IN_LAST_12_MONTHS',
     'LLMA2_REO_IN_LAST_12_MONTHS',
     'ORIGINAL_BALANCE', # never missed
     'ORIGINAL_LTV'} # never missed
    '''
    nan_cols = {'MBA_DAYS_DELINQUENT': 'mean', 'CURRENT_INTEREST_RATE': 'mean', 'LOANAGE': 'mean',
                'CURRENT_BALANCE' : 'mean', 'SCHEDULED_PRINCIPAL': 'mean', 'SCHEDULED_MONTHLY_PANDI': 'mean',       
                'LLMA2_CURRENT_INTEREST_SPREAD': 'mean', 'NUM_MODIF': 0, 'P_RATE_TO_MOD': 0, 'MOD_RATE': 0,
                'DIF_RATE': 0, 'P_MONTHLY_PAY': 0, 'MOD_MONTHLY_PAY': 0, 'DIF_MONTHLY_PAY': 0, 'CAPITALIZATION_AMT': 0,
                'MORTGAGE_RATE': 'mean', 'FICO_SCORE_ORIGINATION': 'mean', 'INITIAL_INTEREST_RATE': 'mean', 'ORIGINAL_LTV': 'mean',
                'ORIGINAL_BALANCE': 'mean', 'BACKEND_RATIO': 'mean', 'ORIGINAL_TERM': 'mean', 'SALE_PRICE': 'mean', 'PREPAY_PENALTY_TERM': 'mean',
                'NUMBER_OF_UNITS': 'mean', 'MARGIN': 'mean', 'PERIODIC_RATE_CAP': 'mean', 'PERIODIC_RATE_FLOOR': 'mean', 'LIFETIME_RATE_CAP': 'mean',
                'LIFETIME_RATE_FLOOR': 'mean', 'RATE_RESET_FREQUENCY': 'mean', 'PAY_RESET_FREQUENCY': 'mean',
                'FIRST_RATE_RESET_PERIOD': 'mean', 'LLMA2_ORIG_RATE_SPREAD': 'mean', 'AGI': 'mean', 'UR': 'mean',
                'LLMA2_C_IN_LAST_12_MONTHS': 'mean', 'LLMA2_30_IN_LAST_12_MONTHS': 'mean', 'LLMA2_60_IN_LAST_12_MONTHS': 'mean',
                'LLMA2_90_IN_LAST_12_MONTHS': 'mean', 'LLMA2_FC_IN_LAST_12_MONTHS': 'mean',
                'LLMA2_REO_IN_LAST_12_MONTHS': 'mean', 'LLMA2_0_IN_LAST_12_MONTHS': 'mean', 
                'LLMA2_ORIG_RATE_ORIG_MR_SPREAD':0, 'COUNT_INT_RATE_LESS' :'median', 'NUM_PRIME_ZIP':'mean'
                }
    
    categorical_cols = {'MBA_DELINQUENCY_STATUS':  ['0','3','6','9','C','F','R'], 'DELINQUENCY_STATUS_NEXT': ['0','3','6','9','C','F','R'],  #,'S','T','X'
                           'BUYDOWN_FLAG': ['N','U','Y'], 'NEGATIVE_AMORTIZATION_FLAG': ['N','U','Y'], 'PREPAY_PENALTY_FLAG': ['N','U','Y'],
                           'OCCUPANCY_TYPE': ['1','2','3','U'], 'PRODUCT_TYPE': ['10','20','30','40','50','51','52','53','54','5A','5Z',
                                            '60','61','62','63','6Z','70','80','81','82','83','84','8Z','U'], 
                           'PROPERTY_TYPE': ['1','2','3','4','5','6','7','8','9','L','M','U','Z'], 'LOAN_PURPOSE_CATEGORY': ['P','R','U'], 
                           'DOCUMENTATION_TYPE': ['1','2','3','U'], 'CHANNEL': ['1','2','3','4','5','6','7','8','9','A','B','C','D','U'], 
                           'LOAN_TYPE': ['1','2','3','4','5','6','7','U'], 'IO_FLAG': ['N','U','Y'], 
                           'CONVERTIBLE_FLAG': ['N','U','Y'], 'POOL_INSURANCE_FLAG': ['N','U','Y'], 'STATE': ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO',
                                               'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 
                                               'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 
                                               'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 
                                               'WA', 'WI', 'WV', 'WY'], 
                           'CURRENT_INVESTOR_CODE': ['240', '250', '253', 'U'], 'ORIGINATION_YEAR': ['B1995','1995','1996','1997','1998','1999','2000','2001','2002','2003',
                                                    '2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','nan']}

    time_cols = ['YEAR', 'MONTH'] #, 'PERIOD'] #no nan values        

    total_cols = numeric_cols.copy() 
    total_cols.extend(descriptive_cols)
    total_cols.extend(categorical_cols.keys())
    total_cols.extend(time_cols)
    print('total_cols size: ', len(total_cols)) #110 !=112?? set(chunk_cols) - set(total_cols): {'LOAN_ID', 'PERIOD'}
    
    pd.set_option('io.hdf.default_format','table')

    dist_file = pd.read_csv(os.path.join(RAW_DIR, "percentile features3-mean.csv"), sep=';', low_memory=False)
    dist_file.columns = dist_file.columns.str.upper()

    ncols = [x for x in numeric_cols if x.find('NAN')<0]
    print(ncols)

    #sum = 0
    #for elem in categorical_cols.values():
    #    sum += len(elem)
    #print('total categorical values: ', sum) #181

    for file_path in glob.glob(os.path.join(RAW_DIR, raw_dir,"*.txt")):  
        file_name = os.path.basename(file_path)
        if with_index==True:
            target_path = os.path.join(PRO_DIR, raw_dir,file_name[:-4])        
        else:
            target_path = os.path.join(PRO_DIR, raw_dir,file_name[:-4]+'_non_index')
        log_file=open(target_path+'-log.txt', 'w+', 1)        
        print('Preprocessing File: ' + file_path)
        log_file.write('Preprocessing File:  %s\r\n' % file_path)
        startTime = datetime.now()      
        
        if (output_hdf == True):
            #with  pd.HDFStore(target_path +'-pp.h5', complib='lzo', complevel=9) as hdf: #complib='lzo', complevel=9
            train_writer = pd.HDFStore(target_path +'-train_.h5', complib='lzo', complevel=9) 
            valid_writer = pd.HDFStore(target_path +'-valid_.h5', complib='lzo', complevel=9)
            test_writer = pd.HDFStore(target_path +'-test_.h5', complib='lzo', complevel=9) 

            print('generating: ', target_path +'-pp.h5')
            train_index, valid_index, test_index = prepro_chunk(file_name, file_path, chunksize, label, log_file, 
                                                                nan_cols, categorical_cols, descriptive_cols, time_cols,
                                                                dist_file, with_index, 
                                                                refNorm, train_period, valid_period, test_period, ncols,                                                                
                                                                hdf=[train_writer, valid_writer, test_writer], tfrec=None,
                                                                filtering_cols=filtering_cols)            


            if train_writer.get_storer('train/features').nrows != train_writer.get_storer('train/labels').nrows:
                    raise ValueError('Train-DataSet: Sizes should match!')  
            if valid_writer.get_storer('valid/features').nrows != valid_writer.get_storer('valid/labels').nrows:
                    raise ValueError('Valid-DataSet: Sizes should match!')  
            if test_writer.get_storer('test/features').nrows != test_writer.get_storer('test/labels').nrows:
                    raise ValueError('Test-DataSet: Sizes should match!')  

            print('train/features size: ', train_writer.get_storer('train/features').nrows)
            print('valid/features size: ', valid_writer.get_storer('valid/features').nrows)
            print('test/features size: ', test_writer.get_storer('test/features').nrows)

            log_file.write('***SUMMARY***\n')
            log_file.write('train/features size: %d\r\n' %(train_writer.get_storer('train/features').nrows))
            log_file.write('valid/features size: %d\r\n' %(valid_writer.get_storer('valid/features').nrows))
            log_file.write('test/features size: %d\r\n' %(test_writer.get_storer('test/features').nrows))

            logger.info('training, validation and testing set into .h5 file')        
        else:        
            train_writer = tf.python_io.TFRecordWriter(target_path +'-train_.tfrecords')
            valid_writer = tf.python_io.TFRecordWriter(target_path +'-valid_.tfrecords')
            test_writer = tf.python_io.TFRecordWriter(target_path +'-test_.tfrecords')
            train_index, valid_index, test_index = prepro_chunk(file_name, file_path, chunksize, label, log_file, 
                                                                nan_cols, categorical_cols, descriptive_cols, time_cols,
                                                                dist_file, with_index, 
                                                                refNorm, train_period, valid_period, test_period, ncols,
                                                                hdf=None, tfrec=[train_writer, valid_writer, test_writer],
                                                                filtering_cols=filtering_cols) 
        print(train_index, valid_index, test_index)
        train_writer.close()
        valid_writer.close()
        test_writer.close()        
        
        #def allfeatures_prepro_file(RAW_DIR, file_path, raw_dir, file_name, target_path, train_period, valid_period, test_period, log_file, dividing='percentage', chunksize=500000, 
        #                    refNorm=True, , with_index=True, output_hdf=True):

        #allfeatures_prepro_file(RAW_DIR, file_path, raw_dir, file_name, target_path, train_num, valid_num, test_num, log_file, dividing=dividing, chunksize=chunksize, 
        #                        refNorm=refNorm, with_index=with_index, output_hdf=output_hdf)          
        
        startTime = datetime.now() - startTime
        print('Preprocessing Time per file: ', startTime)     
        log_file.write('Preprocessing Time per file:  %s\r\n' % str(startTime))
        log_file.close()


In [16]:
def allclasses_Ncomp_71feat():
    """Return the 71 column names of the "allclasses / Ncomp" feature selection.

    The names mix one-hot categorical columns, ``*_NAN`` indicator columns and
    a few plain numeric columns. The order of the list is kept exactly as it
    was recorded.
    NOTE(review): presumably the whole-dataset counterpart of
    ``perclass_Ncomp_71feat`` -- confirm how this selection was produced.

    Returns:
        list of str: a fresh list on every call, so callers may mutate it.
    """
    selected_features = [
        'PRODUCT_TYPE_20',
        'IO_FLAG_U',
        'NEGATIVE_AMORTIZATION_FLAG_N',
        'LOAN_TYPE_1',
        'NEGATIVE_AMORTIZATION_FLAG_U',
        'IO_FLAG_N',
        'CURRENT_INVESTOR_CODE_250',
        'NEGATIVE_AMORTIZATION_FLAG_Y',
        'LOAN_PURPOSE_CATEGORY_U',
        'PREPAY_PENALTY_FLAG_U',
        'LOAN_PURPOSE_CATEGORY_P',
        'CHANNEL_D',
        'CONVERTIBLE_FLAG_N',
        'IO_FLAG_Y',
        'CONVERTIBLE_FLAG_U',
        'LOAN_PURPOSE_CATEGORY_R',
        'ORIGINATION_YEAR_B1995',
        'CHANNEL_U',
        'POOL_INSURANCE_FLAG_U',
        'CHANNEL_2',
        'PREPAY_PENALTY_FLAG_Y',
        'PROPERTY_TYPE_6',
        'DOCUMENTATION_TYPE_U',
        'PRODUCT_TYPE_10',
        'CURRENT_INVESTOR_CODE_U',
        'PERIODIC_RATE_FLOOR_NAN',
        'PERIODIC_RATE_CAP_NAN',
        'LIFETIME_RATE_FLOOR_NAN',
        'PAY_RESET_FREQUENCY_NAN',
        'CONVERTIBLE_FLAG_Y',
        'DOCUMENTATION_TYPE_2',
        'POOL_INSURANCE_FLAG_N',
        'RATE_RESET_FREQUENCY_NAN',
        'FIRST_RATE_RESET_PERIOD_NAN',
        'PROPERTY_TYPE_2',
        'CURRENT_INVESTOR_CODE_253',
        'LOAN_TYPE_3',
        'LIFETIME_RATE_CAP_NAN',
        'PREPAY_PENALTY_FLAG_N',
        'OCCUPANCY_TYPE_U',
        'SCHEDULED_MONTHLY_PANDI_NAN',
        'ORIGINATION_YEAR_2012',
        'BUYDOWN_FLAG_N',
        'ORIGINATION_YEAR_2008',
        'BUYDOWN_FLAG_U',
        'MARGIN',
        'LOAN_TYPE_2',
        'ORIGINATION_YEAR_2007',
        'LLMA2_ORIG_RATE_ORIG_MR_SPREAD',
        'AGI_NAN',
        'ORIGINATION_YEAR_2006',
        'DOCUMENTATION_TYPE_1',
        'CHANNEL_1',
        'ORIGINATION_YEAR_1999',
        'CURRENT_INVESTOR_CODE_240',
        'PROPERTY_TYPE_U',
        'MARGIN_NAN',
        'ORIGINATION_YEAR_2013',
        'ORIGINATION_YEAR_2004',
        'ORIGINATION_YEAR_1998',
        'OCCUPANCY_TYPE_2',
        'CHANNEL_3',
        'LIFETIME_RATE_FLOOR',
        'PROPERTY_TYPE_1',
        'PERIODIC_RATE_CAP',
        'ORIGINATION_YEAR_2005',
        'PRODUCT_TYPE_82',
        'LLMA2_HIST_LAST_12_MONTHS_MIS',
        'LOANAGE',
        'PROPERTY_TYPE_5',
        'SCHEDULED_PRINCIPAL_NAN',
    ]
    return selected_features

def perclass_Ncomp_71feat():
    """Return the 71 features selected from allcols (size=257) on a per-class dataset.

    The selection was obtained with ``n_components=None``. The order of the
    list is kept exactly as it was recorded.

    Returns:
        list of str: a fresh list on every call, so callers may mutate it.
    """
    selected_features = [
        'PRODUCT_TYPE_20',
        'NEGATIVE_AMORTIZATION_FLAG_N',
        'NEGATIVE_AMORTIZATION_FLAG_U',
        'CONVERTIBLE_FLAG_N',
        'CONVERTIBLE_FLAG_U',
        'IO_FLAG_U',
        'NEGATIVE_AMORTIZATION_FLAG_Y',
        'LOAN_TYPE_1',
        'CHANNEL_U',
        'LOAN_PURPOSE_CATEGORY_U',
        'PRODUCT_TYPE_10',
        'BUYDOWN_FLAG_N',
        'BUYDOWN_FLAG_U',
        'DOCUMENTATION_TYPE_U',
        'CHANNEL_2',
        'LOAN_PURPOSE_CATEGORY_R',
        'PREPAY_PENALTY_FLAG_Y',
        'IO_FLAG_N',
        'LOAN_PURPOSE_CATEGORY_P',
        'CHANNEL_D',
        'POOL_INSURANCE_FLAG_U',
        'LOAN_TYPE_3',
        'PREPAY_PENALTY_FLAG_U',
        'PROPERTY_TYPE_6',
        'LIFETIME_RATE_CAP_NAN',
        'CURRENT_INVESTOR_CODE_253',
        'POOL_INSURANCE_FLAG_N',
        'CURRENT_INVESTOR_CODE_U',
        'PERIODIC_RATE_FLOOR_NAN',
        'OCCUPANCY_TYPE_U',
        'IO_FLAG_Y',
        'DOCUMENTATION_TYPE_2',
        'LIFETIME_RATE_FLOOR_NAN',
        'RATE_RESET_FREQUENCY_NAN',
        'PERIODIC_RATE_CAP_NAN',
        'PROPERTY_TYPE_2',
        'OCCUPANCY_TYPE_3',
        'PAY_RESET_FREQUENCY_NAN',
        'PREPAY_PENALTY_FLAG_N',
        'FIRST_RATE_RESET_PERIOD_NAN',
        'CHANNEL_1',
        'PROPERTY_TYPE_U',
        'ORIGINATION_YEAR_2007',
        'CURRENT_INVESTOR_CODE_240',
        'CHANNEL_3',
        'DOCUMENTATION_TYPE_1',
        'ORIGINATION_YEAR_B1995',
        'LLMA2_ORIG_RATE_ORIG_MR_SPREAD',
        'ORIGINATION_YEAR_2008',
        'PRODUCT_TYPE_80',
        'CURRENT_INVESTOR_CODE_250',
        'MARGIN_NAN',
        'ORIGINATION_YEAR_2006',
        'PERIODIC_RATE_CAP',
        'ORIGINATION_YEAR_2005',
        'SCHEDULED_MONTHLY_PANDI_NAN',
        'ORIGINATION_YEAR_2003',
        'ORIGINATION_YEAR_2000',
        'ORIGINATION_YEAR_2004',
        'PROPERTY_TYPE_1',
        'LOAN_TYPE_2',
        'SCHEDULED_PRINCIPAL_NAN',
        'BUYDOWN_FLAG_Y',
        'CONVERTIBLE_FLAG_Y',
        'STATE_CA',
        'PERIODIC_RATE_FLOOR',
        'AGI_NAN',
        'OCCUPANCY_TYPE_1',
        'PRODUCT_TYPE_82',
        'LIFETIME_RATE_FLOOR',
        'MARGIN',
    ]
    return selected_features
    
def filtering_allfeatures(cols):
    """Extend a feature selection with the ``DELINQUENCY_STATUS_NEXT_*`` columns.

    Args:
        cols: list of selected feature column names; it is not modified.

    Returns:
        A new list holding ``cols`` followed by the seven one-hot
        ``DELINQUENCY_STATUS_NEXT_*`` column names.
    """
    status_next_cols = [
        'DELINQUENCY_STATUS_NEXT_0',
        'DELINQUENCY_STATUS_NEXT_3',
        'DELINQUENCY_STATUS_NEXT_6',
        'DELINQUENCY_STATUS_NEXT_9',
        'DELINQUENCY_STATUS_NEXT_C',
        'DELINQUENCY_STATUS_NEXT_F',
        'DELINQUENCY_STATUS_NEXT_R',
    ]
    # `+` builds a new list, so the caller's selection stays untouched.
    return cols + status_next_cols


In [17]:
def allclass_Ncomp_26numfeat():
    """Return the 26 features selected from numerical_cols (size=50) on the whole dataset.

    The selection was obtained with ``n_components=None``. The order of the
    list is kept exactly as it was recorded.

    Returns:
        list of str: a fresh list on every call, so callers may mutate it.
    """
    selected_numeric = [
        'LOANAGE',
        'COUNT_INT_RATE_LESS',
        'MORTGAGE_RATE',
        'LLMA2_ORIG_RATE_ORIG_MR_SPREAD',
        'LLMA2_HIST_LAST_12_MONTHS_MIS',
        'ORIGINAL_LTV',
        'ORIGINAL_BALANCE',
        'UR',
        'INITIAL_INTEREST_RATE',
        'CURRENT_BALANCE',
        'ORIGINAL_TERM',
        'LLMA2_PRIME',
        'MARGIN',
        'LLMA2_90_IN_LAST_12_MONTHS',
        'LLMA2_ORIG_RATE_SPREAD',
        'LLMA2_30_IN_LAST_12_MONTHS',
        'LLMA2_SUBPRIME',
        'NUM_PRIME_ZIP',
        'LLMA2_FC_IN_LAST_12_MONTHS',
        'LLMA2_CURRENT_INTEREST_SPREAD',
        'AGI',
        'MBA_DAYS_DELINQUENT',
        'LLMA2_C_IN_LAST_12_MONTHS',
        'CURRENT_INTEREST_RATE',
        'LIFETIME_RATE_FLOOR',
        'LLMA2_60_IN_LAST_12_MONTHS',
    ]
    return selected_numeric


def perclass_Ncomp_26numfeat():
    """Return the 26 features selected from numerical_cols (size=50) on a per-class dataset.

    The selection was obtained with ``n_components=None``. The order of the
    list is kept exactly as it was recorded.

    Returns:
        list of str: a fresh list on every call, so callers may mutate it.
    """
    selected_numeric = [
        'LOANAGE',
        'MARGIN',
        'MORTGAGE_RATE',
        'LLMA2_ORIG_RATE_ORIG_MR_SPREAD',
        'LLMA2_HIST_LAST_12_MONTHS_MIS',
        'COUNT_INT_RATE_LESS',
        'LIFETIME_RATE_FLOOR',
        'INITIAL_INTEREST_RATE',
        'LIFETIME_RATE_CAP',
        'LLMA2_PRIME',
        'LLMA2_ORIG_RATE_SPREAD',
        'ORIGINAL_BALANCE',
        'CURRENT_BALANCE',
        'UR',
        'LLMA2_SUBPRIME',
        'MOD_RATE',
        'LLMA2_CURRENT_INTEREST_SPREAD',
        'RATE_RESET_FREQUENCY',
        'CURRENT_INTEREST_RATE',
        'PAY_RESET_FREQUENCY',
        'DIF_RATE',
        'NUM_MODIF',
        'AGI',
        'PERIODIC_RATE_FLOOR',
        'LLMA2_30_IN_LAST_12_MONTHS',
        'LLMA2_C_IN_LAST_12_MONTHS',
    ]
    return selected_numeric

def filtering_num_features(ncols):
    """Build the column filter for a selection of numeric features.

    The returned list is made of, in this order:
      1. the numeric columns exactly as given in ``ncols``;
      2. the ``<feature>_NAN`` indicator column of every selected feature that
         has one, in the order of the indicator list below;
      3. a fixed set of one-hot categorical column names;
      4. the seven ``DELINQUENCY_STATUS_NEXT_*`` label columns.

    Args:
        ncols: list of numeric feature names (e.g. the result of
            ``allclass_Ncomp_26numfeat()``). It is not modified.

    Returns:
        A new list of column names.
    """
    all_nan_cols = [
        'MBA_DAYS_DELINQUENT_NAN',
        'CURRENT_INTEREST_RATE_NAN',
        'LOANAGE_NAN',
        'CURRENT_BALANCE_NAN',
        'SCHEDULED_PRINCIPAL_NAN',
        'SCHEDULED_MONTHLY_PANDI_NAN',
        'LLMA2_CURRENT_INTEREST_SPREAD_NAN',
        'NUM_MODIF_NAN',
        'P_RATE_TO_MOD_NAN',
        'MOD_RATE_NAN',
        'DIF_RATE_NAN',
        'P_MONTHLY_PAY_NAN',
        'MOD_MONTHLY_PAY_NAN',
        'DIF_MONTHLY_PAY_NAN',
        'CAPITALIZATION_AMT_NAN',
        'MORTGAGE_RATE_NAN',
        'BACKEND_RATIO_NAN',
        'ORIGINAL_TERM_NAN',
        'SALE_PRICE_NAN',
        'PREPAY_PENALTY_TERM_NAN',
        'NUMBER_OF_UNITS_NAN',
        'MARGIN_NAN',
        'PERIODIC_RATE_CAP_NAN',
        'PERIODIC_RATE_FLOOR_NAN',
        'LIFETIME_RATE_CAP_NAN',
        'LIFETIME_RATE_FLOOR_NAN',
        'RATE_RESET_FREQUENCY_NAN',
        'PAY_RESET_FREQUENCY_NAN',
        'FIRST_RATE_RESET_PERIOD_NAN',
        'LLMA2_ORIG_RATE_SPREAD_NAN',
        'AGI_NAN',
        'UR_NAN',
        'LLMA2_ORIG_RATE_ORIG_MR_SPREAD_NAN',
        'NUM_PRIME_ZIP_NAN',
    ]

    # An indicator belongs to a feature only when it is exactly '<feature>_NAN'.
    # A prefix test would also pick up indicators of other features whose name
    # merely starts with the same text (e.g. 'MOD' vs 'MOD_RATE_NAN') and would
    # emit the same indicator twice when a feature is listed twice.
    wanted_nan_cols = {feature + '_NAN' for feature in ncols}
    sel_nan_cols = [nan_col for nan_col in all_nan_cols if nan_col in wanted_nan_cols]

    # (column prefix, category values) pairs; expanded below to '<prefix>_<value>'.
    # NOTE(review): the categorical_cols mapping in allfeatures_preprocessing also
    # lists PROPERTY_TYPE 'L', LOAN_TYPE '7' and ORIGINATION_YEAR 'nan'; those
    # one-hot columns are not kept by this filter -- confirm this is intentional.
    flag_values = ['N', 'U', 'Y']
    one_hot_values = [
        ('MBA_DELINQUENCY_STATUS', ['0', '3', '6', '9', 'C', 'F', 'R']),
        ('BUYDOWN_FLAG', flag_values),
        ('NEGATIVE_AMORTIZATION_FLAG', flag_values),
        ('PREPAY_PENALTY_FLAG', flag_values),
        ('OCCUPANCY_TYPE', ['1', '2', '3', 'U']),
        ('PRODUCT_TYPE', ['10', '20', '30', '40', '50', '51', '52', '53',
                          '54', '5A', '5Z', '60', '61', '62', '63', '6Z',
                          '70', '80', '81', '82', '83', '84', '8Z', 'U']),
        ('PROPERTY_TYPE', ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'M', 'U', 'Z']),
        ('LOAN_PURPOSE_CATEGORY', ['P', 'R', 'U']),
        ('DOCUMENTATION_TYPE', ['1', '2', '3', 'U']),
        ('CHANNEL', ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'U']),
        ('LOAN_TYPE', ['1', '2', '3', '4', '5', '6', 'U']),
        ('IO_FLAG', flag_values),
        ('CONVERTIBLE_FLAG', flag_values),
        ('POOL_INSURANCE_FLAG', flag_values),
        ('STATE', ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO',
                   'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
                   'IA', 'ID', 'IL', 'IN', 'KS', 'KY',
                   'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
                   'MO', 'MS', 'MT', 'NC', 'ND', 'NE',
                   'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
                   'OK', 'OR', 'PA', 'PR', 'RI', 'SC',
                   'SD', 'TN', 'TX', 'UT', 'VA', 'VT',
                   'WA', 'WI', 'WV', 'WY']),
        ('CURRENT_INVESTOR_CODE', ['240', '250', '253', 'U']),
        # 'B1995' followed by every year from 1995 to 2018 inclusive.
        ('ORIGINATION_YEAR', ['B1995'] + [str(year) for year in range(1995, 2019)]),
    ]
    cat_cols = [prefix + '_' + value
                for prefix, values in one_hot_values
                for value in values]

    lab_cols = ['DELINQUENCY_STATUS_NEXT_' + status
                for status in ['0', '3', '6', '9', 'C', 'F', 'R']]

    # List concatenation creates a new list; the caller's ncols is left as is.
    allcols = ncols + sel_nan_cols + cat_cols + lab_cols
    return allcols


In [25]:
# Driver cell: run the whole preprocessing pipeline and report the elapsed time.
startTime = datetime.now()

# exist_ok=True replaces the exists()/makedirs() pair: the target path is built
# once and there is no window between the check and the creation.
os.makedirs(os.path.join(PRO_DIR, FLAGS.prepro_dir), exist_ok=True)

# Column filter handed to the pipeline as `filtering_cols`.
# None presumably means "no column filtering" -- TODO confirm in prepro_chunk.
# Alternative selections defined in this notebook:
#   filtering_num_features(allclass_Ncomp_26numfeat())
#   filtering_num_features(perclass_Ncomp_26numfeat())
#   filtering_allfeatures(allclasses_Ncomp_71feat())
#   filtering_allfeatures(perclass_Ncomp_71feat())
allcols = None

allfeatures_preprocessing(RAW_DIR, PRO_DIR, FLAGS.prepro_dir, FLAGS.train_period, FLAGS.valid_period, FLAGS.test_period, dividing='percentage', 
                          chunksize=FLAGS.prepro_chunksize, refNorm=FLAGS.ref_norm, with_index=FLAGS.prepro_with_index, output_hdf=True, filtering_cols=allcols)        
print('Preprocessing - Time: ', datetime.now() - startTime)

total_cols size:  107
['MBA_DAYS_DELINQUENT', 'CURRENT_INTEREST_RATE', 'LOANAGE', 'CURRENT_BALANCE', 'SCHEDULED_PRINCIPAL', 'SCHEDULED_MONTHLY_PANDI', 'LLMA2_CURRENT_INTEREST_SPREAD', 'LLMA2_C_IN_LAST_12_MONTHS', 'LLMA2_30_IN_LAST_12_MONTHS', 'LLMA2_60_IN_LAST_12_MONTHS', 'LLMA2_90_IN_LAST_12_MONTHS', 'LLMA2_FC_IN_LAST_12_MONTHS', 'LLMA2_REO_IN_LAST_12_MONTHS', 'LLMA2_0_IN_LAST_12_MONTHS', 'NUM_MODIF', 'P_RATE_TO_MOD', 'MOD_RATE', 'DIF_RATE', 'P_MONTHLY_PAY', 'MOD_MONTHLY_PAY', 'DIF_MONTHLY_PAY', 'CAPITALIZATION_AMT', 'MORTGAGE_RATE', 'FICO_SCORE_ORIGINATION', 'INITIAL_INTEREST_RATE', 'ORIGINAL_LTV', 'ORIGINAL_BALANCE', 'BACKEND_RATIO', 'ORIGINAL_TERM', 'SALE_PRICE', 'PREPAY_PENALTY_TERM', 'NUMBER_OF_UNITS', 'MARGIN', 'PERIODIC_RATE_CAP', 'PERIODIC_RATE_FLOOR', 'LIFETIME_RATE_CAP', 'LIFETIME_RATE_FLOOR', 'RATE_RESET_FREQUENCY', 'PAY_RESET_FREQUENCY', 'FIRST_RATE_RESET_PERIOD', 'LLMA2_ORIG_RATE_SPREAD', 'AGI', 'UR', 'LLMA2_ORIG_RATE_ORIG_MR_SPREAD', 'NUM_PRIME_ZIP']
Preprocessing File: 

2018-12-19 00:29:10,808 - drop_invalid_delinquency_status - INFO - dropping invalid transitions and delinquency status, fill nan values, drop duplicates
2018-12-19 00:29:13,559 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...
2018-12-19 00:29:13,619 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...
2018-12-19 00:29:13,670 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


STANDARD DEV zero:  []


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
2018-12-19 00:29:31,807 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...


Records for train Set - Number of rows: 83641


2018-12-19 00:29:31,905 - allfeatures_extract_labels - INFO - ...Labels extracted from Dataset...
2018-12-19 00:29:32,816 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...
2018-12-19 00:29:32,889 - allfeatures_extract_labels - INFO - ...Labels extracted from Dataset...


Records for valid Set - Number of rows: 2899


2018-12-19 00:29:33,025 - allfeatures_drop_cols - INFO - ...Columns Excluded from dataset...
2018-12-19 00:29:33,100 - allfeatures_extract_labels - INFO - ...Labels extracted from Dataset...


Records for test Set - Number of rows: 6335


2018-12-19 00:29:33,195 - allfeatures_extract_labels - INFO - training, validation and testing set into .h5 file


train/features size:  83641
valid/features size:  2899
test/features size:  6335
83641 2899 6335
Preprocessing Time per file:  0:00:25.153288
Preprocessing - Time:  0:00:25.170052
