1. Read the input file
2. Prepare the data for each single feature
3. Combine the prepared data
4. Dump the combined data to an "oven-ready" file

Tasks in each feature extraction step:
in : data frame
out: processed data frame

1. Set up (add) the columns needed for the feature
2. Group by and aggregate, where applicable
3. Drop columns that are no longer used
4. Run garbage collection (`gc.collect()`)
5. Write the data frame to CSV
6. Return the processed data frame and the CSV name


In [2]:
import pandas as pd
import os

In [3]:
# Input files; the path of the file being processed decides which feature
# sub-folder (train / valid / test) the generated CSVs are written to.
train_file = 'eda/input/train_specific_hours_sample.csv'
valid_file = 'eda/input/valid_specific_hours_sample.csv'
test_file = 'input/test_parsed.csv'

# Suffix inserted between a feature name and the ".csv" extension.
comment_for_feature = ""
# When True, an existing feature file is deleted and regenerated instead of reused.
overwrite = True
# Path of the input file currently being processed (assigned by set_file).
file_in_progress = None

In [13]:
def set_file(name):
    """Remember `name` as the input file currently being processed.

    The value is stored in the module-level ``file_in_progress`` variable,
    which ``get_folder`` reads to pick the output folder.
    """
    global file_in_progress
    file_in_progress = name
def feature_file(file):
    """Return the lower-cased path (output folder + CSV file name) for `file`."""
    folder = folder_path(file)
    filename = csv_name(file)
    return (folder + filename).lower()
def csv_name(file):
    """Return the CSV file name for a feature.

    A name that already ends in ``.csv`` is returned unchanged; otherwise the
    module-level ``comment_for_feature`` suffix and the ``.csv`` extension are
    appended.

    The extension is detected with a suffix check rather than a substring
    search, so a feature whose name merely contains "csv" (e.g. ``csv_count``)
    still gets its extension.
    """
    if file.endswith('.csv'):
        return file
    return file + comment_for_feature + '.csv'
def folder_path(name):
    """Return the output folder for `name`, creating it if it does not exist.

    The folder is resolved by ``get_folder``. ``exist_ok=True`` makes the
    creation a single call with no separate existence check, so it cannot
    fail when the folder appears between the check and the creation.
    """
    folder = get_folder(name)
    os.makedirs(folder, exist_ok=True)
    return folder
def get_folder(name):
    """Return the feature output folder for the file currently in progress.

    The folder is chosen from the module-level ``file_in_progress`` path:
    the first of 'train', 'valid', 'test' found in it selects the matching
    sub-folder of ``eda/features/``; otherwise ``eda/features/`` itself is
    returned. `name` is accepted for interface compatibility but not used.

    When no file has been set yet (``file_in_progress`` is None) the base
    folder is returned instead of raising ``TypeError``.
    """
    current = file_in_progress or ''
    # Order matters: it mirrors the original train -> valid -> test precedence.
    for split in ('train', 'valid', 'test'):
        if split in current:
            return 'eda/features/' + split + '/'
    return 'eda/features/'
    
def log_feature(df,name):
    """Write `df` to the feature CSV for `name`, with a header row and no index."""
    target = feature_file(name)
    df.to_csv(target, index=False, header=True)

def skip_if_already_exists(agg_name):
    """Return a previously saved feature frame, or None if it must be rebuilt.

    Behaviour depends on the module-level ``overwrite`` flag:

    * ``overwrite`` true  -- any existing feature file is deleted and None is
      returned, so the caller regenerates the feature.
    * ``overwrite`` false -- the existing feature file is loaded and returned;
      None is returned when there is no such file.
    """
    path = feature_file(agg_name)
    found = os.path.exists(path)

    if overwrite:
        if found:
            print('deleting ', path, '...')
            os.remove(path)
        return None

    if not found:
        return None

    print(path + ' already exists')
    return pd.read_csv(path)

In [5]:
IN_TEST_HH='IN_TEST_HH'
def in_test_hh(df,key=IN_TEST_HH,
               most_freq_hours=(4, 5, 9, 10, 13, 14),
               least_freq_hours=(6, 11, 15)):
    """Add a column that buckets each row's hour by its frequency in the test data.

    The new column `key` (dtype uint8) is computed as
    ``3 - 2*[hour in most_freq_hours] - 1*[hour in least_freq_hours]``:
    1 for a most-frequent hour, 2 for a least-frequent hour, 3 otherwise.

    Parameters
    ----------
    df : DataFrame with an 'hour' column. It is modified in place.
    key : name of the column to add and of the feature file to write.
    most_freq_hours, least_freq_hours : hours that define the buckets; the
        defaults are the values previously hard-coded in this function.

    Returns
    -------
    The cached frame when ``skip_if_already_exists`` finds one, otherwise `df`
    with the new column (also written out through ``log_feature``).

    Raises
    ------
    KeyError
        If `df` has no 'hour' column.
    """
    import gc

    cached = skip_if_already_exists(key)
    if cached is not None:
        return cached

    # Fail with an actionable message instead of a bare KeyError: 'hour'.
    if 'hour' not in df.columns:
        raise KeyError("in_test_hh needs an 'hour' column; available columns: "
                       + ', '.join(map(str, df.columns)))

    gc.collect()
    in_most = df['hour'].isin(list(most_freq_hours))
    in_least = df['hour'].isin(list(least_freq_hours))
    # Boolean masks are promoted to integers by the arithmetic.
    df[key] = (3 - 2*in_most - 1*in_least).astype('uint8')
    log_feature(df,key)
    return df

In [6]:
def aggregate_cols( df, group_cols, agg_name, agg_type='uint32', show_max=False, show_agg=True):
    """Add a column holding the number of rows in each `group_cols` group.

    The per-group counts are taken from the cached feature file when
    ``skip_if_already_exists`` returns one; otherwise they are computed from
    `df` and written out through ``log_feature``. The counts are then
    left-merged back onto `df`.

    Parameters
    ----------
    df : input DataFrame containing every column in `group_cols`.
    group_cols : list of column names to group by.
    agg_name : name of the count column / feature file.
    agg_type : dtype the count column is cast to.
    show_max : print the maximum count (printed before the cast).
    show_agg : print a progress line naming the grouping columns.

    Returns
    -------
    A new DataFrame: `df` plus the `agg_name` column. The caller's frame is
    not modified.
    """
    import gc

    if show_agg:
        print( "Aggregating by ", group_cols , '...' )

    gp = skip_if_already_exists(agg_name)
    if gp is None:
        # Select the key columns once; a second identical selection only
        # produced an extra copy.
        gp = (
            df[group_cols]
            .groupby(group_cols)
            .size()
            .rename(agg_name)
            .to_frame()
            .reset_index()
        )
        log_feature(gp, agg_name)

    # A stale `agg_name` column (e.g. from a re-run) would make the merge emit
    # `_x`/`_y` suffixed columns and the lookups below fail with KeyError.
    if agg_name in df.columns:
        df = df.drop(columns=[agg_name])

    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return df

In [7]:
BASE_FEATURES='base_features'
def base_features(df,key=BASE_FEATURES):
    """Save the unmodified input frame as the base feature set.

    Returns the cached frame when ``skip_if_already_exists`` provides one;
    otherwise writes `df` out through ``log_feature`` and returns `df`.
    """
    cached = skip_if_already_exists(key)
    if cached is not None:
        return cached
    log_feature(df, key)
    return df

In [8]:
# Keys (also used as column / file names) of the group-count features below.
IP_DAY_TEST_HH = 'IP_DAY_TEST_HH'
IP_DAY_HH = 'IP_DAY_HH'
IP_OS_HH = 'IP_OS_HH'
IP_APP_HH = 'IP_APP_HH'
IP_APP_OS_HH = 'IP_APP_OS_HH'
APP_DAY_HH = 'APP_DAY_HH'


def ip_day_test(df, key=IP_DAY_TEST_HH):
    """Row count per (ip, day, IN_TEST_HH) group, using the default count dtype."""
    group_cols = ['ip', 'day', IN_TEST_HH]
    return aggregate_cols(df, group_cols, key, show_max=True)


def ip_day_hh(df, key=IP_DAY_HH):
    """Row count per (ip, day, hour) group, stored as uint16."""
    group_cols = ['ip', 'day', 'hour']
    return aggregate_cols(df, group_cols, key, agg_type='uint16', show_max=True)


def ip_os_hh(df, key=IP_OS_HH):
    """Row count per (ip, day, os, hour) group, stored as uint16."""
    group_cols = ['ip', 'day', 'os', 'hour']
    return aggregate_cols(df, group_cols, key, agg_type='uint16', show_max=True)


def ip_app_hh(df, key=IP_APP_HH):
    """Row count per (ip, day, app, hour) group, stored as uint16."""
    group_cols = ['ip', 'day', 'app', 'hour']
    return aggregate_cols(df, group_cols, key, agg_type='uint16', show_max=True)


def ip_app_os_hh(df, key=IP_APP_OS_HH):
    """Row count per (ip, day, app, os, hour) group, stored as uint16."""
    group_cols = ['ip', 'day', 'app', 'os', 'hour']
    return aggregate_cols(df, group_cols, key, agg_type='uint16', show_max=True)


def app_day_hh(df, key=APP_DAY_HH):
    """Row count per (app, day, hour) group, stored as uint16."""
    group_cols = ['app', 'day', 'hour']
    return aggregate_cols(df, group_cols, key, agg_type='uint16', show_max=True)

In [9]:
# Registry: feature key -> function that takes a DataFrame and returns the
# DataFrame with that feature added.
feature_config = {
    BASE_FEATURES: base_features,
    IN_TEST_HH: in_test_hh,
    IP_DAY_TEST_HH: ip_day_test,
    IP_DAY_HH: ip_day_hh,
    IP_OS_HH: ip_os_hh,
    IP_APP_HH: ip_app_hh,
    IP_APP_OS_HH: ip_app_os_hh,
    APP_DAY_HH: app_day_hh,
}

In [10]:
# Features to generate, in execution order. IN_TEST_HH comes before
# IP_DAY_TEST_HH because the latter groups by the IN_TEST_HH column.
applicable_features = [
    BASE_FEATURES,
    IN_TEST_HH,
    IP_DAY_TEST_HH,
    IP_DAY_HH,
    IP_OS_HH,
    IP_APP_HH,
    IP_APP_OS_HH,
    APP_DAY_HH,
]

In [11]:
def get_features(file):
    """Generate every applicable feature for `file` and save the combined result.

    Reads `file` as CSV, registers it as the file in progress, runs each
    feature function listed in ``applicable_features`` (looked up in
    ``feature_config``) in order, and finally writes the resulting frame to
    the 'all_features' CSV. Progress is printed along the way; nothing is
    returned.
    """
    import gc

    frame = pd.read_csv(file)
    set_file(file)

    for feature_key in applicable_features:
        print('generating ' + feature_key + '...')
        build = feature_config[feature_key]
        frame = build(frame)
        print(feature_file(feature_key))

    frame.to_csv(feature_file('all_features'), index=False)
    print(feature_file('all_features'))

    # Release the (potentially large) frame before reporting completion.
    del frame
    gc.collect()
    print('done')

In [112]:
# Build the feature files for the training sample, then the validation sample.
for input_file in (train_file, valid_file):
    get_features(input_file)


generating base_features...
deleting  eda/features/train/base_features.csv ...
eda/features/train/base_features.csv
generating IN_TEST_HH...
deleting  eda/features/train/in_test_hh.csv ...


KeyError: 'hour'

In [14]:
# Build the feature files for the test set.
test_file = 'input/test_parsed.csv'
get_features(test_file)

generating base_features...
eda/features/test/base_features.csv
generating IN_TEST_HH...


KeyError: 'hour'