# Feature Engineering and Selection

This feature selection notebook does a filter followed by a wrapper for a binary dependent variable (binary classification). It's capable of doing the filter on more than one file. The variable files are called vars1.csv, vars2.csv ... Or you can make the input file name(s) anything you want.

The filter runs separately on each vars file and keeps the top num_filter variables from each file. If there is more than one vars file, we then select the top num_filter variables across all the vars.csv files.

If balance = 0 the entire file is used. If balance != 0 then balance is the RATIO OF GOODS TO BADS retained for the rest of the feature selection: we keep all of the rare class (bads) and downsample the goods to at most balance times the number of bads. I think in general it's better to keep balance = 0.

In [None]:
#pip install mlxtend  

In [None]:
#pip install wheel

In [1]:
conda install lightgbm

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 23.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
import datetime as dt
import gc
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from lightgbm import LGBMClassifier
%matplotlib inline
start_time = dt.datetime.now()

In [77]:
# Parameters for the whole notebook. Edit them here, then re-run from the top.

# Number of variable files the filter loop iterates over.
num_files = 1
# Number of top-ranked variables the filter keeps.
# Recommended: about 10 to 20% of the original number of variables.
num_filter = 200
# Number of variables the wrapper (sequential feature selector) selects.
# Recommended: start at about 50 and look for the point where model performance saturates as variables are added,
# then re-run with num_wrapper slightly above that saturation point (no more than about twice it).
num_wrapper = 20
# 0 = use every record. Otherwise the goods are downsampled to at most num_bads * balance records
# and all bads are kept.
balance = 0
# Default cutoff for the FDR score: the fraction of highest-scored records that is examined.
detect_rate = .03
# Column used as the DataFrame index.
index_name = 'Recnum'
# Name of the binary target column.
y_name = 'Fraud'
# Target values that identify the two classes when the data is split into goods and bads.
good_label = 0
bad_label = 1

## Run a filter on all the files

In [82]:
file_name = 'candidate_variables.csv'
df = pd.read_csv(file_name)

In [83]:
df.head()

Unnamed: 0,Recnum,Fraud,Dow_Risk,state_risk,benford_Cardnum,benford_Merchnum,Cardnum_day_since,Cardnum_count_0,Cardnum_avg_0,Cardnum_max_0,...,Card_Merchnum_desc_count_0_by_7_sq,Card_Merchnum_desc_count_0_by_14_sq,Card_Merchnum_desc_count_0_by_30_sq,Card_Merchnum_desc_count_0_by_60_sq,Card_Merchnum_desc_count_1_by_7_sq,Card_Merchnum_desc_count_1_by_14_sq,Card_Merchnum_desc_count_1_by_30_sq,Card_Merchnum_desc_count_1_by_60_sq,amount_cat,foreign
0,1,0,0.025994,0.00797,1.000894,1.000894,1461.0,1,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False
1,2,0,0.025994,0.003304,1.000894,1.000894,1461.0,1,31.42,31.42,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,2,False
2,3,0,0.025994,0.017282,1.000894,1.000894,1461.0,1,178.49,178.49,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,3,False
3,4,0,0.025994,0.00797,1.000894,1.010684,1461.0,1,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False
4,5,0,0.025994,0.00797,1.010684,1.031246,0.0,2,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False


In [84]:
print("********** working on",file_name,"size is",df.shape)
df = df.set_index(index_name) 

********** working on candidate_variables.csv size is (96397, 1425)


In [85]:
df.head()

Unnamed: 0_level_0,Fraud,Dow_Risk,state_risk,benford_Cardnum,benford_Merchnum,Cardnum_day_since,Cardnum_count_0,Cardnum_avg_0,Cardnum_max_0,Cardnum_med_0,...,Card_Merchnum_desc_count_0_by_7_sq,Card_Merchnum_desc_count_0_by_14_sq,Card_Merchnum_desc_count_0_by_30_sq,Card_Merchnum_desc_count_0_by_60_sq,Card_Merchnum_desc_count_1_by_7_sq,Card_Merchnum_desc_count_1_by_14_sq,Card_Merchnum_desc_count_1_by_30_sq,Card_Merchnum_desc_count_1_by_60_sq,amount_cat,foreign
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.025994,0.00797,1.000894,1.000894,1461.0,1,3.62,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False
2,0,0.025994,0.003304,1.000894,1.000894,1461.0,1,31.42,31.42,31.42,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,2,False
3,0,0.025994,0.017282,1.000894,1.000894,1461.0,1,178.49,178.49,178.49,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,3,False
4,0,0.025994,0.00797,1.000894,1.010684,1461.0,1,3.62,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False
5,0,0.025994,0.00797,1.010684,1.031246,0.0,2,3.62,3.62,3.62,...,0.020408,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,1,False


In [86]:
# Pre-processing:
#   1. drop the warm-up records whose variables are not well formed yet,
#   2. add a pure-noise control column,
#   3. split the records by class for the univariate filter.

# file_name = "vars"+str(i+1)+'.csv'
# df = df[df.index <= 84300] # remove the last two months as the out-of-time data (OOT)

# .copy() makes the filtered frame independent of the loaded one, so adding a column
# below does not write into a slice (no SettingWithCopyWarning).
df = df[df.index >= 2995].copy() # remove the first 2 weeks of records since their variables aren't well formed

# Noise column used as a sanity check: it should never come up as important.
# A fixed seed makes the column identical on every run, so re-running this cell is repeatable.
df['RANDOM'] = np.random.default_rng(1).random(len(df))

goods = df[df[y_name] == good_label]
bads = df[df[y_name] == bad_label]
# del df # don't need this file anymore

In [87]:
df.head()

Unnamed: 0_level_0,Fraud,Dow_Risk,state_risk,benford_Cardnum,benford_Merchnum,Cardnum_day_since,Cardnum_count_0,Cardnum_avg_0,Cardnum_max_0,Cardnum_med_0,...,Card_Merchnum_desc_count_0_by_14_sq,Card_Merchnum_desc_count_0_by_30_sq,Card_Merchnum_desc_count_0_by_60_sq,Card_Merchnum_desc_count_1_by_7_sq,Card_Merchnum_desc_count_1_by_14_sq,Card_Merchnum_desc_count_1_by_30_sq,Card_Merchnum_desc_count_1_by_60_sq,amount_cat,foreign,RANDOM
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2995,0,0.018626,0.000958,1.003215,1.073374,1.0,1,174.2,174.2,174.2,...,0.001701,0.00037,9.3e-05,0.006803,0.001701,0.00037,9.3e-05,3,False,0.045147
2996,0,0.018626,0.01344,1.202258,1.02144,1.0,1,250.0,250.0,250.0,...,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,4,False,0.433471
2997,0,0.018626,0.00115,1.012697,1.200501,2.0,1,53.88,53.88,53.88,...,0.002551,0.000556,0.000139,0.010204,0.002551,0.000556,0.000139,2,False,0.287434
2998,0,0.018626,0.012689,1.056532,2.072316,1.0,1,1269.0,1269.0,1269.0,...,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,5,False,0.426328
2999,0,0.018626,0.00115,1.039118,1.004553,2.0,1,288.38,288.38,288.38,...,0.005102,0.001111,0.000278,0.020408,0.005102,0.001111,0.000278,4,False,0.555681


In [88]:
%%time
filter_score_df_list = []

for i in range(num_files):

    num_goods = len(goods)
    num_bads = len(bads)
    num_vars = len(bads.columns)-2
    if(balance != 0):
        if(i == 0):
            num_goods_desired = int(min(num_goods,num_bads*balance))
            goods = goods.sample(n=num_goods_desired,random_state=1)
            goods_keep = list(goods.index)
            goods_keep.sort()
    
        if(i > 0):
            goods = goods.loc[goods_keep] 
            
    df_sampled = pd.concat([goods,bads])
    df_sampled.sort_index(inplace=True)
    filter_score = pd.DataFrame(np.zeros((num_vars+1, 2)))
    filter_score.columns = ['variable','filter score']   
    j = 0
    for column in df_sampled:
        filter_score.loc[j,'variable'] = column
        filter_score.loc[j,'filter score'] = sps.ks_2samp(goods[column],bads[column])[0]
        j = j+1
        if j%100 == 0:
            print(j)

    filter_score.sort_values(by=['filter score'], ascending=False, inplace=True)
    vars_keep = list(filter_score['variable'][1:num_filter+1]) 
    print(file_name, filter_score.head(21))
    if(i == 0): # if first time through need to initialize some stuff
        Y = pd.DataFrame(df_sampled[y_name], index=df_sampled.index)
        df_top = df_sampled.filter(vars_keep, axis=1)
            
    if(i > 0): # if more than one variable file we use this loop
        data_new_top = df_sampled.filter(vars_keep, axis=1)
        df_top = pd.concat([df_top, data_new_top], axis=1)

    filter_score_df_list.append(filter_score)
    
    del goods # delete these before starting the next file, if any
    del bads
    gc.collect()
    
filter_score = pd.concat(filter_score_df_list)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
candidate_variables.csv                        variable  filter score
0                         Fraud      1.000000
421           card_zip3_total_7      0.676549
229            card_zip_total_7      0.666816
412           card_zip3_total_3      0.660260
430          card_zip3_total_14      0.659257
238           card_zip_total_14      0.652244
220            card_zip_total_3      0.652217
165          card_merch_total_7      0.637702
247           card_zip_total_30      0.637171
419             card_zip3_max_7      0.630957
156          card_merch_total_3      0.630782
439          card_zip3_total_30      0.630295
174         card_merch_total_14      0.630048
428            card_zip3_max_14      0.629515
236             card_zip_max_14      0.627930
227              card_zip_max_7      0.625088
245             card_zip_max_30      0.624168
485      Card_Merchdesc_total_7      0.621818
256           card_zip_total_60      0.61

In [89]:
# Rank all scored variables from best to worst and renumber the rows 0..n-1.
filter_score = (
    filter_score
    .sort_values(by='filter score', ascending=False)
    .reset_index(drop=True)
)

In [90]:
filter_score.head(31)

Unnamed: 0,variable,filter score
0,Fraud,1.0
1,card_zip3_total_7,0.676549
2,card_zip_total_7,0.666816
3,card_zip3_total_3,0.66026
4,card_zip3_total_14,0.659257
5,card_zip_total_14,0.652244
6,card_zip_total_3,0.652217
7,card_merch_total_7,0.637702
8,card_zip_total_30,0.637171
9,card_zip3_max_7,0.630957


In [91]:
filter_score.tail(10)

Unnamed: 0,variable,filter score
1415,card_merch_unique_count_for_zip3_7,0.000953
1416,Card_Merchdesc_unique_count_for_zip3_3,0.000888
1417,card_merch_unique_count_for_zip3_3,0.000823
1418,Card_Merchnum_desc_unique_count_for_zip3_7,0.000801
1419,Card_Merchdesc_unique_count_for_zip3_1,0.000704
1420,card_merch_unique_count_for_zip3_1,0.000693
1421,Card_Merchnum_desc_unique_count_for_zip3_3,0.000671
1422,Merchnum_desc_unique_count_for_zip3_3,0.000667
1423,Card_Merchnum_desc_unique_count_for_zip3_1,0.000585
1424,card_merch_unique_count_for_Cardnum_1,0.0


In [92]:
filter_score.shape

(1425, 2)

In [93]:
# Save the head of the ranking for inspection.
filter_score.head(80).to_csv('filter_top.csv')
# Keep exactly the num_filter best-scoring variables. The target column is removed by
# name (it is scored once per file), so the saved list has the same length as the
# number of columns that can be kept downstream.
ranked_vars = filter_score.loc[filter_score['variable'] != y_name, 'variable']
vars_keep = list(ranked_vars.head(num_filter))
print(i,' vars_keep:',vars_keep)

0  vars_keep: ['card_zip3_total_7', 'card_zip_total_7', 'card_zip3_total_3', 'card_zip3_total_14', 'card_zip_total_14', 'card_zip_total_3', 'card_merch_total_7', 'card_zip_total_30', 'card_zip3_max_7', 'card_merch_total_3', 'card_zip3_total_30', 'card_merch_total_14', 'card_zip3_max_14', 'card_zip_max_14', 'card_zip_max_7', 'card_zip_max_30', 'Card_Merchdesc_total_7', 'card_zip_total_60', 'Card_Merchnum_desc_total_7', 'card_zip3_total_1', 'card_merch_total_30', 'card_zip3_max_30', 'Card_Merchdesc_total_14', 'Card_Merchnum_desc_total_14', 'card_zip3_total_60', 'card_zip_total_1', 'Card_Merchdesc_total_3', 'Card_Merchnum_desc_total_3', 'Card_Merchdesc_total_30', 'card_zip3_max_3', 'Card_Merchnum_desc_total_30', 'card_merch_max_14', 'card_zip_max_60', 'card_merch_max_30', 'Card_Merchdesc_max_14', 'card_merch_max_7', 'Card_Merchnum_desc_max_14', 'Card_Merchdesc_max_30', 'Card_Merchdesc_max_7', 'Card_Merchnum_desc_max_30', 'card_zip_max_3', 'card_merch_total_60', 'Card_Merchnum_desc_max_7',

In [94]:
# Restrict the per-file top variables to the final filter list.
df_keep = df_top.filter(items=vars_keep, axis=1)

# Persist the list of kept variable names as a one-column CSV.
vars_keep_df = pd.DataFrame({'col': vars_keep})
vars_keep_df.to_csv('vars_keep_filter.csv', index=False)

df_keep.head()

Unnamed: 0_level_0,card_zip3_total_7,card_zip_total_7,card_zip3_total_3,card_zip3_total_14,card_zip_total_14,card_zip_total_3,card_merch_total_7,card_zip_total_30,card_zip3_max_7,card_merch_total_3,...,Cardnum_max_60,Merchnum_desc_max_7,Card_Merchdesc_med_14,Cardnum_med_0,Merchnum_max_7,merch_zip_max_7,card_zip3_med_14,amount_cat,merch_zip_med_0,Merchnum_med_0
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2995,1023.04,1023.04,174.2,1023.04,1023.04,174.2,1023.04,1023.04,792.26,174.2,...,792.26,2454.99,174.2,174.2,2454.99,2454.99,174.2,3,174.2,174.2
2996,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,...,1000.0,250.0,250.0,250.0,250.0,250.0,250.0,4,250.0,250.0
2997,76.85,76.85,76.85,76.85,76.85,76.85,76.85,76.85,53.88,76.85,...,1377.0,1097.6,38.425,53.88,1097.6,1097.6,38.425,2,53.88,53.88
2998,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,1269.0,...,1269.0,4666.0,1269.0,1269.0,4666.0,4666.0,1269.0,5,1269.0,1269.0
2999,288.38,288.38,288.38,288.38,288.38,288.38,288.38,288.38,288.38,288.38,...,2288.99,601.52,288.38,288.38,601.52,601.52,288.38,4,288.38,288.38


In [95]:
df_keep.shape

(93409, 200)

In [96]:
Y.head()

Unnamed: 0_level_0,Fraud
Recnum,Unnamed: 1_level_1
2995,0
2996,0
2997,0
2998,0
2999,0


In [97]:
# Convert the one-column label frame to a NumPy array of shape (n_rows, 1).
# np.asarray returns the same values as .values and is a no-op on an array,
# so this cell can be re-run without failing.
Y = np.asarray(Y)
# Independent copy of the labels.
Y_save = Y.copy()

In [98]:
# Y = np.array(Y)
X = df_keep
print(Y)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [99]:
print('time to here:', dt.datetime.now() - start_time)

time to here: 6:58:41.054181


In [100]:
print(X.shape,Y.shape)

(93409, 200) (93409, 1)


In [101]:
print(type(X),type(Y))

<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


In [102]:
# I'd like to define a scoring for the wrapper that's KS, but I haven't gotten around to this yet.
# def KSscore(classifier, x,y)
#     goods = 

In [103]:
def fdr(classifier, x, y, cutoff=detect_rate):
    """Fraud detection rate of `classifier` on the dataset (x, y).

    Has the (estimator, X, y) signature so it can be passed as `scoring`.
    Asks the classifier for class probabilities on `x` and hands them, with the
    true labels `y` and the `cutoff` fraction, to `fdr_prob`.
    """
    predicted_proba = classifier.predict_proba(x)
    return fdr_prob(y=y, y_prob=predicted_proba, cutoff=cutoff)

def fdr_prob(y, y_prob, cutoff=detect_rate):
    """Fraud detection rate (FDR) at a cutoff.

    FDR = (# frauds among the top `cutoff` fraction of records, ranked by
    predicted fraud probability) / (# frauds in total).

    Parameters
    ----------
    y : array-like
        True labels, 1 marking a fraud. May be 1-D or a single-column 2-D array.
    y_prob : array-like
        Predicted probabilities. If 2-D (one column per class), only the last
        column is used.
    cutoff : float
        Fraction of the records examined, counted from the highest score down.

    Returns
    -------
    float
        The detection rate, or 0.0 when `y` contains no frauds.
    """
    y_true = np.asarray(y).ravel()
    scores = np.asarray(y_prob)
    if scores.ndim != 1:          # one column per class: keep the last one (fraud)
        scores = scores[:, -1]

    num_fraud = np.count_nonzero(y_true == 1)
    if num_fraud == 0:            # rate is undefined without frauds; avoid dividing by zero
        return 0.0

    # Rank from high to low score. A stable sort keeps tied scores in their original
    # order, which matters because a small ensemble produces many identical probabilities.
    order = np.argsort(-scores, kind='stable')
    num_examined = int(len(y_true) * cutoff)
    frauds_caught = np.count_nonzero(y_true[order[:num_examined]] == 1)
    return frauds_caught / num_fraud

## Run a wrapper on the remaining top variables

In [None]:
%%time
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='once')
pd.options.mode.chained_assignment = None
# If you're doing forward selection it's enough to stop at num_wrapper variables. 
# If you're doing backward selection you need to go through all the variables to get a sorted list of num_wrapper variables.

# I can't figure out how to get rid of this annoying warning! I don't know what I'm doing wrong...

nfeatures = len(X.columns)
clf = RandomForestClassifier(n_estimators=5) # simple, fast nonlinear model for the wrapper
# clf = LGBMClassifier(n_estimators=40,num_leaves=4) # simple, fast nonlinear model for the wrapper
sfs = SFS(clf,k_features=num_wrapper,forward=True,verbose=0,scoring=fdr,cv=4,n_jobs=-1) # use for forward selection
# sfs = SFS(clf,k_features=1,forward=False,verbose=0,scoring=fdr,cv=4,n_jobs=-1) # use for backward selection
sfs.fit(X,Y)

In [80]:
print('time to here:', dt.datetime.now() - start_time)

time to here: 6:57:42.325493


In [None]:
# Plot the wrapper's cross-validated score against the number of selected variables
# and save the figure to performance_nvars.png.
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# kind='std_dev' selects the standard-deviation display of the per-step scores
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev', figsize=(15, 6))
# plt.xticks(np.arange(0, len(X.columns), step=5))
# x axis: one tick every 5 variables, limited to the num_wrapper steps that were run
plt.xticks(np.arange(0, num_wrapper, step=5))
# y axis: ticks every 0.1; the visible range is then narrowed to 0.5-0.8
plt.yticks(np.arange(0,1,step=.1))
plt.ylim([.5, .8])
plt.xlim(0,num_wrapper)
plt.title('Forward Stepwise Selection')
plt.grid()
# save before show()
plt.savefig('performance_nvars.png')
plt.show()

In [None]:
vars_FS = pd.DataFrame.from_dict(sfs.get_metric_dict()).T

In [32]:
# Work out which variable was added at each wrapper step.
# Rows are labelled 1..n by step; 'feature_names' holds the variables selected so far.
ordered_vars_FS = vars_FS.copy()
for step in range(1, len(ordered_vars_FS) + 1):
    ordered_vars_FS.loc[step, 'add variables in this order'] = int(step)
    selected_now = ordered_vars_FS.loc[step, 'feature_names']
    if step == 1:
        # the first step has exactly one variable
        added = list(selected_now)[0]
    else:
        # the variable present at this step but not at the previous one
        selected_before = ordered_vars_FS.loc[step - 1, 'feature_names']
        added = list(set(selected_now) - set(selected_before))[0]
    ordered_vars_FS.loc[step, 'variable name'] = added
# If 'variable name' ever ends up holding a one-element list instead of a string,
# take its first element.

In [33]:
ordered_vars_FS

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err,add variables in this order,variable name
1,"(11,)","[0.6908396946564885, 0.5670498084291188, 0.709...",0.595961,"(card_merch_total_14,)",0.188308,0.117474,0.067823,1.0,card_merch_total_14
2,"(11, 12)","[0.7404580152671756, 0.6360153256704981, 0.744...",0.660912,"(card_merch_total_14, card_zip3_max_14)",0.145474,0.090752,0.052396,2.0,card_zip3_max_14
3,"(11, 12, 134)","[0.7251908396946565, 0.7164750957854407, 0.744...",0.676256,"(card_merch_total_14, card_zip3_max_14, zip3_a...",0.14635,0.091299,0.052711,3.0,zip3_actual/avg_60
4,"(11, 12, 134, 180)","[0.7557251908396947, 0.7241379310344828, 0.744...",0.693439,"(card_merch_total_14, card_zip3_max_14, zip3_a...",0.134332,0.083801,0.048383,4.0,Card_Merchdesc_med_7
5,"(11, 12, 76, 134, 180)","[0.6984732824427481, 0.7394636015325671, 0.759...",0.695362,"(card_merch_total_14, card_zip3_max_14, Cardnu...",0.108961,0.067974,0.039245,5.0,Cardnum_total_14
6,"(11, 12, 25, 76, 134, 180)","[0.7900763358778626, 0.735632183908046, 0.7366...",0.710626,"(card_merch_total_14, card_zip3_max_14, card_z...",0.125805,0.078482,0.045312,6.0,card_zip_total_1
7,"(11, 12, 25, 64, 76, 134, 180)","[0.7824427480916031, 0.7432950191570882, 0.759...",0.717312,"(card_merch_total_14, card_zip3_max_14, card_z...",0.125404,0.078232,0.045167,7.0,card_merch_total_0
8,"(11, 12, 25, 64, 76, 134, 180, 199)","[0.7900763358778626, 0.7394636015325671, 0.755...",0.717309,"(card_merch_total_14, card_zip3_max_14, card_z...",0.126831,0.079122,0.045681,8.0,Merchnum_med_0
9,"(11, 12, 25, 64, 76, 134, 157, 180, 199)","[0.7900763358778626, 0.735632183908046, 0.7748...",0.722076,"(card_merch_total_14, card_zip3_max_14, card_z...",0.128294,0.080035,0.046208,9.0,card_zip3_med_3
10,"(11, 12, 25, 64, 76, 133, 134, 157, 180, 199)","[0.7900763358778626, 0.7509578544061303, 0.774...",0.725907,"(card_merch_total_14, card_zip3_max_14, card_z...",0.129767,0.080954,0.046739,10.0,Card_Merchnum_desc_avg_1


In [34]:
ordered_vars_FS.to_csv('Wrapper_selection_info.csv', index=False)

In [35]:
# Variables chosen by the wrapper, in the order they were added.
# Note: vars_keep is rebound here from the filter's list of names to a pandas Series.
vars_keep = ordered_vars_FS['variable name']
# The same names as a plain Python list.
vars_keep_list = ordered_vars_FS['variable name'].tolist()
# Write the names to disk without the step index.
vars_keep.to_csv('final_vars_list.csv',index=False)
vars_keep

1          card_merch_total_14
2             card_zip3_max_14
3           zip3_actual/avg_60
4         Card_Merchdesc_med_7
5             Cardnum_total_14
6             card_zip_total_1
7           card_merch_total_0
8               Merchnum_med_0
9              card_zip3_med_3
10    Card_Merchnum_desc_avg_1
11             merch_zip_med_0
12    Card_Merchnum_desc_avg_3
13        Card_Merchdesc_avg_3
14    Card_Merchnum_desc_med_1
15        Card_Merchdesc_med_3
16        Card_Merchdesc_med_1
17    Card_Merchnum_desc_med_3
18           card_zip_total_14
19         Merchnum_desc_avg_0
20              card_zip_med_3
Name: variable name, dtype: object

In [36]:
# Index the filter scores by variable name and remove the target's own row.
# Both steps are safe to repeat: the index is only set while 'variable' is still a
# column, and the target is dropped by name (a no-op once it is gone).
if 'variable' in filter_score.columns:
    filter_score = filter_score.set_index('variable')
filter_score = filter_score.drop(index=y_name, errors='ignore')
filter_score

Unnamed: 0_level_0,filter score
variable,Unnamed: 1_level_1
card_zip3_total_7,0.676549
card_zip_total_7,0.666816
card_zip3_total_3,0.660260
card_zip3_total_14,0.659257
card_zip_total_14,0.652244
...,...
card_merch_unique_count_for_zip3_1,0.000693
Card_Merchnum_desc_unique_count_for_zip3_3,0.000671
Merchnum_desc_unique_count_for_zip3_3,0.000667
Card_Merchnum_desc_unique_count_for_zip3_1,0.000585


In [37]:
# Empty frame indexed by the wrapper-selected variable names, in wrapper order,
# ready to be joined with the filter scores.
vars_keep_sorted = pd.DataFrame({'variable': vars_keep_list}).set_index('variable')
vars_keep_sorted.head()

card_merch_total_14
card_zip3_max_14
zip3_actual/avg_60
Card_Merchdesc_med_7
Cardnum_total_14


In [38]:
vars_keep_sorted = pd.concat([vars_keep_sorted,filter_score],axis=1,join='inner')

In [39]:
# First reset turns the 'variable' index into a column; the second one adds the
# 0-based row position as a column named 'index'.
vars_keep_sorted = vars_keep_sorted.reset_index().reset_index()
# Convert the position to a 1-based rank and give it a descriptive name.
vars_keep_sorted['index'] += 1
vars_keep_sorted = vars_keep_sorted.rename(columns={'index': 'wrapper order'})
vars_keep_sorted.to_csv('vars_keep_sorted.csv', index=False)
vars_keep_sorted

Unnamed: 0,wrapper order,variable,filter score
0,1,card_merch_total_14,0.630048
1,2,card_zip3_max_14,0.629515
2,3,zip3_actual/avg_60,0.511141
3,4,Card_Merchdesc_med_7,0.489783
4,5,Cardnum_total_14,0.534929
5,6,card_zip_total_1,0.610773
6,7,card_merch_total_0,0.548902
7,8,Merchnum_med_0,0.471259
8,9,card_zip3_med_3,0.498349
9,10,Card_Merchnum_desc_avg_1,0.511187


In [40]:
# Add the record id and the target to the list of columns to export.
# Only append a name that is missing, so re-running this cell does not create
# duplicate entries (duplicates would turn into duplicate columns when filtering).
for required_column in (index_name, y_name):
    if required_column not in vars_keep_list:
        vars_keep_list.append(required_column)
vars_keep_list

['card_merch_total_14',
 'card_zip3_max_14',
 'zip3_actual/avg_60',
 'Card_Merchdesc_med_7',
 'Cardnum_total_14',
 'card_zip_total_1',
 'card_merch_total_0',
 'Merchnum_med_0',
 'card_zip3_med_3',
 'Card_Merchnum_desc_avg_1',
 'merch_zip_med_0',
 'Card_Merchnum_desc_avg_3',
 'Card_Merchdesc_avg_3',
 'Card_Merchnum_desc_med_1',
 'Card_Merchdesc_med_3',
 'Card_Merchdesc_med_1',
 'Card_Merchnum_desc_med_3',
 'card_zip_total_14',
 'Merchnum_desc_avg_0',
 'card_zip_med_3',
 'Recnum',
 'Fraud']

In [41]:
filter_score

Unnamed: 0_level_0,filter score
variable,Unnamed: 1_level_1
card_zip3_total_7,0.676549
card_zip_total_7,0.666816
card_zip3_total_3,0.660260
card_zip3_total_14,0.659257
card_zip_total_14,0.652244
...,...
card_merch_unique_count_for_zip3_1,0.000693
Card_Merchnum_desc_unique_count_for_zip3_3,0.000671
Merchnum_desc_unique_count_for_zip3_3,0.000667
Card_Merchnum_desc_unique_count_for_zip3_1,0.000585


In [42]:
%%time
df = pd.read_csv(file_name)
df.shape

CPU times: user 15.3 s, sys: 2.37 s, total: 17.7 s
Wall time: 18.5 s


(96397, 1425)

In [43]:
# Keep only the columns named in vars_keep_list from the freshly re-read file.
# df_keep is rebound here; it no longer refers to the filter-stage frame.
df_keep = df.filter(vars_keep_list, axis=1)
# df_keep = df[df.index.isin(vars_keep_list)]
print(df_keep.shape)

(96397, 22)


In [44]:
df_keep.to_csv('vars_final.csv',index=False)

In [45]:
print("duration: ", dt.datetime.now() - start_time)

duration:  1:14:33.360299
