In [1]:
import gc
import time
import numpy as np
import pandas as pd
from datetime import datetime

### Example for extend the features

In [22]:
d1 = {'card_id':[1, 2, 1, 3], 
      'A':[1, 2, 1, 2], 
      'B':[2, 1, 2, 2], 
      'C':[4, 5, 1, 5], 
      'D':[7, 5, 4, 8],}

numeric_cols = ['C', 'D']
category_cols = ['A', 'B']

t1 = pd.DataFrame(d1)
t1

Unnamed: 0,card_id,A,B,C,D
0,1,1,2,4,7
1,2,2,1,5,5
2,1,1,2,1,4
3,3,2,2,5,8


In [23]:
features = {}
card_all = t1['card_id'].values.tolist()
for card in card_all:
    features[card] = {}
features

{1: {}, 2: {}, 3: {}}

In [24]:
columns = t1.columns.tolist()
columns

['card_id', 'A', 'B', 'C', 'D']

In [25]:
idx = columns.index('card_id')
category_cols_index = [columns.index(col) for col in category_cols]
numeric_cols_index = [columns.index(col) for col in numeric_cols]
print(category_cols_index, numeric_cols_index)


[1, 2] [3, 4]


In [26]:
# Combine categorical and continuous features
for i in range(t1.shape[0]):
    va = t1.loc[i].values
    card = va[idx]
    for cate_ind in category_cols_index:
        for num_ind in numeric_cols_index:
            print(columns[cate_ind], va[cate_ind], columns[num_ind])
            col_name = '&'.join([columns[cate_ind], str(va[cate_ind]), columns[num_ind]])
            features[card][col_name] = features[card].get(col_name, 0) + va[num_ind]
features

A 1 C
A 1 D
B 2 C
B 2 D
A 2 C
A 2 D
B 1 C
B 1 D
A 1 C
A 1 D
B 2 C
B 2 D
A 2 C
A 2 D
B 2 C
B 2 D


{1: {'A&1&C': 5, 'A&1&D': 11, 'B&2&C': 5, 'B&2&D': 11},
 2: {'A&2&C': 5, 'A&2&D': 5, 'B&1&C': 5, 'B&1&D': 5},
 3: {'A&2&C': 5, 'A&2&D': 8, 'B&2&C': 5, 'B&2&D': 8}}

In [27]:
df = pd.DataFrame(features).T.reset_index()
cols = df.columns.tolist()
df.columns = ['card_id'] + cols[1:]
df

Unnamed: 0,card_id,A&1&C,A&1&D,B&2&C,B&2&D,A&2&C,A&2&D,B&1&C,B&1&D
0,1,5.0,11.0,5.0,11.0,,,,
1,2,,,,,5.0,5.0,5.0,5.0
2,3,,,5.0,8.0,5.0,8.0,,


This method of feature creation can efficiently capture hidden information in the dataset. However, it tends to generate a significant number of missing values, which should be considered in the subsequent modeling process due to the potential issue of a sparse feature matrix.

# Generate features combination

In [28]:
train = pd.read_csv('./dataset/preprocess/train_pre.csv')
test =  pd.read_csv('./dataset/preprocess/test_pre.csv')
transaction = pd.read_csv('./dataset/preprocess/transaction_d_pre.csv')

In [30]:
numeric_cols = ['purchase_amount', 'installments']

category_cols = ['authorized_flag', 'city_id', 'category_1',
       'category_3', 'merchant_category_id','month_lag','most_recent_sales_range',
                 'most_recent_purchases_range', 'category_4',
                 'purchase_month', 'purchase_hour_section', 'purchase_day']

id_cols = ['card_id', 'merchant_id']

In [32]:
# 创建字典用于保存数据
features = {}
card_all = train['card_id'].append(test['card_id']).values.tolist()
for card in card_all:
    features[card] = {}
     
# 标记不同类型字段的索引
columns = transaction.columns.tolist()
idx = columns.index('card_id')
category_cols_index = [columns.index(col) for col in category_cols]
numeric_cols_index = [columns.index(col) for col in numeric_cols]

# 记录运行时间
s = time.time()
num = 0

# 执行循环，并在此过程中记录时间
for i in range(transaction.shape[0]):
    va = transaction.loc[i].values
    card = va[idx]
    for cate_ind in category_cols_index:
        for num_ind in numeric_cols_index:
            col_name = '&'.join([columns[cate_ind], str(va[cate_ind]), columns[num_ind]])
            features[card][col_name] = features[card].get(col_name, 0) + va[num_ind]
    num += 1
    if num%1000000==0:
        print(time.time()-s, "s")
del transaction
gc.collect()

  card_all = train['card_id'].append(test['card_id']).values.tolist()


105.268230676651 s
209.76702666282654 s
314.1025788784027 s
418.27787947654724 s
523.1293902397156 s
628.3903036117554 s
731.7226657867432 s
836.4218597412109 s
942.1538763046265 s
1047.1697945594788 s
1151.4360394477844 s
1256.1012179851532 s
1360.415964603424 s
1464.6962883472443 s
1567.9305574893951 s
1670.964239358902 s
1774.5150382518768 s
1878.2272095680237 s
1981.8261063098907 s
2085.841588497162 s
2189.4571118354797 s
2292.7393362522125 s
2395.8548493385315 s
2498.587147951126 s
2601.8525953292847 s
2705.001594543457 s
2808.0665283203125 s
2911.482572078705 s
3014.3314888477325 s
3117.380373954773 s
3220.5958938598633 s


584

In [33]:
# 字典转dataframe
df = pd.DataFrame(features).T.reset_index()
del features
cols = df.columns.tolist()
df.columns = ['card_id'] + cols[1:]

# 生成训练集与测试集
train = pd.merge(train, df, how='left', on='card_id')
test =  pd.merge(test, df, how='left', on='card_id')
del df
train.to_csv("./dataset/preprocess/train_dict.csv", index=False)
test.to_csv("./dataset/preprocess/test_dict.csv", index=False)

gc.collect()

0

# Generate features based on statistical results

We can generate new featuers by their statistical results, such as sum, mean, varience. In this way, the new features are not likely contain so many NAN values.

In [34]:
transaction = pd.read_csv('./dataset/preprocess/transaction_g_pre.csv')

In [35]:
numeric_cols = ['authorized_flag',  'category_1', 'installments',
       'category_3',  'month_lag','purchase_month','purchase_day','purchase_day_diff', 'purchase_month_diff',
       'purchase_amount', 'category_2', 
       'purchase_month', 'purchase_hour_section', 'purchase_day',
       'most_recent_sales_range', 'most_recent_purchases_range', 'category_4']
categorical_cols = ['city_id', 'merchant_category_id', 'merchant_id', 'state_id', 'subsector_id']

In [36]:
aggs = {}

for col in numeric_cols:
    aggs[col] = ['nunique', 'mean', 'min', 'max', 'var', 'skew', 'sum']
for col in categorical_cols:
    aggs[col] = ['nunique']
aggs['card_id'] = ['size', 'count']
cols = ['card_id']

for key in aggs.keys():
    cols.extend([key+'_'+stat for stat in aggs[key]])

df = transaction[transaction['month_lag'] < 0].groupby('card_id').agg(aggs).reset_index()
df.columns = cols[:1] + [co+'_hist' for co in cols[1:]]

df2 = transaction[transaction['month_lag'] >= 0].groupby('card_id').agg(aggs).reset_index()
df2.columns = cols[:1] + [co+'_new' for co in cols[1:]]
df = pd.merge(df, df2, how='left', on='card_id')

df2 = transaction.groupby('card_id').agg(aggs).reset_index()
df2.columns = cols
df = pd.merge(df, df2, how='left',on='card_id')
del transaction
gc.collect()

train = pd.merge(train, df, how='left', on='card_id')
test =  pd.merge(test, df, how='left', on='card_id')
del df
train.to_csv("./dataset/preprocess/train_groupby.csv", index=False)
test.to_csv("./dataset/preprocess/test_groupby.csv", index=False)

gc.collect()

0

# Merge two kinds of new features

Up to this point, we have completed the feature extraction from two different perspectives. However, as of now, the features from the two sets of solutions are still stored in separate data files. We need to merge them to proceed with modeling. The merging process is relatively straightforward; you just need to horizontally concatenate "train_dict" (or "test_dict") and "train_group" (or "test_group") based on the "card_id" column and then remove any duplicate columns.

In [37]:
train_dict = pd.read_csv("./dataset/preprocess/train_dict.csv")
test_dict = pd.read_csv("./dataset/preprocess/test_dict.csv")
train_groupby = pd.read_csv("./dataset/preprocess/train_groupby.csv")
test_groupby = pd.read_csv("./dataset/preprocess/test_groupby.csv")

In [38]:
for co in train_dict.columns:
    if co in train_groupby.columns and co!='card_id':
        del train_groupby[co]
for co in test_dict.columns:
    if co in test_groupby.columns and co!='card_id':
        del test_groupby[co]

In [39]:
train_dict.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,authorized_flag&1&purchase_amount,authorized_flag&1&installments,city_id&19&purchase_amount,city_id&19&installments,...,merchant_category_id&789&purchase_amount,merchant_category_id&789&installments,merchant_category_id&687&purchase_amount,merchant_category_id&687&installments,merchant_category_id&220&purchase_amount,merchant_category_id&220&installments,merchant_category_id&322&purchase_amount,merchant_category_id&322&installments,merchant_category_id&603&purchase_amount,merchant_category_id&603&installments
0,67,C_ID_92a2005557,5,2,1,-0.820283,-170.641218,0.0,-1.422815,0.0,...,,,,,,,,,,
1,62,C_ID_3d0044924f,4,1,0,0.392913,-213.239185,507.0,-4.782308,7.0,...,,,,,,,,,,
2,57,C_ID_d639edf6cd,2,2,0,0.688056,-28.528749,0.0,-0.705405,0.0,...,,,,,,,,,,
3,70,C_ID_186d6a6901,4,3,0,0.142495,-54.145736,89.0,-0.707839,1.0,...,,,,,,,,,,
4,72,C_ID_cdbd2c0db2,1,3,0,-0.159749,-88.966702,179.0,,,...,,,,,,,,,,


Missing values will be filled with 0 in this case. It's important to note that these missing values are not truly missing but rather represent values that were not accounted for during the feature creation process. These values can be logically interpreted as 0. Therefore, filling missing values with 0 here is essentially a data completion step.

In [40]:
train = pd.merge(train_dict, train_groupby, how='left', on='card_id').fillna(0)
test = pd.merge(test_dict, test_groupby, how='left', on='card_id').fillna(0)

In [41]:
train.to_csv("./dataset/preprocess/train.csv", index=False)
test.to_csv("./dataset/preprocess/test.csv", index=False)

del train_dict, test_dict, train_groupby, test_groupby
gc.collect()

0