In [1]:
########################################################
# Author:                   
# Date: Nov.8, 2018         
# Version: v3            
# Notice:      
# 1. FeatureExtract
# 2. directories structure
#   code/
#   model/
#   feature/
#   answer/
#   train/
#   A/
#   MAKE SURE ALL OF THESE DIRECTORIES EXIST!!!
# 3. 特征均存储在../feature目录
# 
# 4. 最终的特征汇总在features_all_train_test_20181106.csv
#    包含所有train和A的数据
#########################################################


import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# Directory prefixes shared by every path defined below.
TRAIN_PREFIX = "../train/"
TEST_PREFIX = "../B/"
FEATURE_PREFIX = "../feature/"

# Encoding passed to file writers.
FILE_ENCODER = "utf-8"

# Training-set input files.
BANK_DETAIL_TRAIN_FILE = "{}bank_detail_train.csv".format(TRAIN_PREFIX)
BILL_DETAIL_TRAIN_FILE = "{}bill_detail_train.csv".format(TRAIN_PREFIX)
BROWSE_HISTORY_TRAIN_FILE = "{}browse_history_train.csv".format(TRAIN_PREFIX)
LOANTIME_TRAIN_FILE = "{}loantime_train.csv".format(TRAIN_PREFIX)
OVERDUE_TRAIN_FILE = "{}overdue_train.csv".format(TRAIN_PREFIX)
USERINFO_TRAIN_FILE = "{}userinfo_train.csv".format(TRAIN_PREFIX)

# Test-set input files.
BANK_DETAIL_TEST_FILE = "{}bank_detail_B.csv".format(TEST_PREFIX)
BILL_DETAIL_TEST_FILE = "{}bill_detail_B.csv".format(TEST_PREFIX)
BROWSE_HISTORY_TEST_FILE = "{}browse_history_B.csv".format(TEST_PREFIX)
LOANTIME_TEST_FILE = "{}loantime_B.csv".format(TEST_PREFIX)
USERINFO_TEST_FILE = "{}userinfo_B.csv".format(TEST_PREFIX)
# Kept under the feature directory because the A directory has no write permission.
USERID_TO_PREDICT_FILE = "{}to_predict_userid.csv".format(FEATURE_PREFIX)



In [2]:
# Sub-behavior ids that each get their own per-user count column.
_BROWSE_SUB_BEHAVIOR_IDS = list(range(1, 12))


def _browse_period_features(records, prefix):
    """Aggregate the browse records of one period into one row per user.

    Parameters
    ----------
    records : pandas.DataFrame
        Browse rows restricted to a single period; must contain the columns
        '用户标识', '浏览行为数据' and '浏览子行为编号'.
    prefix : str
        Text prepended to every produced column name ('放款前' or '放款后').

    Returns
    -------
    pandas.DataFrame
        Column '用户标识' followed by, in order: sum/max/mean/min/std/var of
        '浏览行为数据'; the count and distinct count of '浏览子行为编号'; and one
        occurrence-count column per id in ``_BROWSE_SUB_BEHAVIOR_IDS``.
    """
    per_user = records.groupby('用户标识')

    # A list of function names (instead of a renaming dict) works on both old
    # and current pandas; the renaming-dict form raises SpecificationError on
    # pandas >= 1.0.
    data_stats = per_user['浏览行为数据'].agg(['sum', 'max', 'mean', 'min', 'std', 'var'])
    data_stats.columns = ['{}浏览行为数据{}'.format(prefix, stat) for stat in data_stats.columns]

    # 'nunique' is computed in the same aggregation so the distinct count is
    # aligned with its user by construction.
    sub_stats = per_user['浏览子行为编号'].agg(['count', 'nunique'])
    sub_stats.columns = [prefix + '浏览子行为编号count', prefix + '浏览子行为编号计数（去重）']

    # Occurrences of each sub-behavior id per user. Reindexing guarantees a
    # column for every expected id even when an id never occurs in `records`,
    # and a zero row for users whose ids are all missing.
    id_counts = (
        records.groupby(['用户标识', '浏览子行为编号']).size()
        .unstack(fill_value=0)
        .reindex(index=data_stats.index, columns=_BROWSE_SUB_BEHAVIOR_IDS, fill_value=0)
    )
    id_counts.columns = ['{}浏览子行为编号_{}'.format(prefix, sub_id)
                         for sub_id in _BROWSE_SUB_BEHAVIOR_IDS]

    return data_stats.join(sub_stats).join(id_counts).reset_index()


def constructBrowseDetailFeature(loantime_file, browse_detail_file, output_file):
    """Build per-user browse-history features and write them to a CSV file.

    Timestamps in both inputs are floor-divided by 86400 before use. Browse
    rows are split into those at or before the user's loan time ('放款前') and
    those strictly after it ('放款后'); each split is aggregated per user.
    Statistics of the browse time over all of a user's rows are appended as
    columns named '浏览时间_<stat>'.

    Parameters
    ----------
    loantime_file : str
        CSV with a header row and two columns: user id, loan time.
    browse_detail_file : str
        CSV with a header row and four columns: user id, browse time,
        browse behavior data, browse sub-behavior id.
    output_file : str
        Destination CSV; written without the index, encoded with FILE_ENCODER.

    Notes
    -----
    * The '放款后浏览子行为编号_k' columns are computed from rows after the loan
      time. They were previously computed from the rows before it, which
      duplicated the '放款前' columns.
    * The distinct count uses ``nunique``, which ignores missing values.
    * The browse-time statistics are written as '浏览时间_sum', '浏览时间_max',
      etc. rather than being merged in with MultiIndex columns.
    """
    seconds_per_day = 86400
    user_col = '用户标识'

    loantime = pd.read_csv(loantime_file, header=0, names=[user_col, '放款时间'])
    loantime['放款时间'] = loantime['放款时间'] // seconds_per_day

    browse_detail = pd.read_csv(browse_detail_file, header=0,
                                names=[user_col, '浏览时间', '浏览行为数据', '浏览子行为编号'])
    browse_detail['浏览时间'] = browse_detail['浏览时间'] // seconds_per_day

    browse_with_loan = pd.merge(browse_detail, loantime, how='left', on=user_col)
    is_before_loan = browse_with_loan['浏览时间'] <= browse_with_loan['放款时间']
    is_after_loan = browse_with_loan['浏览时间'] > browse_with_loan['放款时间']

    # One output row per user in the loan-time file; users without browse
    # records in a period get NaN in that period's columns (left merge).
    feature = loantime
    for prefix, period_mask in (('放款前', is_before_loan), ('放款后', is_after_loan)):
        period_features = _browse_period_features(browse_with_loan[period_mask], prefix)
        feature = pd.merge(feature, period_features, how='left', on=user_col)
    print(feature.shape)

    # Browse-time statistics over all rows, regardless of the loan time.
    time_stats = browse_detail.groupby(user_col)['浏览时间'].agg(
        ['sum', 'max', 'min', 'mean', 'median', 'var', 'count'])
    time_stats.columns = ['浏览时间_' + stat for stat in time_stats.columns]
    feature = pd.merge(feature, time_stats.reset_index(), how='left', on=user_col)
    print(feature.shape)

    feature.to_csv(output_file, index=False, encoding=FILE_ENCODER)

# 正式代码

In [3]:
print("begin")
# Build the browse-history feature file for the training set, then the test set.
for loantime_path, browse_path, feature_path in (
    (LOANTIME_TRAIN_FILE, BROWSE_HISTORY_TRAIN_FILE, "../feature/browse_detail_train20181106.csv"),
    (LOANTIME_TEST_FILE, BROWSE_HISTORY_TEST_FILE, "../feature/browse_detail_test20181106.csv"),
):
    constructBrowseDetailFeature(loantime_path, browse_path, feature_path)
print("end")

begin


FileNotFoundError: File b'../train/loantime_train.csv' does not exist