# STEP3: FeatureEngineering 特征工程
## Version Change：
**11-07** v2版本的改动：<br>
1. 对 bill_detail_train_time20181106、bill_detail_test_time20181106、bill_detail_train_notime20181106、bill_detail_test_notime20181106 做特征工程

**11-08** v3版本的改动：<br>
1. 对bill_detail_train_all20181106, bill_detail_test_all20181106做特征工程


Input：
- Step2产生的../feature目录下所有的feature文件

Output：
- 输出经过特征变换的结果，并保存在 ../feature/features_all_train_test_20181106.csv



In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split


# common prefix
TRAIN_PREFIX = "../train/"
TEST_PREFIX = "../B/"
FEATURE_PREFIX = "../feature/"

# File Encoder
FILE_ENCODER = "utf-8"


def saveFeatureToFile(features, output_file):
    """Write ``features`` to ``output_file`` as CSV, without the row index.

    Args:
        features: DataFrame to persist.
        output_file: Destination path of the CSV file.

    The file is encoded with the module-level ``FILE_ENCODER``.
    """
    # ``index`` is documented as a bool; pass False explicitly instead of
    # relying on None being falsy.
    features.to_csv(output_file, index=False, encoding=FILE_ENCODER)


In [12]:
def extractSimpleFeature(train_file, test_file):
    """Build the base feature table from the basic train/test CSV files.

    Missing values in the test file are replaced with -1, the two files are
    stacked (train first), the index is renumbered, and the categorical user
    columns are expanded into dummy columns.

    Args:
        train_file: Path of the basic-feature CSV for the training set.
        test_file: Path of the basic-feature CSV for the test set.

    Returns:
        DataFrame holding train rows followed by test rows, with the five
        categorical columns one-hot encoded.
    """
    # NOTE: an earlier variant of this function also expanded the dummies with
    # PolynomialFeatures(interaction_only=True); that variant is disabled.
    categorical_columns = ['用户性别', '用户职业', '用户教育程度', '用户婚姻状态', '用户户口类型']

    train_part = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test_part = pd.read_csv(test_file, encoding=FILE_ENCODER).fillna(-1)

    # reset_index() moves the old row labels into an 'index' column, which is
    # then discarded so only a fresh 0..n-1 index remains.
    stacked = pd.concat([train_part, test_part], axis=0).reset_index()
    stacked = stacked.drop(columns=['index'])

    return pd.get_dummies(stacked, columns=categorical_columns)




In [13]:
# 1. bill_detail_train_time20181106, bill_detail_test_time20181106
def extractBillFeatureTime(trains, train_file, test_file):
    """Merge the time-known bill features into ``trains``.

    The train and test CSV files are stacked, the '放款时间' column is
    dropped, and for every column whose name contains '款后' (and does not
    contain '与') a difference column ``<after> - <before>`` is added when the
    matching '款前' column exists. Missing values are then replaced with 0 and
    the table is left-joined onto ``trains`` on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the time-known bill feature CSV (training set).
        test_file: Path of the time-known bill feature CSV (test set).

    Returns:
        ``trains`` with the bill feature columns appended.
    """
    print("extractBillFeatureTime")
    train = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test = pd.read_csv(test_file, encoding=FILE_ENCODER)

    # drop() returns a new frame, so the column assignments below never write
    # into a slice of another DataFrame.
    bill = pd.concat([train, test], axis=0).drop('放款时间', axis=1)

    # Snapshot the column names first: the loop adds columns as it goes.
    source_columns = list(bill.columns)
    known_columns = set(source_columns)

    # Feature round: difference between the after-loan and before-loan values.
    for name in source_columns:
        # Plain substring tests. The previous str.find() comparisons treated a
        # match at position 0 as "not found", skipping names that start with
        # '款后' and accepting names that start with '与'.
        if '款后' not in name or '与' in name:
            continue
        before_name = name.replace('款后', '款前')
        if before_name in known_columns:
            bill[name + "与" + before_name + "差值"] = bill[name] - bill[before_name]

    # Missing values (including differences of missing inputs) become 0.
    bill = bill.fillna(0)
    trains = pd.merge(trains, bill, how='left', on="用户标识")
    print(trains.shape)
    return trains
    
    

In [14]:
# 2. bill_detail_train_notime20181106, bill_detail_test_notime20181106
def extractBillFeatureNoTime(trains, train_file, test_file):
    """Merge the time-unknown bill features into ``trains``.

    Training rows that have a value in '时间未知上期账单金额_sum' are randomly
    halved (fixed seed) before being stacked with the test rows; the original
    comment states this is meant to keep the train and test feature
    distributions similar. Missing values are replaced with 0 and the table
    is inner-joined onto ``trains`` on '用户标识', so users absent from the
    resulting bill table are removed from ``trains``.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the time-unknown bill feature CSV (training set).
        test_file: Path of the time-unknown bill feature CSV (test set).

    Returns:
        The inner join of ``trains`` with the time-unknown bill features.
    """
    print("extractBillFeatureNoTime")
    train = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test = pd.read_csv(test_file, encoding=FILE_ENCODER)

    # Split the training rows by whether the reference column is missing.
    print(train.shape)
    amount_missing = train['时间未知上期账单金额_sum'].isnull().values
    rows_without_amount = train[amount_missing]
    print(rows_without_amount.shape)
    rows_with_amount = train[~amount_missing]
    print(rows_with_amount.shape)

    # Keep a random half of the rows that do have a value; the other half is
    # discarded.
    kept_half, _ = train_test_split(rows_with_amount, test_size=0.5, random_state=36)
    print(kept_half.shape)
    train = pd.concat([rows_without_amount, kept_half], axis=0)
    print(train.shape)

    bill_features = pd.concat([train, test], axis=0)
    print(bill_features.shape)
    # Missing values become 0.
    bill_features = bill_features.fillna(0)
    print(bill_features.shape)

    trains = pd.merge(trains, bill_features, how='inner', on="用户标识")
    print("final trains feature shape:")
    print(trains.shape)
    return trains



In [15]:
# 3. bill_detail_train_all20181106, bill_detail_test_all20181106
def extractBillFeatureAll(trains, train_file, test_file):
    """Merge the "all" bill features into ``trains``.

    The train and test CSV files are stacked, missing values are replaced
    with 0, and the result is left-joined onto ``trains`` on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the bill feature CSV for the training set.
        test_file: Path of the bill feature CSV for the test set.

    Returns:
        ``trains`` with the bill feature columns appended.
    """
    print("extractBillFeatureAll")
    frames = [
        pd.read_csv(path, encoding=FILE_ENCODER)
        for path in (train_file, test_file)
    ]
    bill_all = pd.concat(frames, axis=0)
    print(bill_all.shape)

    # Missing values become 0; the shape is printed again after filling.
    bill_all = bill_all.fillna(0)
    print(bill_all.shape)

    merged = pd.merge(trains, bill_all, how='left', on="用户标识")
    print(merged.shape)
    return merged

In [16]:
def extractBillFeature2(trains, train_file, test_file):
    """Merge the second set of bill features into ``trains``.

    Stacks the train and test CSV files, replaces missing values with 0 and
    left-joins the result onto ``trains`` on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the feature CSV for the training set.
        test_file: Path of the feature CSV for the test set.

    Returns:
        ``trains`` with the additional feature columns appended.
    """
    print("extractBillFeature2")
    train_part = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test_part = pd.read_csv(test_file, encoding=FILE_ENCODER)

    stacked = pd.concat([train_part, test_part], axis=0)
    print(stacked.shape)

    # Missing values become 0.
    filled = stacked.fillna(0)
    print(filled.shape)

    result = pd.merge(left=trains, right=filled, how='left', on="用户标识")
    print(result.shape)
    return result

In [17]:
def extractBillFeatureBasic(trains, train_file, test_file):
    """Merge the basic bill features into ``trains``.

    The train and test CSV files are stacked, the '放款时间' column is
    dropped, and for every column whose name contains '款后' (and does not
    contain '与') a difference column ``<after> - <before>`` is added when the
    matching '款前' column exists. Missing values are then replaced with 0 and
    the table is left-joined onto ``trains`` on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the basic bill feature CSV (training set).
        test_file: Path of the basic bill feature CSV (test set).

    Returns:
        ``trains`` with the bill feature columns appended.
    """
    print("extractBillFeatureBasic")
    train = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test = pd.read_csv(test_file, encoding=FILE_ENCODER)

    # drop() returns a new frame, so the column assignments below never write
    # into a slice of another DataFrame.
    bill = pd.concat([train, test], axis=0).drop('放款时间', axis=1)
    print(bill.shape)

    # Snapshot the column names first: the loop adds columns as it goes.
    source_columns = list(bill.columns)
    known_columns = set(source_columns)

    # Feature round: difference between the after-loan and before-loan values.
    for name in source_columns:
        # Plain substring tests. The previous str.find() comparisons treated a
        # match at position 0 as "not found", skipping names that start with
        # '款后' and accepting names that start with '与'.
        if '款后' not in name or '与' in name:
            continue
        before_name = name.replace('款后', '款前')
        if before_name in known_columns:
            bill[name + "与" + before_name + "差值"] = bill[name] - bill[before_name]

    # Missing values (including differences of missing inputs) become 0.
    bill = bill.fillna(0)
    print(bill.shape)
    trains = pd.merge(trains, bill, how='left', on="用户标识")
    print(trains.shape)
    return trains

In [18]:
def extractBrowseHistoryFeature(trains, train_file, test_file):
    """Merge the browse-history features into ``trains``.

    Stacks the train and test CSV files, replaces missing values with 0 and
    left-joins the result onto ``trains`` on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the browse feature CSV for the training set.
        test_file: Path of the browse feature CSV for the test set.

    Returns:
        ``trains`` with the browse feature columns appended.
    """
    print("extractBrowseHistoryFeature")
    train_part = pd.read_csv(train_file, encoding=FILE_ENCODER)
    test_part = pd.read_csv(test_file, encoding=FILE_ENCODER)

    # Missing values become 0 before the join.
    browse = pd.concat([train_part, test_part], axis=0).fillna(0)

    trains = pd.merge(trains, browse, how='left', on="用户标识")
    print(trains.shape)
    return trains

In [19]:
def extractBankDetailFeature(trains, train_file, test_file):
    """Merge the bank-statement features into ``trains``.

    Stacks the train and test CSV files, drops the '放款时间' column,
    replaces missing values with 0 and left-joins the result onto ``trains``
    on '用户标识'.

    Args:
        trains: Feature DataFrame built so far; must contain '用户标识'.
        train_file: Path of the bank feature CSV for the training set.
        test_file: Path of the bank feature CSV for the test set.

    Returns:
        ``trains`` with the bank feature columns appended.
    """
    print("extractBankDetailFeature")
    sources = (train_file, test_file)
    loaded = [pd.read_csv(source, encoding=FILE_ENCODER) for source in sources]

    bank = pd.concat(loaded, axis=0)
    bank = bank.drop(columns='放款时间')
    print(bank.shape)

    # Missing values become 0.
    bank = bank.fillna(0)
    print(bank.shape)

    combined = pd.merge(trains, bank, how='left', on="用户标识")
    print(combined.shape)
    return combined

In [20]:
# Step 3.1: run this whole cell in one go rather than call by call.
# Re-running a single step would merge the same table into `trains` again and
# duplicate its feature columns.
# Feature engineering: feature transforms, dummy-variable construction, etc.
print("begin")
trains = extractSimpleFeature("../feature/basic_train_20181106.csv",
                              "../feature/basic_test_20181106.csv")

# (merge step, train feature file, test feature file), applied in this order.
# The bank_detail2 files were merged through extractBankDetailFeature in a
# call that is currently disabled, so they are not listed here.
merge_steps = (
    (extractBrowseHistoryFeature,
     "../feature/browse_detail_train20181106.csv",
     "../feature/browse_detail_test20181106.csv"),
    (extractBankDetailFeature,
     "../feature/bank_detail_train20181106.csv",
     "../feature/bank_detail_test20181106.csv"),
    (extractBillFeatureBasic,
     "../feature/bill_detail_train_basic20181106.csv",
     "../feature/bill_detail_test_basic20181106.csv"),
    (extractBillFeatureTime,
     "../feature/bill_detail_train_time20181106.csv",
     "../feature/bill_detail_test_time20181106.csv"),
    (extractBillFeatureNoTime,
     "../feature/bill_detail_train_notime20181106.csv",
     "../feature/bill_detail_test_notime20181106.csv"),
    (extractBillFeatureAll,
     "../feature/bill_detail_train_all20181106.csv",
     "../feature/bill_detail_test_all20181106.csv"),
    (extractBillFeature2,
     "../feature/bill_detail_feature2_train_20181106.csv",
     "../feature/bill_detail_feature2_test_20181106.csv"),
)
for merge_step, train_csv, test_csv in merge_steps:
    trains = merge_step(trains, train_csv, test_csv)

# Step 3.2: save the combined feature table.
print("save features to ../feature/features_all_train_test_20181106.csv")
saveFeatureToFile(trains, "../feature/features_all_train_test_20181106.csv")
print("end")


begin


FileNotFoundError: File b'../feature/basic_train_20181106.csv' does not exist