In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import gc

## Data

In [14]:
# Load the pre-processed feature tables for the train and test splits.
# NOTE(review): the head() output below lists an 'Unnamed: 0' column; if the CSVs
# were written with their index, that column is a row id, not a feature — confirm.
train_X, test_X = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [6]:
# Preview the first five feature rows.
train_X.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.30137,7,0.044131,9,0.8,4,1,1,4,1,0.02174,0.0,0.397959,39
1,0.452055,6,0.048052,9,0.8,2,4,0,4,1,0.0,0.0,0.122449,39
2,0.287671,4,0.137581,11,0.533333,0,6,1,4,1,0.0,0.0,0.397959,39
3,0.493151,4,0.150486,1,0.4,2,6,0,2,1,0.0,0.0,0.397959,39
4,0.150685,4,0.220635,9,0.8,2,10,5,2,0,0.0,0.0,0.397959,5


In [15]:
# Load the label tables that accompany the train and test feature tables.
train_Y, test_Y = map(pd.read_csv, ('data/y_train.csv', 'data/y_test.csv'))

In [16]:
# Preview the first five label rows.
train_Y.head(n=5)

Unnamed: 0,income_label
0,0
1,0
2,0
3,0
4,0


In [17]:
# Columns treated as continuous: each one occupies a single feature index.
continuous_feature = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Columns treated as categorical: each distinct value gets its own feature index.
category_feature = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [18]:
class DataParse:
    """Assign a global index to every feature dimension and encode DataFrames with it.

    A continuous column occupies exactly one feature index. A categorical column
    occupies one index per distinct value (as if it had been one-hot encoded).
    ``parse`` then turns a DataFrame into two parallel row-major lists: the
    feature index of every field and the value attached to that index.

    Attributes:
        category_feature: names of the categorical columns.
        continuous_feature: names of the continuous columns.
        ignore_feature: names of columns that ``parse`` drops.
        feature_dict: ``{column: index}`` for continuous columns and
            ``{column: {value: index}}`` for categorical columns; filled by
            ``FeatureDictionary``.
        feature_size: total number of feature indices; set by ``FeatureDictionary``.
        field_size: number of fields per encoded row; set by ``parse``.
    """

    def __init__(self, category_feature, continuous_feature, ignore_feature=None, feature_dict=None, feature_size=0, field_size=0):
        # None sentinels instead of `[]` / `{}` defaults: a mutable default is created
        # once and would be shared by every instance that relies on it.
        self.feature_dict = {} if feature_dict is None else feature_dict
        self.feature_size = feature_size
        self.field_size = field_size
        self.ignore_feature = [] if ignore_feature is None else ignore_feature
        self.category_feature = category_feature
        self.continuous_feature = continuous_feature

    def FeatureDictionary(self, train, test):
        """Build the feature-index dictionary from the union of ``train`` and ``test``.

        Columns are visited in DataFrame order. A column listed in
        ``continuous_feature`` receives the next free index. A column listed in
        ``category_feature`` receives one consecutive index per distinct non-null
        value. Columns in neither list are skipped.

        Args:
            train: training DataFrame.
            test: test DataFrame; concatenated with ``train`` so that categorical
                values occurring only in the test set also get an index.

        Side effects:
            Sets ``self.feature_dict`` and ``self.feature_size`` and prints both.
        """
        df = pd.concat([train, test], axis=0)
        feat_dict = {}
        total_cnt = 0

        for col in df.columns:
            if col in self.continuous_feature:
                # A continuous feature has a single index.
                feat_dict[col] = total_cnt
                total_cnt += 1
            elif col in self.category_feature:
                # Take values and count from the same null-free array. Pairing
                # `unique()` (which includes NaN) with `nunique()` (which does not)
                # made `zip` drop the last real category whenever a null was present.
                unique_vals = df[col].dropna().unique()
                unique_cnt = len(unique_vals)
                feat_dict[col] = dict(zip(unique_vals, range(total_cnt, total_cnt + unique_cnt)))
                total_cnt += unique_cnt

        self.feature_size = total_cnt
        self.feature_dict = feat_dict
        print('feat_dict=', feat_dict)
        print('feature_size=', total_cnt)

    def parse(self, df):
        """Encode ``df`` as parallel lists of feature indices and feature values.

        Only columns registered as continuous or categorical (and not listed in
        ``ignore_feature``) are encoded. Any other column, for example a stray
        row-id column, is dropped, because it has no entry in ``feature_dict``
        and its raw values would otherwise be emitted as feature indices.

        Args:
            df: DataFrame to encode; ``FeatureDictionary`` must have been called
                (or ``feature_dict`` supplied) beforehand.

        Returns:
            ``(feature_index, feature_val)``: two lists of rows. For a continuous
            field the index is the column's fixed index and the value is the
            original number. For a categorical field the index is looked up from
            the cell value and the value is ``1.0``. A categorical value missing
            from ``feature_dict`` yields NaN as its index.

        Side effects:
            Sets ``self.field_size`` to the number of encoded columns.
        """
        kept_cols = [
            col for col in df.columns
            if col not in self.ignore_feature
            and (col in self.continuous_feature or col in self.category_feature)
        ]
        dfi = df[kept_cols].copy()
        dfv = df[kept_cols].copy()

        for col in kept_cols:
            if col in self.continuous_feature:
                # One dimension, one constant index; the value column stays as-is.
                dfi[col] = self.feature_dict[col]
            else:
                # The index depends on the cell value; the value is the one-hot 1.0.
                dfi[col] = dfi[col].map(self.feature_dict[col])
                dfv[col] = 1.0

        feature_index = dfi.values.tolist()
        feature_val = dfv.values.tolist()
        # Use the column count rather than len(feature_index[0]) so an empty frame
        # does not raise IndexError.
        self.field_size = dfi.shape[1]
        del dfi, dfv
        gc.collect()

        return feature_index, feature_val

In [21]:
# Build the feature dictionary over train + test, then encode each split into
# parallel (feature index, feature value) lists.
dataParse = DataParse(category_feature, continuous_feature)
dataParse.FeatureDictionary(train=train_X, test=test_X)
(train_feature_index, train_feature_val), (test_feature_index, test_feature_val) = (
    dataParse.parse(split) for split in (train_X, test_X)
)

feat_dict= {'capital_loss': 64, 'native_country': {0: 70, 1: 83, 2: 76, 3: 94, 4: 82, 5: 67, 6: 90, 7: 85, 8: 91, 9: 75, 10: 92, 11: 77, 12: 101, 13: 93, 14: 88, 15: 107, 16: 74, 17: 104, 18: 106, 19: 69, 20: 78, 21: 105, 22: 80, 23: 68, 24: 95, 25: 86, 26: 71, 27: 102, 28: 98, 29: 97, 30: 79, 31: 81, 32: 89, 33: 73, 34: 99, 35: 72, 36: 87, 37: 84, 38: 100, 39: 66, 40: 103, 41: 96}, 'hours_per_week': 65, 'fnlwgt': 10, 'age': 0, 'workclass': {0: 6, 1: 4, 2: 5, 3: 9, 4: 3, 5: 7, 6: 2, 7: 1, 8: 8}, 'occupation': {0: 46, 1: 35, 2: 48, 3: 41, 4: 36, 5: 43, 6: 37, 7: 44, 8: 39, 9: 49, 10: 38, 11: 47, 12: 40, 13: 45, 14: 42}, 'education': {0: 23, 1: 13, 2: 26, 3: 24, 4: 22, 5: 19, 6: 15, 7: 17, 8: 18, 9: 11, 10: 20, 11: 12, 12: 14, 13: 25, 14: 21, 15: 16}, 'sex': {0: 62, 1: 61}, 'capital_gain': 63, 'marital_status': {0: 30, 1: 33, 2: 29, 3: 31, 4: 28, 5: 32, 6: 34}, 'education_num': 27, 'relationship': {0: 51, 1: 50, 2: 55, 3: 53, 4: 54, 5: 52}, 'race': {0: 59, 1: 58, 2: 57, 3: 60, 4: 56}}
feature_size= 108

In [28]:
# Flatten each label table into a single-column array (shape (-1, 1)).
# NOTE: this rebinds train_Y / test_Y from DataFrames to ndarrays, so the cell
# cannot be re-run without reloading the label CSVs first.
train_Y = np.reshape(train_Y.values, (-1, 1))
test_Y = np.reshape(test_Y.values, (-1, 1))

## Model