In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
import gc
from tqdm import tqdm

## Data Wrangling

In [2]:
train_X = pd.read_csv('data/train.csv')
test_X = pd.read_csv('data/test.csv')

In [3]:
train_X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.30137,7,0.044131,9,0.8,4,1,1,4,1,0.02174,0.0,0.397959,39
1,0.452055,6,0.048052,9,0.8,2,4,0,4,1,0.0,0.0,0.122449,39
2,0.287671,4,0.137581,11,0.533333,0,6,1,4,1,0.0,0.0,0.397959,39
3,0.493151,4,0.150486,1,0.4,2,6,0,2,1,0.0,0.0,0.397959,39
4,0.150685,4,0.220635,9,0.8,2,10,5,2,0,0.0,0.0,0.397959,5


In [4]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
age               32561 non-null float64
workclass         32561 non-null int64
fnlwgt            32561 non-null float64
education         32561 non-null int64
education_num     32561 non-null float64
marital_status    32561 non-null int64
occupation        32561 non-null int64
relationship      32561 non-null int64
race              32561 non-null int64
sex               32561 non-null int64
capital_gain      32561 non-null float64
capital_loss      32561 non-null float64
hours_per_week    32561 non-null float64
native_country    32561 non-null int64
dtypes: float64(6), int64(8)
memory usage: 3.5 MB


In [5]:
train_Y = pd.read_csv('data/y_train.csv')
test_Y = pd.read_csv('data/y_test.csv')

In [6]:
train_Y.head()

Unnamed: 0,income_label
0,0
1,0
2,0
3,0
4,0


In [10]:
train_Y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 1 columns):
income_label    32561 non-null int64
dtypes: int64(1)
memory usage: 254.5 KB


In [11]:
continuous_feature = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
category_feature = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [12]:
class DataParse:
    def __init__(self, category_feature, continuous_feature, ignore_feature=[], feature_dict={}, feature_size=0, field_size=0):
        self.feature_dict = feature_dict
        self.feature_size = feature_size
        self.field_size = field_size
        self.ignore_feature = ignore_feature
        self.category_feature = category_feature
        self.continuous_feature = continuous_feature
    
    def FeatureDictionary(self, train, test):
        """
        目的是给每一个特征维度都进行编号。
        1. 对于离散特征，one-hot之后每一列都是一个新的特征维度(计算编号时，不算0)。所以，原来的一维度对应的是很多维度，编号也是不同的。
        2. 对于连续特征，原来的一维特征依旧是一维特征。
        返回一个feat_dict，用于根据原特征名称和特征取值 快速查询出 对应的特征编号。
        train: 原始训练集
        test:  原始测试集
        continuous_feature: 所有数值型特征
        ignore_feature: 所有忽略的特征. 除了数值型和忽略的，剩下的全部认为是离散型
        feat_dict, feat_size
             1. feat_size: one-hot之后总的特征维度。
             2. feat_dict是一个{}， key是特征string的col_name, value可能是编号（int），可能也是一个字典。
             如果原特征是连续特征： value就是int，表示对应的特征编号；
             如果原特征是离散特征：value就是dict，里面是根据离散特征的 实际取值 查询 该维度的特征编号。 因为离散特征one-hot之后，
             一个取值就是一个维度，而一个维度就对应一个编号。
        """
        df = pd.concat([train, test], axis=0)
        feat_dict = {}
        total_cnt = 0
        
        for col in df.columns:
            # 连续特征只有一个编号
            if col in self.continuous_feature:
                feat_dict[col] = total_cnt
                total_cnt = total_cnt + 1
            elif col in self.category_feature:
                unique_vals = df[col].unique()
                unique_cnt = df[col].nunique()
                feat_dict[col] = dict(zip(unique_vals, range(total_cnt, total_cnt + unique_cnt)))
                total_cnt = total_cnt + unique_cnt
        
        self.feature_size = total_cnt
        self.feature_dict = feat_dict
        print('feat_dict=', feat_dict)
        print('feature_size=', total_cnt)
    
    def parse(self, df):
        dfi = df.copy()
        dfv = df.copy()
        for col in dfi.columns:
            if col in self.ignore_feature:
                dfi.drop([col], axis=1, inplace=True)
                dfv.drop([col], axis=1, inplace=True)

            elif col in self.continuous_feature:  # 连续特征1个维度，对应1个编号，这个编号是一个定值
                dfi[col] = self.feature_dict[col]

            elif col in self.category_feature:  # 离散特征。不同取值对应不同的特征维度，编号也是不同的。
                dfi[col] = dfi[col].map(self.feature_dict[col])
                dfv[col] = 1.0

        feature_index = dfi.values.tolist()
        feature_val = dfv.values.tolist()
        self.field_size = len(feature_index[0])
        del dfi, dfv
        gc.collect()

        return feature_index, feature_val

In [13]:
dataParse = DataParse(continuous_feature=continuous_feature, category_feature=category_feature)
dataParse.FeatureDictionary(train_X, test_X)
train_feature_index, train_feature_val = dataParse.parse(train_X)
test_feature_index, test_feature_val = dataParse.parse(test_X)

feat_dict= {'age': 0, 'workclass': {7: 1, 6: 2, 4: 3, 1: 4, 2: 5, 0: 6, 5: 7, 8: 8, 3: 9}, 'fnlwgt': 10, 'education': {9: 11, 11: 12, 1: 13, 12: 14, 6: 15, 15: 16, 7: 17, 8: 18, 5: 19, 10: 20, 14: 21, 4: 22, 0: 23, 3: 24, 13: 25, 2: 26}, 'education_num': 27, 'marital_status': {4: 28, 2: 29, 0: 30, 3: 31, 5: 32, 1: 33, 6: 34}, 'occupation': {1: 35, 4: 36, 6: 37, 10: 38, 8: 39, 12: 40, 3: 41, 14: 42, 5: 43, 7: 44, 13: 45, 0: 46, 11: 47, 2: 48, 9: 49}, 'relationship': {1: 50, 0: 51, 5: 52, 3: 53, 4: 54, 2: 55}, 'race': {4: 56, 2: 57, 1: 58, 0: 59, 3: 60}, 'sex': {1: 61, 0: 62}, 'capital_gain': 63, 'capital_loss': 64, 'hours_per_week': 65, 'native_country': {39: 66, 5: 67, 23: 68, 19: 69, 0: 70, 26: 71, 35: 72, 33: 73, 16: 74, 9: 75, 2: 76, 11: 77, 20: 78, 30: 79, 22: 80, 31: 81, 4: 82, 1: 83, 37: 84, 7: 85, 25: 86, 36: 87, 14: 88, 32: 89, 6: 90, 8: 91, 10: 92, 13: 93, 3: 94, 24: 95, 41: 96, 29: 97, 28: 98, 34: 99, 38: 100, 12: 101, 27: 102, 40: 103, 17: 104, 21: 105, 18: 106, 15: 107}}
fe

In [19]:
train_Y = train_Y.values.reshape(-1, 1)
test_Y = test_Y.values.reshape(-1, 1)

In [41]:
train_db = tf.data.Dataset.from_tensor_slices((train_feature_index, train_feature_val, train_Y))
test_db = tf.data.Dataset.from_tensor_slices((test_feature_index, test_feature_val, test_Y))

In [66]:
train_db = train_db.shuffle(2000).batch(128)

## Model

In [137]:
epochs = 10
embedding_size = 8
learning_rate = 0.001
n_hidden_1 = 32
n_hidden_2 = 32
drop_rate = 0.1
display_step = 50

In [97]:
random_normal = tf.initializers.RandomNormal()
weights = {
    'embedding': tf.Variable(random_normal([dataParse.feature_size, embedding_size])),
    'layer1': tf.Variable(random_normal([dataParse.field_size*embedding_size, n_hidden_1])),
    'layer2': tf.Variable(random_normal([n_hidden_1, n_hidden_2])),
    'out':  tf.Variable(random_normal([n_hidden_2, 1]))
}
biases = {
    'layer1': tf.Variable(tf.zeros([n_hidden_1])),
    'layer2': tf.Variable(tf.zeros([ n_hidden_2])),
    'out': tf.Variable(tf.zeros([1]))
}

In [133]:
def fnn_net(feature_index, feature_value):
    embeddings = tf.nn.embedding_lookup(weights['embedding'], feature_index)  # [None, field_size, embedding_size]
    feat_value = tf.reshape(feature_value, shape=[-1, dataParse.field_size, 1])  # [None, field_size, 1]
    embeddings = tf.multiply(embeddings, feat_value)
    embeddings = tf.reshape(embeddings, [-1, dataParse.field_size * embedding_size])
    
    layer_1 = tf.add(tf.matmul(embeddings, weights['layer1']), biases['layer1'])
    layer_1 = tf.nn.dropout(layer_1, rate=drop_rate)
    
    layer_2 = tf.add(tf.matmul(layer_1, weights['layer2']), biases['layer2'])
    layer_2 = tf.nn.dropout(layer_2, rate=drop_rate)
    
    out = tf.matmul(layer_2, weights['out']) + biases['out']
    return out

In [134]:
optimizer = tf.optimizers.Adam(lr=learning_rate)

In [135]:
def run_optimization(feature_index, feature_value, y):
    with tf.GradientTape() as tape:
        logit = fnn_net(feature_index, feature_value)
        y = tf.cast(y, tf.float32)
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit))
        trainable_variables = list(weights.values()) + list(biases.values())
        grad = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(grad, trainable_variables))

In [138]:
for epoch in range(epochs):
    for step, (feature_index, feature_value, y) in enumerate(train_db):
        run_optimization(feature_index, feature_value, y)
        
        if step % display_step == 0:
            pred = fnn_net(feature_index, feature_value)
            y = tf.cast(y, tf.float32)
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=pred))
            auc = tf.reduce_mean(roc_auc_score(y, tf.nn.sigmoid(pred)))
            print("step: %i, loss: %f, auc: %f" % (step, loss, auc)) 

step: 0, loss: 0.364972, auc: 0.897436
step: 50, loss: 0.371608, auc: 0.850343
step: 100, loss: 0.343314, auc: 0.902018
step: 150, loss: 0.276011, auc: 0.935547
step: 200, loss: 0.340474, auc: 0.849132
step: 250, loss: 0.363641, auc: 0.896572
step: 0, loss: 0.317753, auc: 0.915012
step: 50, loss: 0.358479, auc: 0.896283
step: 100, loss: 0.275526, auc: 0.923720
step: 150, loss: 0.271648, auc: 0.938435
step: 200, loss: 0.255456, auc: 0.907660
step: 250, loss: 0.360771, auc: 0.886762
step: 0, loss: 0.327674, auc: 0.915761
step: 50, loss: 0.322903, auc: 0.909831
step: 100, loss: 0.329224, auc: 0.900357
step: 150, loss: 0.345543, auc: 0.885933
step: 200, loss: 0.344086, auc: 0.889237
step: 250, loss: 0.345721, auc: 0.885933
step: 0, loss: 0.366712, auc: 0.885795
step: 50, loss: 0.319644, auc: 0.887857
step: 100, loss: 0.319292, auc: 0.917526
step: 150, loss: 0.318819, auc: 0.910200
step: 200, loss: 0.352013, auc: 0.898246
step: 250, loss: 0.308153, auc: 0.912205
step: 0, loss: 0.313411, auc