In [None]:
# Census Income Data
#  이 데이터는 미국 인구총조사를 통해 조사된 사람에 대해 여러가지 정보들(예, 나이, 교육, 직업 등)과
# 수입(Income)이 기록되어 있다. 이 Census Income Data의 여러 features를 이용해서 이 사람의 수입이 
# $50,000 을 넘을지 넘지 못할지를 예측하는 이진 분류기 모델을 만드는 것이 목표이다.
# 이진 분류기를 만들기 위해 Logistic Regression을 사용한다.
# Census Income Data의 features들은 연속형(Continuous)과 범주형(Categorical)으로 나누어진다. 전체 특징들은 다음과 같다.

In [None]:
# 이번 강의에서 우리는 성별, 교육, 직업 등의 특성을 담은 인구조사 데이터를 기반으로
# 한 사람의 연봉이 $50,000 을 넘는지를 판단하는 이진 분류 문제를 다룬다. 주어진 개인 정보로 로지스틱 회귀 모델을
# 학습시키며, 모델은 개인의 연봉이 $50,000 이상일 가능성으로 해석될 수 있는 0과 1 사이의 숫자를 출력한다.

In [None]:
# 인구 조사 데이터 에 대한 컬럼 설명 .. census income data column....
Column Name      type       Description
age             Continuous   The age of the individual
workclass       Categorical  The type of employer the individual has (government, military, private 등)
fnlwgt          Continuous   The number of people the census takers believe that observation represents (sample weight). final weight will  not be used.
education       Categorical  The highest level of education achieved for that individual.
education_num   Continuous   The highest level of education in numerical form
marital_status  Categorical  Marital status of the individual 혼인상태
occupation      Categorical  The occupation of the individual
relationship    Categorical  Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
race            Categorical  Amer-Indian-Eskimo, Asian-Pac-Islander, Black, White, Other
gender          Categorical  Female, Male.
capital_gain    Continuous   Capital gain recorded
capital_loss    Continuous   Capital losses recorded
hours_per_week  Continuous   Hours worked per week
native_country  Categorical  Country of origin of the individual
income_bracket  Categorical  '>50K' or '<=50K', meaning whether the person makes more than $50,000 annually.

In [None]:
# data는 다음 폴더에 있다. 
# https://archive.ics.uci.edu/ml/machine-learning-databases/adult/
# wget 으로 ./data/census_income/ 에 adult.data, adult.names, adult.test를 다운로드한다.

In [1]:
# tf.estimator API를 이용한 TensorFlow Wide & Deep Tutorial 예제..
#

import os
import tensorflow as tf
import pandas as pd


In [None]:
# flags = tf.app.flags
# FLAGS = flags.FLAGS

In [2]:
def define_wide_deep_flags(flags=None):
    """Register the model-type and training flags used by this notebook.

    Args:
        flags: Object exposing ``DEFINE_string`` and ``DEFINE_integer``
            (normally ``tf.app.flags``). When omitted, ``tf.app.flags`` is
            used. The flags are registered on exactly this object.
    """
    # Honour the caller's flags object. The previous version overwrote the
    # argument with tf.app.flags, so the parameter was silently ignored.
    if flags is None:
        flags = tf.app.flags

    flags.DEFINE_string('model_type', 'wide_n_deep', "valid model types: {'wide','deep', 'wide_n_deep'}.")
    flags.DEFINE_string('data_dir', './data/census_income/', 'path to data directory')
    flags.DEFINE_string('model_dir', './model/census_model/', 'path to base dirctory for output models.')
    flags.DEFINE_integer('train_steps', 200, 'number of training steps')
    flags.DEFINE_integer('epochs_between_evals', 2, 'number of epochs betwen evaluations')
    flags.DEFINE_integer('batch_size', 40, ' batch size for training or evaluation. must divide evenly into the data sizes.')
    # Dummy flag that works around the "unknown command line flag 'f'" error
    # (presumably raised because the notebook kernel is started with -f).
    flags.DEFINE_string('f', '', 'kernel')
    

In [3]:
# Column definitions used for training.
# `columns` is the full list of names applied to the CSV files when they are
# loaded; `label_column` is the name of the derived 0/1 target column.
label_column = 'label'
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket',
]

# Feature names split by kind; `fnlwgt` and `income_bracket` are in neither list.
continuous_columns = [
    'age',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]
categorical_columns = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native_country',
]

In [None]:
# # 컬럼들의 기본값을 지정한다.
# _csv_column_defatults = [[0], [''], [0], [''], [0],[''],[''],[''],[''],[''],
#                          [0], [0],[0], [''],['']]


In [None]:
# training 과 validation 에 사용할 train validation 데이터 갯수를 지정한다.
_num_examples = {
    'train':32561,
    'validation':16281,
}

In [None]:
# Per-model-type name prefix ('wide' -> 'linear/', 'deep' -> 'dnn/').
# NOTE(review): presumably a loss-tensor scope prefix; not used in the visible cells.
loss_prefix = dict(wide='linear/', deep='dnn/')

In [None]:
# def define_wide_deep_flags():
#     ''' model type 과 학습을 위한 flag들을 지정합니다.'''
# #     flags_core.define_base()
# #     flags_core.define_benchmark()
    
# #     flags.adopt_module_key_flags(flags_core)
#     flags = tf.app.flags
#     FLAGS = flags.FLAGS
    
#     flags.DEFINE_string(model_type, 'wide_deep', 'select model topology')
#     flags.DEFINE_string(data_dir, './data/census_income/', 'path to data directory')
#     flags.DEFINE_string(model_dir, './model/census_model/', 'path to model directory')
#     flags.DEFINE_integer(train_epochs, 40, 'number of epochs')
#     flags.DEFINE_integer(epochs_between_evals, 2, 'number of epochs betwen evaluations')
#     flags.DEFINE_integer(batch_size, 40, ' batch size for training or evaluation. must divide evenly into the data sizes.')
    

In [4]:
def build_model_columns():
    """Build the feature columns for the wide and the deep part of the model.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple of lists.
        ``wide_columns`` holds the categorical base columns, the bucketized
        age column and two crossed columns. ``deep_columns`` holds an
        8-dimensional embedding of every categorical column followed by the
        raw numeric columns.
    """
    fc = tf.feature_column

    # Continuous base columns.
    age = fc.numeric_column('age')
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # Categorical base columns with a fixed vocabulary.
    # The vocabulary entries must be spelled exactly like the values in the
    # data ('Female'/'Male', 'Asian-Pac-Islander'). The previous spellings
    # ('female'/'male', 'Asian-Pac-Island') did not match the data values.
    gender = fc.categorical_column_with_vocabulary_list(
        'gender', ['Female', 'Male'])
    race = fc.categorical_column_with_vocabulary_list(
        'race', ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black',
                 'Other', 'White'])

    # Categorical base columns identified by hashing, so no vocabulary has to
    # be listed up front.
    education = fc.categorical_column_with_hash_bucket(
        'education', hash_bucket_size=1000)
    marital_status = fc.categorical_column_with_hash_bucket(
        'marital_status', hash_bucket_size=100)
    relationship = fc.categorical_column_with_hash_bucket(
        'relationship', hash_bucket_size=100)
    workclass = fc.categorical_column_with_hash_bucket(
        'workclass', hash_bucket_size=100)
    occupation = fc.categorical_column_with_hash_bucket(
        'occupation', hash_bucket_size=1000)
    native_country = fc.categorical_column_with_hash_bucket(
        'native_country', hash_bucket_size=1000)

    # Transformation: turn the numeric age into age-range buckets.
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide part: base columns plus feature crosses.
    # `race` is not part of the wide columns; it is only used in the deep part.
    base_columns = [
        gender, native_country, education, occupation, workclass,
        marital_status, relationship, age_buckets,
    ]
    crossed_columns = [
        fc.crossed_column(
            ['education', 'occupation'], hash_bucket_size=int(1e4)),
        fc.crossed_column(
            [age_buckets, 'education', 'occupation'], hash_bucket_size=int(1e4)),
    ]
    wide_columns = base_columns + crossed_columns

    # Deep part: an embedding per categorical column plus the numeric columns.
    deep_columns = [
        fc.embedding_column(workclass, dimension=8),
        fc.embedding_column(education, dimension=8),
        fc.embedding_column(marital_status, dimension=8),
        fc.embedding_column(gender, dimension=8),
        fc.embedding_column(relationship, dimension=8),
        fc.embedding_column(race, dimension=8),
        fc.embedding_column(native_country, dimension=8),
        fc.embedding_column(occupation, dimension=8),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]

    return wide_columns, deep_columns



In [5]:
def build_estimator(model_dir, model_type):
    """Create the estimator that matches ``model_type``.

    Args:
        model_dir: Directory handed to the estimator as its ``model_dir``.
        model_type: ``'wide'`` selects a linear classifier, ``'deep'`` a DNN
            classifier; any other value selects the combined
            linear + DNN classifier.

    Returns:
        The constructed ``tf.estimator`` classifier.
    """
    wide_columns, deep_columns = build_model_columns()
    # Sizes of the hidden layers for the DNN part.
    hidden_units = [100, 50]

    if model_type == 'wide':
        estimator = tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns,
        )
    elif model_type == 'deep':
        estimator = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=hidden_units,
        )
    else:
        estimator = tf.estimator.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=hidden_units,
        )
    return estimator

In [None]:
# def input_fn(data_file, num_epochs, shuffle, batch_size):
#     '''Estimator를 위한 input function을 정의한다.'''
    
#     assert tf.gfile.Exists(data_file), (
#         '{} not found. Please make suer you have run data_download.py and '
#         'set the --data_dir agrument to the correct path'.format(data_file)
#     )
    
#     # csv 파일을 파싱한다.
#     def parse_csv(value):
#         print('Parsing  {}'.format(data_file))
#         columns = tf.decode_csv(value, record_defaults = _csv_column_defaults)
#         features = dict(zip(_csv_columns, cloumns))
#         labels = features.pop('incomen_bracket')
#         return features, tf.equal(labels, '>50k')
    
#     # extract lines from input files using the dataset api
#     dataset =  tf.data.TextLineDataset(data_file)
    
#     if shuffle:
#         dataset = dataset.shuffle(buffer_size=_num_examples['train'])
        
#     dataset = dataset.map(parse_csv, num_parallel_calls=5)
    
#     # we call repeat after shuffling, rather than before, to prevent separte
#     # epcosh from blending together.
#     dataset = dataset.repeat(num_epochs)
#     dataset = dataset.batch(batch_size)
#     return dataset

In [11]:
def input_fn(df):
    """Turn a DataFrame into the ``(features, label)`` pair for the estimator.

    Args:
        df: DataFrame holding every name in ``continuous_columns`` and
            ``categorical_columns`` plus the ``label_column`` column.

    Returns:
        A tuple ``(feature_cols, label)``. ``feature_cols`` maps each feature
        name to a tensor: a constant tensor for continuous features and a
        ``tf.SparseTensor`` of dense shape ``[rows, 1]`` for categorical ones.
        ``label`` is a constant tensor of the label column values.
    """
    feature_cols = {}

    # Continuous features: the column values as one constant tensor each.
    for name in continuous_columns:
        feature_cols[name] = tf.constant(df[name].values)

    # Categorical features: one sparse entry per row, at position [row, 0].
    for name in categorical_columns:
        feature_cols[name] = tf.SparseTensor(
            indices=[[row, 0] for row in range(df[name].size)],
            values=df[name].values,
            dense_shape=[df[name].size, 1])

    # The label column as a constant tensor.
    label = tf.constant(df[label_column].values)

    return feature_cols, label

In [7]:
# Show INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)
# Module-level handles: `flags` is used to define flags, `FLAGS` to read
# their values in the cells below.
flags = tf.app.flags
FLAGS = flags.FLAGS
# Register this notebook's flags (model type, directories, training options).
define_wide_deep_flags(flags)



In [8]:
# Locate the training and test data files under the configured data directory.
print('data dir  : {}'.format(FLAGS.data_dir))
train_file, test_file = (
    os.path.join(FLAGS.data_dir, file_name)
    for file_name in ('adult.data', 'adult.test')
)
print('train file  : {}'.format(train_file))
print('test file  : {}'.format(test_file))

data dir  : ./data/census_income/
train file  : ./data/census_income/adult.data
test file  : ./data/census_income/adult.test


In [12]:
# Load the census data; the column names are supplied via `columns`.
df_train = pd.read_csv(train_file, names=columns, skipinitialspace=True)
# The first line of the test file is not a data record, so it is skipped.
df_test = pd.read_csv(test_file, names=columns, skipinitialspace=True, skiprows=1)

# Derive the binary label: 1 when the income bracket is above 50K, else 0.
# The match is case-insensitive because the data files spell the class
# '>50K' (and '>50K.' in the test file); the previous case-sensitive test
# for '>50k' never matched, so every label came out as 0.
df_train[label_column] = df_train['income_bracket'].str.contains(
    '>50k', case=False, regex=False, na=False).astype(int)
df_test[label_column] = df_test['income_bracket'].str.contains(
    '>50k', case=False, regex=False, na=False).astype(int)

# Sanity check: a training set without a single positive label means the
# label derivation above did not match the data.
assert df_train[label_column].any(), (
    "no '>50K' rows found in the training data; check the income_bracket values")

model_dir = FLAGS.model_dir
print('model directory : {}'.format(model_dir))

# Build the estimator selected by --model_type, train it on the training
# frame, then evaluate it once on the test frame. input_fn(df) returns the
# whole frame as one batch, so a single evaluation step covers all of it.
model = build_estimator(model_dir, FLAGS.model_type)
model.train(input_fn=lambda: input_fn(df_train), steps=FLAGS.train_steps)
results = model.evaluate(input_fn=lambda: input_fn(df_test), steps=1)

# Print every evaluation metric.
for key in results:
    print(" {} : {}".format(key, results[key]))
    

model directory : ./model/census_model/
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model/census_model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5de4f1c2d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensor

In [None]:
# def export_model(model, model_type, export_dir):
#     '''SaveModel format으로 export 한다
#     Args:
#         model : Estimator object
#         model_type :  model type을 나타내는 string. 예: 'wide', 'deep' 혹은 'wide_deep'
#         export_dir : model을 export 할 폴더 경로
#     '''
#     wide_columns, deep_columns = build_model_columns()
    
#     if model_type == 'wide':
#         columns = wide_columns
#     elif model_type == 'deep':
#         columns = deep_columns
#     else :
#         columns = wide_columns + deep_columns
        
#     feature_spec = tf.feature_column.make_parse_example_spec(columns)
#     example_input_fn = (
#         tf.estimator.export.build_parsing_serving_input_receiver_Fn(feature_spec))
#     model.export_savedmodel(export_dir, exmaple_input_fn)

In [None]:
# def run_wide_deep(flags_obj):
#     '''
#     Wide-Deep training 과 evaluation을 실행한다.
#     인자 Arguments:
#         flags_obj : parsed 플래그들 (flags)
#     '''
    
#     # model_dir 경로에 파일이 있으면 삭제한다.
#     shutil.rmtree(flags_obj.model_dir, ignore_errors = True)
#     # tf.estimatore api.를 이용해서 학습 모델을 생성한다.
#     model = build_estimator(flags_obj.model_dir, flags_obj.model_type)
    
#     # 파일로 부터 트레이닝 데이터와 테스트 데이터를 읽어온다.
#     train_file = os.path.join(flags_obj.data_dir, 'adult.data')
#     test_file = os.path.join(flags_obj.data_dir, 'adult.test')
    
#     # training을 진행하고, flags.epochs_between_evals epoch 마다 evaluation을 진행한다.
#     def trian_input_fn():
#         return input_fn(train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)
    
#     def eval_input_fn():
#         return input_fn(test_file, 1, False, flags_obj.batch_size)
    
#     run_params = {
#         'batch_size': flags_obj.batch_size,
#         'train_epochs' : flags_obj.train_epochs,
#         'model_type' : flags_obj.model_type,
#     }

In [None]:

# tf.logging.set_verbosity(tf.logging.INFO)
# define_wide_deep_flags()
