In [50]:
import tensorflow as tf
import tempfile
import urllib
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()

In [51]:
import pandas as pd
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"]
df_train = pd.read_csv("./adult_data.csv", names=CSV_COLUMNS, skipinitialspace=True)
df_test = pd.read_csv("./adult_test.csv", names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
print(df_test.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income_bracket'],
      dtype='object')


In [52]:
train_labels=(df_train["income_bracket"].apply(lambda x:">50k" in x)).astype(int)
test_labels = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

In [53]:
def input_fn(data_file, num_epochs, shuffle):
  """Input builder function."""
  df_data = pd.read_csv(
      tf.gfile.Open(data_file),
      names=CSV_COLUMNS,
      skipinitialspace=True,
      engine="python",
      skiprows=1)
  # remove NaN elements
  df_data = df_data.dropna(how="any", axis=0)
  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
  return tf.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=100,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=5)  


In [54]:

gender = tf.feature_column.categorical_column_with_vocabulary_list("gender",["female","male"])
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)
#for continuous data
age=tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

#bucketized columns
age_buckets=tf.feature_column.bucketized_column(age, boundaries=[18,25,  30, 35, 40, 45, 50, 55, 60, 65])
education_x_occupation = tf.feature_column.crossed_column(
    ["education", "occupation"], hash_bucket_size=1000)
age_buckets_x_education_x_occupation = tf.feature_column.crossed_column([age_buckets,"education","occupation"],
                                                                       hash_bucket_size=1000)

In [55]:
base_columns = [
    gender, native_country, education, occupation, workclass, relationship,
    age_buckets,
]
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]


model_dir = tempfile.mkdtemp()
m= tf.estimator.LinearClassifier(model_dir = model_dir, feature_columns= base_columns + crossed_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpolu2e66z', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f723547f8d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [57]:
m.train(
    input_fn=input_fn("./adult_data.csv" , num_epochs=None, shuffle=True),
    steps=2500)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpolu2e66z/model.ckpt.
INFO:tensorflow:loss = 69.31472, step = 1
INFO:tensorflow:global_step/sec: 83.7671
INFO:tensorflow:loss = 36.77567, step = 101 (1.195 sec)
INFO:tensorflow:global_step/sec: 155.712
INFO:tensorflow:loss = 28.358149, step = 201 (0.642 sec)
INFO:tensorflow:global_step/sec: 155.983
INFO:tensorflow:loss = 29.087587, step = 301 (0.641 sec)
INFO:tensorflow:global_step/sec: 155.31
INFO:tensorflow:loss = 40.878227, step = 401 (0.644 sec)
INFO:tensorflow:global_step/sec: 152.174
INFO:tensorflow:loss = 36.946056, step = 501 (0.657 sec)
INFO:tensorflow:global_step/sec: 158.902
INFO:tensorflow:loss = 28.819906, step = 601 (0.630 sec)
INFO:tensorflow:global_step/sec: 119.228
INFO:tensorflow:los

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f723531b1d0>

In [58]:
results = m.evaluate(
    input_fn=input_fn("./adult_test.csv", num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-07-16:49:03
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpolu2e66z/model.ckpt-2500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-07-16:49:10
INFO:tensorflow:Saving dict for global step 2500: accuracy = 0.83256555, accuracy_baseline = 0.76377374, auc = 0.87981945, auc_precision_recall = 0.6895981, average_loss = 0.35740533, global_step = 2500, label/mean = 0.23622628, loss = 35.698875, precision = 0.66746414, prediction/mean = 0.24696377, recall = 0.5803432
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2500: /tmp/tmpolu2e66z/model.ckpt-2500
model directory = /tmp/tmpolu2e66z
accuracy: 0.83256555
accuracy_baseline: 0.76377374
auc: 0.87981945
auc_precision_recall: 0.6895981
average_loss: 0.35740533
global_step: 2500
label/mean: 0

In [60]:
m = tf.estimator.LinearClassifier(
    model_dir=model_dir, feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(
      learning_rate=0.1,
      l1_regularization_strength=1.0,
      l2_regularization_strength=1.0),
    model_dir =model_dir )


SyntaxError: keyword argument repeated (<ipython-input-60-f5931b663b03>, line 7)