https://www.tensorflow.org/tutorials/wide

# Load data

In [1]:
import tempfile

In [69]:
from flightdelay.fld import io as flio
airlines_df, airports_df, flights_df = flio.load_data()

In [70]:
flights_df = flights_df[:10000]

In [71]:
flights_df = flights_df.rename(index=str,columns={'DAY_OF_WEEK':'workclass',
                                     'DISTANCE':'age',
                                     'DEPARTURE_DELAY':'income_bracket',
                                     'MONTH':'education'})

In [72]:
df_train = flights_df[:7000]
df_test = flights_df[7000:]

In [73]:
len(flights_df)

10000

# Format data

In [74]:
df_train = df_train[['age','workclass','education','income_bracket']]
df_test = df_test[['age','workclass','education','income_bracket']]

In [75]:
LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = df_train["income_bracket"].apply(lambda x:  x > 15).astype(int)
df_test[LABEL_COLUMN] = df_test["income_bracket"].apply(lambda x: x > 15).astype(int)

In [76]:
df_train.head(10)

Unnamed: 0,age,workclass,education,income_bracket,label
0,1448,4,1,-11.0,0
1,2330,4,1,-8.0,0
2,2296,4,1,-2.0,0
3,2342,4,1,-5.0,0
4,1448,4,1,-1.0,0
5,1589,4,1,-5.0,0
6,1299,4,1,-6.0,0
7,2125,4,1,14.0,0
8,1464,4,1,-11.0,0
9,1747,4,1,3.0,0


In [77]:
CATEGORICAL_COLUMNS = ["workclass", "education"]
CONTINUOUS_COLUMNS = ["age"]

In [78]:
df_train.head(10)

Unnamed: 0,age,workclass,education,income_bracket,label
0,1448,4,1,-11.0,0
1,2330,4,1,-8.0,0
2,2296,4,1,-2.0,0
3,2342,4,1,-5.0,0
4,1448,4,1,-1.0,0
5,1589,4,1,-5.0,0
6,1299,4,1,-6.0,0
7,2125,4,1,14.0,0
8,1464,4,1,-11.0,0
9,1747,4,1,3.0,0


# Set up tf

In [79]:
import tensorflow as tf

def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                     for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = {**continuous_cols,**categorical_cols}#dict(continuous_cols.items() + categorical_cols.items())
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

# Set up tf variables

### Categorical vars

In [80]:
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)



In [81]:
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)



### Continuous vars

In [82]:
age = tf.contrib.layers.real_valued_column("age")

# Define log reg model

In [83]:
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[
  education, workclass, age],
  model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x15806f128>, '_master': '', '_num_ps_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000}


# Train model

In [84]:
m.fit(input_fn=train_input_fn, steps=200)



TypeError: Input 'input' of 'StringToHashBucketFast' Op has type int64 that does not match expected type of string.

In [23]:
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-03-10-21:17:49
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-03-10-21:17:50
INFO:tensorflow:Saving dict for global step 200: accuracy = 0.779805, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.779805, auc = 0.748045, global_step = 200, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.254771, loss = 0.478646, precision/positive_threshold_0.500000_mean = 0.610687, recall/positive_threshold_0.500000_mean = 0.187207
accuracy: 0.779805
accuracy/baseline_label_mean: 0.236226
accuracy/threshold_0.500000_mean: 0.779805
auc: 0.748045
global_step: 200
la

# Add regularization

In [19]:
m = tf.contrib.learn.LinearClassifier(feature_columns=[
  gender, native_country, education, occupation, workclass, marital_status, race,
  age_buckets, education_x_occupation, age_buckets_x_education_x_occupation],
  optimizer=tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=1.0,
    l2_regularization_strength=1.0),
  model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12cfc9f98>, '_master': '', '_num_ps_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000}


In [20]:
m.fit(input_fn=train_input_fn, steps=200)

Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope 

LinearClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x126b42908>, 'feature_columns': [_SparseColumnKeys(column_name='gender', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('Female', 'Male'), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='native_country', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='education', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='occupation', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='workclass', is_integerized=False, bucket_size=100, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='marit

In [21]:
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope 