# Linear Model

In [1]:
%%bash
# Copy the exported CSV from the Cloud Storage bucket into the notebook's
# working directory, where the pd.read_csv call below expects to find it.
gsutil cp gs://shane-bq-export/300k_export.csv .

Copying gs://shane-bq-export/300k_export.csv...
/ [0 files][    0.0 B/ 16.4 MiB]                                                / [1 files][ 16.4 MiB/ 16.4 MiB]                                                
Operation completed over 1 objects/16.4 MiB.                                     


In [2]:
import tempfile
import pandas as pd
import tensorflow as tf

In [3]:
# Read the downloaded export into a DataFrame; the cells below work from `df`.
df = pd.read_csv(filepath_or_buffer='300k_export.csv')

In [4]:
# Show the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,CustName,Segment,Account,TxDate,TxProduct,TxType,TxValue
0,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,DB,122196
1,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,110389
2,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,131852
3,C00000001,HighVolLowVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,44
4,C00000001,HighVolLowVal,Current,2015-06-30 00:00:00 UTC,Cash,DB,35


In [5]:
# Name of the target column (added to the DataFrame in a later cell).
LABEL_COLUMN = 'Label'

# Columns passed to the model as sparse (string-valued) features.
CATEGORICAL_COLUMNS = [
    'TxProduct',
    'TxType',
    'Account',
]

# Columns passed to the model as dense constant tensors.
CONTINUOUS_COLUMNS = ['TxValue']

In [6]:
# Binary target: 1 when the Segment string contains "Retail", otherwise 0.
# A vectorised literal match replaces the per-row lambda; na=False maps a
# missing Segment to 0 instead of raising TypeError on a non-string value.
df[LABEL_COLUMN] = (
    df["Segment"].str.contains("Retail", regex=False, na=False).astype(int))

# Rows dated 2015-12-31 are held out for evaluation; every other row is used
# for training. The date is named once and the mask computed once so the two
# subsets are always exact complements of each other.
HOLDOUT_TX_DATE = '2015-12-31 00:00:00 UTC'
is_holdout = df["TxDate"] == HOLDOUT_TX_DATE
training_data = df[~is_holdout]
test_data = df[is_holdout]

In [7]:
# Show the first five training rows, now including the Label column.
training_data.head(5)

Unnamed: 0,CustName,Segment,Account,TxDate,TxProduct,TxType,TxValue,Label
0,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,DB,122196,0
1,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,110389,0
2,C00000000,LowVolHighVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,131852,0
3,C00000001,HighVolLowVal,Current,2015-06-30 00:00:00 UTC,Cash,CR,44,0
4,C00000001,HighVolLowVal,Current,2015-06-30 00:00:00 UTC,Cash,DB,35,0


In [8]:
# Fresh temporary directory, later handed to the classifier as its model_dir.
# A new directory is created every time this cell runs.
model_dir = tempfile.mkdtemp()
print("model directory = %s" % (model_dir,))

model directory = /tmp/tmpDcLsbl


In [9]:
# TxType is declared with an explicit key list (CR / DB).
TxType = tf.contrib.layers.sparse_column_with_keys("TxType", ["CR", "DB"])

# TxProduct and Account are each hashed into 1024 buckets.
TxProduct = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="TxProduct", hash_bucket_size=1024)
Account = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="Account", hash_bucket_size=1024)

# TxValue is declared as a real-valued (numeric) column.
TxValue = tf.contrib.layers.real_valued_column(column_name="TxValue")

# Feature columns handed to the linear classifier.
wide_columns = [TxType, TxProduct, Account, TxValue]



In [10]:
def input_fn(df):
  """Build the (features, label) tensors for one DataFrame.

  Args:
    df: DataFrame containing every column named in CONTINUOUS_COLUMNS,
      CATEGORICAL_COLUMNS and LABEL_COLUMN.

  Returns:
    A tuple (feature_cols, label). feature_cols maps each column name to a
    tf.constant (continuous columns) or a tf.SparseTensor (categorical
    columns); label is a tf.constant built from the LABEL_COLUMN values.
  """
  feature_cols = {}

  # Continuous columns: one dense constant tensor per column.
  for name in CONTINUOUS_COLUMNS:
    feature_cols[name] = tf.constant(df[name].values)

  # Categorical columns: an [n_rows, 1] sparse tensor with exactly one value
  # per row, stored at index [row, 0].
  for name in CATEGORICAL_COLUMNS:
    n_rows = df[name].size
    feature_cols[name] = tf.SparseTensor(
        indices=[[row, 0] for row in range(n_rows)],
        values=df[name].values,
        shape=[n_rows, 1])

  label = tf.constant(df[LABEL_COLUMN].values)

  # Echo the tensors that were built each time the function is called.
  print(feature_cols)
  print(label)

  return feature_cols, label

In [11]:
# Linear classifier over the wide feature columns, using model_dir as its
# model directory.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=wide_columns, model_dir=model_dir)

# Fit for 200 steps on the training split.
m.fit(steps=200, input_fn=lambda: input_fn(training_data))

# Run one evaluation step on the held-out split.
results = m.evaluate(steps=1, input_fn=lambda: input_fn(test_data))

# Print every returned metric, ordered by key.
for key in sorted(results.keys()):
  print("%s: %s" % (key, results[key]))

Explicitly set `enable_centered_bias` to 'True' if you want to keep existing behaviour.


{'TxValue': <tf.Tensor 'Const:0' shape=(253623,) dtype=int64>, 'TxProduct': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5bd286cd10>, 'TxType': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5bd2880790>, 'Account': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5b99107cd0>}
Tensor("Const_1:0", shape=(253623,), dtype=int64)




{'TxValue': <tf.Tensor 'Const:0' shape=(22717,) dtype=int64>, 'TxProduct': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5bd2880790>, 'TxType': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5b91707750>, 'Account': <tensorflow.python.framework.ops.SparseTensor object at 0x7f5b9178d7d0>}
Tensor("Const_1:0", shape=(22717,), dtype=int64)
accuracy: 0.951094
accuracy/baseline_target_mean: 0.293701
accuracy/threshold_0.500000_mean: 0.951094
auc: 0.942664
global_step: 200
labels/actual_target_mean: 0.293701
labels/prediction_mean: 0.280474
loss: 2.1154
precision/positive_threshold_0.500000_mean: 0.896591
recall/positive_threshold_0.500000_mean: 0.942146
