In [2]:
#importing libraries

import tensorflow as tf
import tensorflow.feature_column as fc

import os
import sys

import matplotlib.pyplot as plt
import pandas

tf.enable_eager_execution()

In [2]:
train_file = "train_balanced.csv"
test_file = "my_test.csv"


train_df = pandas.read_csv(train_file, header = None)
test_df = pandas.read_csv(test_file, header = None)

train_df = train_df[:][:]

train_df.columns = ["ID","datetime","siteid","offerid","category","merchant","countrycode","browserid","devid","click","day","time"]
k = [int(i) for i in list(train_df['click'])]
train_df['click'] = k

train_df.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,day,time
0,ID3CXgm0G,19/01/17 16:32,4235268,329345,57029,2050923,a,Mozilla Firefox,Mobile,0,weekday,992
1,IDoEA0Vvc,11/01/17 09:48,2342470,481133,17714,74363610,a,Edge,Tablet,0,weekday,588
2,ID7lAr1rU,10/01/17 13:44,1831975,168978,67817,51368560,f,Edge,Tablet,0,weekday,824
3,IDFRNUMeS,19/01/17 11:27,4248305,677780,57029,2050923,b,Mozilla Firefox,Mobile,0,weekday,687
4,IDdyA6MND,14/01/17 10:13,8347162,361756,56811,23703974,b,Mozilla Firefox,Mobile,0,weekend,613


In [3]:
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
  label = df[label_key]
  ds = tf.data.Dataset.from_tensor_slices((dict(df),label))

  if shuffle:
    ds = ds.shuffle(10000)

  ds = ds.batch(batch_size).repeat(num_epochs)

  return ds


ds = easy_input_function(train_df, label_key='click', num_epochs=5, shuffle=True, batch_size=10)

#ds

In [4]:
_CSV_COLUMNS = [
    "ID","datetime","siteid","offerid","category","merchant","countrycode","browserid","devid","click","day","time"
]



_CSV_COLUMN_DEFAULTS = [[''],[''],[0],[0],[0],[0],[''],[''],[''],[0],[''],[0]]



def input_fn(data_file, num_epochs, shuffle, batch_size):

  def parse_csv(value):
    
    
    #tf.logging.info('Parsing {}'.format(data_file))
    columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
    features = dict(zip(_CSV_COLUMNS, columns))
    labels = features.pop('click')
    classes = tf.equal(labels, 1)  # binary classification
    return features, classes

  # Extract lines from input files using the Dataset API.
  dataset = tf.data.TextLineDataset(data_file)

  if shuffle:
    dataset = dataset.shuffle(buffer_size=1820)

  dataset = dataset.map(parse_csv, num_parallel_calls=5)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  return dataset

In [5]:
import functools


train_inpf = functools.partial(input_fn, train_file, num_epochs=2, shuffle=True, batch_size=64)
test_inpf = functools.partial(input_fn, test_file,num_epochs=1, shuffle=False, batch_size=64)

In [6]:
siteid = fc.numeric_column('siteid')
offerid = fc.numeric_column('offerid')
category = fc.numeric_column('category')
merchant = fc.numeric_column('merchant')
time = fc.numeric_column('time')

time_buckets = tf.feature_column.bucketized_column(
    time, boundaries=[361,721,1081,1440])



#countrycode = tf.feature_column.categorical_column_with_hash_bucket('countrycode', hash_bucket_size=1000)
countrycode = tf.feature_column.categorical_column_with_vocabulary_list('countrycode', ['b', 'c', 'a', 'd', 'e', 'f'])
#browserid = tf.feature_column.categorical_column_with_hash_bucket('browserid',hash_bucket_size=1000)
browserid = tf.feature_column.categorical_column_with_vocabulary_list('browserid', ['Safari', 'Opera', 'Mozilla Firefox', 'Internet Explorer', 'Google Chrome', 'Edge'])
#devid = tf.feature_column.categorical_column_with_hash_bucket('devid',hash_bucket_size=1000)
devid = tf.feature_column.categorical_column_with_vocabulary_list('devid', ['Desktop','Tablet','Mobile'])
week_day_or_end = fc.categorical_column_with_vocabulary_list(
    'day',
    ['weekday','weekend'])

'''
crossed_columns = [
    tf.feature_column.crossed_column(
        ['merchant', 'offerid'], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ['siteid','category'], hash_bucket_size=1000)
]
'''

crossed_columns = [
    tf.feature_column.crossed_column(
    [time_buckets,week_day_or_end],hash_bucket_size=500)
    ,tf.feature_column.crossed_column(
    [countrycode,time_buckets],hash_bucket_size=1000)
]


#my_numeric_columns = [siteid,offerid,category,merchant]
my_numeric_columns = [time_buckets]
my_categorical_columns = [countrycode,browserid,devid,week_day_or_end]


In [7]:
#classifier = tf.estimator.LinearClassifier(feature_columns=my_categorical_columns,model_dir='./log_reg_93')#,optimizer=tf.train.FtrlOptimizer(learning_rate=0.1))
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns+my_categorical_columns+crossed_columns)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

for key,value in sorted(result.items()):
  print('%s: %s' % (key, value))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp50ktnnv7', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6f58f4b400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp50ktnnv7/model.ckpt.
INFO:ten

In [20]:
list(train_df['siteid'])[:10]

[4235268,
 2342470,
 1831975,
 4248305,
 8347162,
 3032043,
 1860680,
 7620805,
 7171883,
 5269067]

In [9]:
df = pandas.read_csv('train_balanced.csv')

df.columns = ["ID","datetime","siteid","offerid","category","merchant","countrycode","browserid","devid","click",'day','time']

for i in df.columns:
    print(df.iloc[0][i],type(df.iloc[0][i]))


df.head()

IDoEA0Vvc <class 'str'>
11/01/17 09:48 <class 'str'>
2342470 <class 'numpy.int64'>
481133 <class 'numpy.int64'>
17714 <class 'numpy.int64'>
74363610 <class 'numpy.int64'>
a <class 'str'>
Edge <class 'str'>
Tablet <class 'str'>
0 <class 'numpy.int64'>
weekday <class 'str'>
588 <class 'numpy.int64'>


Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,day,time
0,IDoEA0Vvc,11/01/17 09:48,2342470,481133,17714,74363610,a,Edge,Tablet,0,weekday,588
1,ID7lAr1rU,10/01/17 13:44,1831975,168978,67817,51368560,f,Edge,Tablet,0,weekday,824
2,IDFRNUMeS,19/01/17 11:27,4248305,677780,57029,2050923,b,Mozilla Firefox,Mobile,0,weekday,687
3,IDdyA6MND,14/01/17 10:13,8347162,361756,56811,23703974,b,Mozilla Firefox,Mobile,0,weekend,613
4,IDk2SeWDF,13/01/17 14:47,3032043,81511,92704,64322853,b,Google Chrome,Mobile,0,weekday,887


In [10]:
k = list(df['click'])
k.count(0)/len(k)

0.8468715697036224