# Outbrain advertisement click prediction
by Ryan Goy, https://www.linkedin.com/in/goyryan

useful links:
 - [10 minutes to pandas](http://pandas.pydata.org/pandas-docs/version/0.15.2/10min.html)
 - [EDA graphs](https://www.kaggle.com/anokas/outbrain-click-prediction/outbrain-eda)
 - [multilayer perceptron example](https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/multilayer_perceptron.py)

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
import csv

%matplotlib inline

### Import Datasets

We'll start with importing the minimum amount of data in order to get a MVP for now and add more feature later when we're on the scoreboard. 

 - click_train and clicks_test both have a display_id (the unique context it was displayed in) and an ad_id (the id of an ad it was)
 - events has information correlating to each display_id
 - promoted_content has information correlating to each ad_id
 - documents_meta has information elaborating on the document the ad links (nested in promoted_content)
 - page_views is the huge dataset with information about the type of people who visit the linked ads
 - documents_topics, documents_entities, and documents_categories all provide information about the content in the ad
 
Thus, the possible characteristics we can train on are the following:

 - events: document_id, timestamp, platform, geo_location
 - promoted_content: document_id, campaign_id, advertiser_id
 - etc (will deal with other sets later)
 
For now, I'm only using click_train, clicks_test, events, and promoted_content

In [2]:
clicks_train = pd.read_csv('../data/clicks_train.csv')
clicks_test = pd.read_csv('../data/clicks_test.csv')

In [3]:
print clicks_train.shape
print clicks_test.shape

(87141731, 3)
(32225162, 2)


In [4]:
events = pd.read_csv('../data/events.csv')
promoted_content = pd.read_csv('../data/promoted_content.csv')
documents_metadata = pd.read_csv('../data/documents_meta.csv')
documents_topics = pd.read_csv('../data/documents_topics.csv')
#documents_entities = pd.read_csv('../data/documents_entities.csv')
documents_categories = pd.read_csv('../data/documents_categories.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### View/investigate datasets

also see [this](https://www.kaggle.com/anokas/outbrain-click-prediction/outbrain-eda) for graphs

In [5]:
clicks_train.head()

Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1
3,1,156824,0
4,1,279295,0


In [6]:
print clicks_train.shape

(87141731, 3)


In [7]:
clicks_test.head()

Unnamed: 0,display_id,ad_id
0,16874594,66758
1,16874594,150083
2,16874594,162754
3,16874594,170392
4,16874594,172888


In [8]:
print clicks_test.shape

(32225162, 2)


In [9]:
events.head()

Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
0,1,cb8c55702adb93,379743,61,3,US>SC>519
1,2,79a85fa78311b9,1794259,81,2,US>CA>807
2,3,822932ce3d8757,1179111,182,2,US>MI>505
3,4,85281d0a49f7ac,1777797,234,2,US>WV>564
4,5,8d0daef4bf5b56,252458,338,2,SG>00


In [10]:
promoted_content.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7
3,4,471471,2,7
4,5,471472,2,7


In [11]:
print promoted_content.shape

(559583, 4)


In [12]:
documents_metadata.head()

Unnamed: 0,document_id,source_id,publisher_id,publish_time
0,1595802,1.0,603.0,2016-06-05 00:00:00
1,1524246,1.0,603.0,2016-05-26 11:00:00
2,1617787,1.0,603.0,2016-05-27 00:00:00
3,1615583,1.0,603.0,2016-06-07 00:00:00
4,1615460,1.0,603.0,2016-06-20 00:00:00


In [13]:
#documents_entities.head()

In [14]:
documents_categories.head()

Unnamed: 0,document_id,category_id,confidence_level
0,1595802,1611,0.92
1,1595802,1610,0.07
2,1524246,1807,0.92
3,1524246,1608,0.07
4,1617787,1807,0.92


In [15]:
documents_topics.head()

Unnamed: 0,document_id,topic_id,confidence_level
0,1595802,140,0.073113
1,1595802,16,0.059416
2,1595802,143,0.045421
3,1595802,170,0.038867
4,1524246,113,0.19645


### Format and load training data
each click has document_from, timestamp, platform, geo_location, document_to, campaign_id, advertiser_id (so far)

In [16]:
# ad_id, document_from, timestamp, platform, geo_location, document_to, campaign_id, advertiser_id

def load_raw_joined_train_df():
    
    # join clicks_train, events, and promoted_content
    ads_info = promoted_content.merge(documents_metadata, on='document_id', how='inner')
    #ads_info = ads_info.merge(documents_topics, on='document_id', how='inner')
    #ads_info = ads_info.merge(documents_categories, on='document_id', how='inner')
    #ads_info = ads_info.drop('document_id', 1)
    
    combined = clicks_train.merge(events, on='display_id', how='inner')
    combined = combined.drop('document_id', 1)
    #combined = combined.drop('display_id', 1)
    combined = combined.drop('uuid', 1)
    
    combined = combined.merge(ads_info, on='ad_id', how='inner', suffixes=('_from', '_to'))
    #combined = combined.drop('ad_id', 1)
    
    # load labels
    labels = combined['clicked'].as_matrix()
    one_hot = np.zeros((len(labels), 2))
    one_hot[np.arange(len(labels)), labels] = 1
    
    return combined, one_hot.astype("float")
    

def load_next_train_data(data, labels, batch_size):
    index = random.randint(1, len(clicks_train.index)-batch_size)
    return data[index:index+batch_size], labels[index:index+batch_size]

def load_raw_joined_test_df():
    # join clicks_train, events, and promoted_content
    ads_info = promoted_content.merge(documents_metadata, on='document_id', how='inner')
    #ads_info = ads_info.merge(documents_topics, on='document_id', how='inner')
    #ads_info = ads_info.merge(documents_categories, on='document_id', how='inner')
    #ads_info = ads_info.drop('document_id', 1)
    
    combined = clicks_test.merge(events, on='display_id', how='inner')
    combined = combined.drop('document_id', 1)
    #combined = combined.drop('display_id', 1)
    combined = combined.drop('uuid', 1)
    
    combined = combined.merge(ads_info, on='ad_id', how='inner', suffixes=('_from', '_to'))
    #combined = combined.drop('ad_id', 1)
    
    return combined, combined['display_id'], combined['ad_id']
    


In [17]:
start = time.time()
data, labels = load_raw_joined_train_df()
end = time.time()

In [18]:
print 'seconds to load train data: ' + str(end-start)

seconds to load train data: 24.7753078938


In [19]:
data.head()

Unnamed: 0,display_id,ad_id,clicked,timestamp,platform,geo_location,document_id,campaign_id,advertiser_id,source_id,publisher_id,publish_time
0,1,42337,0,61,3,US>SC>519,938164,5969,1499,5802.0,,2016-07-27 00:00:00
1,2236,42337,0,156650,3,US>WI>669,938164,5969,1499,5802.0,,2016-07-27 00:00:00
2,3497,42337,1,239231,1,US>CA>807,938164,5969,1499,5802.0,,2016-07-27 00:00:00
3,6361,42337,0,435860,1,US>CA>828,938164,5969,1499,5802.0,,2016-07-27 00:00:00
4,7709,42337,0,530186,1,US>NV>839,938164,5969,1499,5802.0,,2016-07-27 00:00:00


In [20]:
data['platform'] = pd.to_numeric(data['platform'], errors='coerce')

Now, since everything is categorical, we need to do some feature engineering

In [42]:
def frequency_featurize(df, features, freq_tables=None):
    training = False
    if freq_tables == None:
        positive_examples = df[df['clicked'] == 1]
        training = True
        freq_tables = []
        
    for i in range(len(features)):
        if training:
            tcount_table = df[features[i]].value_counts()
            pcount_table = positive_examples[features[i]].value_counts()
            freq_table = pcount_table.divide(tcount_table)
            #freq_table = tcount_table
            freq_tables.append(freq_table)
        else:
            freq_table = freq_tables[i]
        new_vec = freq_table[df[features[i]]]
        df[features[i] + '_freq'] = new_vec.values
    return freq_tables

def one_hot_featurize(df, features):
    return pd.get_dummies(df, columns=features)

def norm_df(df):
    new_df = (df - df.mean()) / (df.max() - df.min())
    new_df = new_df.fillna(0.0)
    return new_df

def process(df, freq_tables=None):
    freq_tables = frequency_featurize(df, ['platform', 'ad_id', 'advertiser_id', 'source_id', 'campaign_id'], freq_tables)
    simple_data = df[['timestamp', 'platform_freq', 'ad_id_freq', 'advertiser_id_freq', 'source_id_freq', 'campaign_id_freq', 'platform']]
    #normed = norm_df(simple_data)
    #print normed.head()
    ret = one_hot_featurize(simple_data, ['platform'])
    return ret.fillna(0.0), freq_tables

In [29]:
train_normed, freq_tables = process(data)

In [30]:
data.head()

Unnamed: 0,display_id,ad_id,clicked,timestamp,platform,geo_location,document_id,campaign_id,advertiser_id,source_id,publisher_id,publish_time,platform_freq,ad_id_freq,advertiser_id_freq,source_id_freq,campaign_id_freq
0,1,42337,0,61,3.0,US>SC>519,938164,5969,1499,5802.0,,2016-07-27 00:00:00,12785378.0,11777,24304,24304.0,11777
1,2236,42337,0,156650,3.0,US>WI>669,938164,5969,1499,5802.0,,2016-07-27 00:00:00,12785378.0,11777,24304,24304.0,11777
2,3497,42337,1,239231,1.0,US>CA>807,938164,5969,1499,5802.0,,2016-07-27 00:00:00,37519782.0,11777,24304,24304.0,11777
3,6361,42337,0,435860,1.0,US>CA>828,938164,5969,1499,5802.0,,2016-07-27 00:00:00,37519782.0,11777,24304,24304.0,11777
4,7709,42337,0,530186,1.0,US>NV>839,938164,5969,1499,5802.0,,2016-07-27 00:00:00,37519782.0,11777,24304,24304.0,11777


In [31]:
train_normed.head()

Unnamed: 0,timestamp,platform_freq,ad_id_freq,advertiser_id_freq,source_id_freq,campaign_id_freq,platform_1.0,platform_2.0,platform_3.0
0,61,12785378.0,11777,24304,24304.0,11777,0,0,1
1,156650,12785378.0,11777,24304,24304.0,11777,0,0,1
2,239231,37519782.0,11777,24304,24304.0,11777,1,0,0
3,435860,37519782.0,11777,24304,24304.0,11777,1,0,0
4,530186,37519782.0,11777,24304,24304.0,11777,1,0,0


# Processing with a multilayer perceptron

### Set up the neural network

In [None]:
# training params
learning_rate = 1e-4
training_epochs = 5
batch_size = 10000
epoch_size = train_normed.shape[0] / batch_size
display_step = 1

In [42]:
# network params
n_hidden_1 = 10
n_hidden_2 = 10
n_input = 6
n_classes = 2

In [43]:
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

In [44]:
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

In [45]:

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [46]:
# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# add ops to save and restore all variables
saver = tf.train.Saver()

### Start training

In [47]:
# Launch the graph
sess = tf.Session()
sess.run(init)

# Training cycle
for epoch in range(training_epochs):
    avg_cost = 0.
    total_batch = epoch_size
    # Loop over all batches
    for i in range(total_batch):
        batch_x, batch_y = load_next_train_data(train_normed, labels, batch_size)
        # Run optimization op (backprop) and cost op (to get loss value)
        _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                      y: batch_y})
        # Compute average loss
        avg_cost += c / total_batch
    # Display logs per epoch step
    if epoch % display_step == 0:
        print "Epoch:" + '%04d' % (epoch+1) + " cost=" + \
            "{:.9f}".format(avg_cost)
print("Optimization Finished!")

#     start = time.time()
#     last_index = 0
#     splits = [5000000, 10000000, 15000000, 20000000, 25000000, 30000000, 32225162]
#     classification = []
#     for i in splits:
#         increment = outbrain_test[last_index:i]
#         with sess.as_default():
#             classification += tf.nn.softmax(sess.run(pred, feed_dict={x: increment})).eval()[:,1].tolist()
#         last_index = i
#     end = time.time()
#     print 'Seconds to classify test data: ' + str(end-start)

Epoch:0001cost=0.751343075
Epoch:0002cost=0.492395859
Epoch:0003cost=0.494877853
Epoch:0004cost=0.489054633
Epoch:0005cost=0.490858332
Optimization Finished!


In [31]:
with sess.as_default():
    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    testi, testl = load_next_train_data(train_normed, labels, batch_size)
    print("Accuracy:", accuracy.eval({x: testi, y: testl}))
    print tf.nn.softmax(sess.run(pred, feed_dict={x: train_normed[500000:500005]})).eval()

NameError: name 'sess' is not defined

In [None]:
start = time.time()
problemx, problemy = train()
end = time.time()
print 'seconds to train: ' + str(end-start)

In [None]:
save_path = saver.save(sess, "/home/ryan/cs/kaggle/outbrain/src/model.ckpt")
print "Model saved in file"

In [None]:
# if 'data' in locals():
#     del data
# if 'labels' in locals():
#     del labels

In [48]:
print 'Number of entries in test_data: ' + str(clicks_test.shape[0])
start = time.time()
outbrain_test, disp_ids, ad_ids = load_raw_joined_test_df()
test_processed = process(outbrain_test, freq_tables)[0]
end = time.time()
print 'Seconds to load test data: ' + '%04d' % (end-start)

Number of entries in test_data: 32225162
Seconds to load test data: 0017


In [49]:
test_processed.head()

Unnamed: 0,timestamp,platform_freq,ad_id_freq,advertiser_id_freq,source_id_freq,campaign_id_freq,platform_1,platform_2,platform_3
0,1026,12785378,5642.0,682481.0,563928.0,6063.0,0,0,1
1,59173,12785378,5642.0,682481.0,563928.0,6063.0,0,0,1
2,501082,37519782,5642.0,682481.0,563928.0,6063.0,1,0,0
3,733229,37519782,5642.0,682481.0,563928.0,6063.0,1,0,0
4,897830,12785378,5642.0,682481.0,563928.0,6063.0,0,0,1


In [None]:
classification =

In [52]:
compressed = dict.fromkeys(disp_ids)

In [53]:
i = 0
start = time.time()
while i < len(classification):
    start_id = i
    curr_disp_id = disp_ids[i]
    while i < len(classification) and curr_disp_id == disp_ids[i]:
        i += 1
    indices = np.argsort(classification[start_id : i])[::-1]
    sorted_ads = np.array(ad_ids[start_id: i])[indices]
    ads_str = np.char.mod('%i', sorted_ads)
    ads_str = " ".join(ads_str)
    compressed[disp_ids[start_id]] = ads_str
print time.time() - start

IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:
start = time.time()
with open('dict.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['display_id', 'ad_id'])
    for key, value in compressed.items():
        writer.writerow([key, value])
print "Seconds to write to .csv: " + str(time.time() - start)